git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/device')
-rw-r--r--  intern/cycles/device/CMakeLists.txt | 110
-rw-r--r--  intern/cycles/device/cpu/device.cpp | 64
-rw-r--r--  intern/cycles/device/cpu/device.h | 35
-rw-r--r--  intern/cycles/device/cpu/device_impl.cpp | 481
-rw-r--r--  intern/cycles/device/cpu/device_impl.h | 99
-rw-r--r--  intern/cycles/device/cpu/kernel.cpp | 61
-rw-r--r--  intern/cycles/device/cpu/kernel.h | 111
-rw-r--r--  intern/cycles/device/cpu/kernel_function.h | 124
-rw-r--r--  intern/cycles/device/cpu/kernel_thread_globals.cpp | 85
-rw-r--r--  intern/cycles/device/cpu/kernel_thread_globals.h | 57
-rw-r--r--  intern/cycles/device/cuda/device.cpp (renamed from intern/cycles/device/device_cuda.cpp) | 51
-rw-r--r--  intern/cycles/device/cuda/device.h | 37
-rw-r--r--  intern/cycles/device/cuda/device_cuda.h | 270
-rw-r--r--  intern/cycles/device/cuda/device_cuda_impl.cpp | 2714
-rw-r--r--  intern/cycles/device/cuda/device_impl.cpp | 1370
-rw-r--r--  intern/cycles/device/cuda/device_impl.h | 155
-rw-r--r--  intern/cycles/device/cuda/graphics_interop.cpp | 102
-rw-r--r--  intern/cycles/device/cuda/graphics_interop.h | 66
-rw-r--r--  intern/cycles/device/cuda/kernel.cpp | 69
-rw-r--r--  intern/cycles/device/cuda/kernel.h | 56
-rw-r--r--  intern/cycles/device/cuda/queue.cpp | 220
-rw-r--r--  intern/cycles/device/cuda/queue.h | 67
-rw-r--r--  intern/cycles/device/cuda/util.cpp | 61
-rw-r--r--  intern/cycles/device/cuda/util.h | 65
-rw-r--r--  intern/cycles/device/device.cpp | 476
-rw-r--r--  intern/cycles/device/device.h | 366
-rw-r--r--  intern/cycles/device/device_cpu.cpp | 1680
-rw-r--r--  intern/cycles/device/device_denoise.cpp | 88
-rw-r--r--  intern/cycles/device/device_denoise.h | 110
-rw-r--r--  intern/cycles/device/device_denoising.cpp | 353
-rw-r--r--  intern/cycles/device/device_denoising.h | 197
-rw-r--r--  intern/cycles/device/device_graphics_interop.cpp | 21
-rw-r--r--  intern/cycles/device/device_graphics_interop.h | 55
-rw-r--r--  intern/cycles/device/device_intern.h | 58
-rw-r--r--  intern/cycles/device/device_kernel.cpp | 157
-rw-r--r--  intern/cycles/device/device_kernel.h | 33
-rw-r--r--  intern/cycles/device/device_memory.cpp | 7
-rw-r--r--  intern/cycles/device/device_memory.h | 136
-rw-r--r--  intern/cycles/device/device_multi.cpp | 826
-rw-r--r--  intern/cycles/device/device_network.cpp | 812
-rw-r--r--  intern/cycles/device/device_network.h | 490
-rw-r--r--  intern/cycles/device/device_opencl.cpp | 245
-rw-r--r--  intern/cycles/device/device_optix.cpp | 1936
-rw-r--r--  intern/cycles/device/device_queue.cpp | 87
-rw-r--r--  intern/cycles/device/device_queue.h | 113
-rw-r--r--  intern/cycles/device/device_split_kernel.cpp | 389
-rw-r--r--  intern/cycles/device/device_split_kernel.h | 145
-rw-r--r--  intern/cycles/device/device_task.cpp | 182
-rw-r--r--  intern/cycles/device/device_task.h | 188
-rw-r--r--  intern/cycles/device/dummy/device.cpp (renamed from intern/cycles/device/device_dummy.cpp) | 24
-rw-r--r--  intern/cycles/device/dummy/device.h | 31
-rw-r--r--  intern/cycles/device/multi/device.cpp | 423
-rw-r--r--  intern/cycles/device/multi/device.h | 31
-rw-r--r--  intern/cycles/device/opencl/device_opencl.h | 658
-rw-r--r--  intern/cycles/device/opencl/device_opencl_impl.cpp | 2113
-rw-r--r--  intern/cycles/device/opencl/memory_manager.cpp | 264
-rw-r--r--  intern/cycles/device/opencl/memory_manager.h | 105
-rw-r--r--  intern/cycles/device/opencl/opencl_util.cpp | 1326
-rw-r--r--  intern/cycles/device/optix/device.cpp | 105
-rw-r--r--  intern/cycles/device/optix/device.h | 35
-rw-r--r--  intern/cycles/device/optix/device_impl.cpp | 1573
-rw-r--r--  intern/cycles/device/optix/device_impl.h | 186
-rw-r--r--  intern/cycles/device/optix/queue.cpp | 144
-rw-r--r--  intern/cycles/device/optix/queue.h | 39
-rw-r--r--  intern/cycles/device/optix/util.h | 45
65 files changed, 6970 insertions(+), 15812 deletions(-)
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 928249931a3..d18f4360aef 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -36,49 +36,70 @@ endif()
set(SRC
device.cpp
- device_cpu.cpp
- device_cuda.cpp
- device_denoising.cpp
- device_dummy.cpp
+ device_denoise.cpp
+ device_graphics_interop.cpp
+ device_kernel.cpp
device_memory.cpp
- device_multi.cpp
- device_opencl.cpp
- device_optix.cpp
- device_split_kernel.cpp
- device_task.cpp
+ device_queue.cpp
+)
+
+set(SRC_CPU
+ cpu/device.cpp
+ cpu/device.h
+ cpu/device_impl.cpp
+ cpu/device_impl.h
+ cpu/kernel.cpp
+ cpu/kernel.h
+ cpu/kernel_function.h
+ cpu/kernel_thread_globals.cpp
+ cpu/kernel_thread_globals.h
)
set(SRC_CUDA
- cuda/device_cuda.h
- cuda/device_cuda_impl.cpp
+ cuda/device.cpp
+ cuda/device.h
+ cuda/device_impl.cpp
+ cuda/device_impl.h
+ cuda/graphics_interop.cpp
+ cuda/graphics_interop.h
+ cuda/kernel.cpp
+ cuda/kernel.h
+ cuda/queue.cpp
+ cuda/queue.h
+ cuda/util.cpp
+ cuda/util.h
)
-set(SRC_OPENCL
- opencl/device_opencl.h
- opencl/device_opencl_impl.cpp
- opencl/memory_manager.h
- opencl/memory_manager.cpp
- opencl/opencl_util.cpp
+set(SRC_DUMMY
+ dummy/device.cpp
+ dummy/device.h
)
-if(WITH_CYCLES_NETWORK)
- list(APPEND SRC
- device_network.cpp
- )
-endif()
+set(SRC_MULTI
+ multi/device.cpp
+ multi/device.h
+)
+
+set(SRC_OPTIX
+ optix/device.cpp
+ optix/device.h
+ optix/device_impl.cpp
+ optix/device_impl.h
+ optix/queue.cpp
+ optix/queue.h
+ optix/util.h
+)
set(SRC_HEADERS
device.h
- device_denoising.h
+ device_denoise.h
+ device_graphics_interop.h
device_memory.h
- device_intern.h
- device_network.h
- device_split_kernel.h
- device_task.h
+ device_kernel.h
+ device_queue.h
)
set(LIB
- cycles_render
cycles_kernel
cycles_util
${CYCLES_GL_LIBRARIES}
@@ -95,15 +116,7 @@ else()
endif()
add_definitions(${GL_DEFINITIONS})
-if(WITH_CYCLES_NETWORK)
- add_definitions(-DWITH_NETWORK)
-endif()
-if(WITH_CYCLES_DEVICE_OPENCL)
- list(APPEND LIB
- extern_clew
- )
- add_definitions(-DWITH_OPENCL)
-endif()
+
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
@@ -115,18 +128,27 @@ if(WITH_CYCLES_DEVICE_MULTI)
endif()
if(WITH_OPENIMAGEDENOISE)
- add_definitions(-DWITH_OPENIMAGEDENOISE)
- add_definitions(-DOIDN_STATIC_LIB)
- list(APPEND INC_SYS
- ${OPENIMAGEDENOISE_INCLUDE_DIRS}
- )
list(APPEND LIB
${OPENIMAGEDENOISE_LIBRARIES}
- ${TBB_LIBRARIES}
)
endif()
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
-cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CUDA} ${SRC_OPENCL} ${SRC_HEADERS})
+cycles_add_library(cycles_device "${LIB}"
+ ${SRC}
+ ${SRC_CPU}
+ ${SRC_CUDA}
+ ${SRC_DUMMY}
+ ${SRC_MULTI}
+ ${SRC_OPTIX}
+ ${SRC_HEADERS}
+)
+
+source_group("cpu" FILES ${SRC_CPU})
+source_group("cuda" FILES ${SRC_CUDA})
+source_group("dummy" FILES ${SRC_DUMMY})
+source_group("multi" FILES ${SRC_MULTI})
+source_group("optix" FILES ${SRC_OPTIX})
+source_group("common" FILES ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp
new file mode 100644
index 00000000000..68ca8e8bb22
--- /dev/null
+++ b/intern/cycles/device/cpu/device.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/device.h"
+#include "device/cpu/device_impl.h"
+
+/* Used for `info.denoisers`. */
+/* TODO(sergey): The denoisers should probably be moved completely out of the device into their
+ * own class. But until then, keep the API consistent with how it used to work before. */
+#include "util/util_openimagedenoise.h"
+
+CCL_NAMESPACE_BEGIN
+
+Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+ return new CPUDevice(info, stats, profiler);
+}
+
+void device_cpu_info(vector<DeviceInfo> &devices)
+{
+ DeviceInfo info;
+
+ info.type = DEVICE_CPU;
+ info.description = system_cpu_brand_string();
+ info.id = "CPU";
+ info.num = 0;
+ info.has_osl = true;
+ info.has_half_images = true;
+ info.has_nanovdb = true;
+ info.has_profiling = true;
+ if (openimagedenoise_supported()) {
+ info.denoisers |= DENOISER_OPENIMAGEDENOISE;
+ }
+
+ devices.insert(devices.begin(), info);
+}
+
+string device_cpu_capabilities()
+{
+ string capabilities = "";
+ capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
+ capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
+ capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
+ capabilities += system_cpu_support_avx() ? "AVX " : "";
+ capabilities += system_cpu_support_avx2() ? "AVX2" : "";
+  if (!capabilities.empty() && capabilities.back() == ' ')
+    capabilities.resize(capabilities.size() - 1);
+ return capabilities;
+}
+
+CCL_NAMESPACE_END
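
As a rough usage sketch (not part of the patch): callers can enumerate CPU devices
through this factory API and construct one. `create_first_cpu_device` is a
hypothetical helper; `Stats` and `Profiler` are the existing Cycles classes and
must outlive the returned device.

#include "device/cpu/device.h"

CCL_NAMESPACE_BEGIN

/* Hypothetical helper, for illustration only: enumerate the CPU devices and
 * create the first one. Returns nullptr if enumeration yields nothing. */
static Device *create_first_cpu_device(Stats &stats, Profiler &profiler)
{
  vector<DeviceInfo> devices;
  device_cpu_info(devices);
  if (devices.empty()) {
    return nullptr;
  }
  return device_cpu_create(devices[0], stats, profiler);
}

CCL_NAMESPACE_END
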
diff --git a/intern/cycles/device/cpu/device.h b/intern/cycles/device/cpu/device.h
new file mode 100644
index 00000000000..9cb2e80068d
--- /dev/null
+++ b/intern/cycles/device/cpu/device.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_cpu_info(vector<DeviceInfo> &devices);
+
+string device_cpu_capabilities();
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
new file mode 100644
index 00000000000..3b0db6bdd0e
--- /dev/null
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -0,0 +1,481 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/device_impl.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+/* So ImathMath is included before our kernel_cpu_compat. */
+#ifdef WITH_OSL
+/* So no context pollution happens from indirectly included windows.h */
+# include "util/util_windows.h"
+# include <OSL/oslexec.h>
+#endif
+
+#ifdef WITH_EMBREE
+# include <embree3/rtcore.h>
+#endif
+
+#include "device/cpu/kernel.h"
+#include "device/cpu/kernel_thread_globals.h"
+
+#include "device/device.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+#include "kernel/device/cpu/kernel.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+#include "bvh/bvh_embree.h"
+
+#include "render/buffers.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_opengl.h"
+#include "util/util_openimagedenoise.h"
+#include "util/util_optimization.h"
+#include "util/util_progress.h"
+#include "util/util_system.h"
+#include "util/util_task.h"
+#include "util/util_thread.h"
+
+CCL_NAMESPACE_BEGIN
+
+CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : Device(info_, stats_, profiler_), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+  /* Pick any kernel; all of them are supposed to have the same level of microarchitecture
+   * optimization. */
+ VLOG(1) << "Will be using " << kernels.integrator_init_from_camera.get_uarch_name()
+ << " kernels.";
+
+ if (info.cpu_threads == 0) {
+ info.cpu_threads = TaskScheduler::num_threads();
+ }
+
+#ifdef WITH_OSL
+ kernel_globals.osl = &osl_globals;
+#endif
+#ifdef WITH_EMBREE
+ embree_device = rtcNewDevice("verbose=0");
+#endif
+ need_texture_info = false;
+}
+
+CPUDevice::~CPUDevice()
+{
+#ifdef WITH_EMBREE
+ rtcReleaseDevice(embree_device);
+#endif
+
+ texture_info.free();
+}
+
+bool CPUDevice::show_samples() const
+{
+ return (info.cpu_threads == 1);
+}
+
+BVHLayoutMask CPUDevice::get_bvh_layout_mask() const
+{
+ BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
+#ifdef WITH_EMBREE
+ bvh_layout_mask |= BVH_LAYOUT_EMBREE;
+#endif /* WITH_EMBREE */
+ return bvh_layout_mask;
+}
+
+bool CPUDevice::load_texture_info()
+{
+ if (!need_texture_info) {
+ return false;
+ }
+
+ texture_info.copy_to_device();
+ need_texture_info = false;
+
+ return true;
+}
+
+void CPUDevice::mem_alloc(device_memory &mem)
+{
+ if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ assert(!"mem_alloc not supported for global memory.");
+ }
+ else {
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+
+ if (mem.type == MEM_DEVICE_ONLY) {
+ assert(!mem.host_pointer);
+ size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
+ void *data = util_aligned_malloc(mem.memory_size(), alignment);
+ mem.device_pointer = (device_ptr)data;
+ }
+ else {
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ }
+
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+ }
+}
+
+void CPUDevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+ /* copy is no-op */
+ }
+}
+
+void CPUDevice::mem_copy_from(
+ device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/)
+{
+ /* no-op */
+}
+
+void CPUDevice::mem_zero(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+ if (mem.device_pointer) {
+ memset((void *)mem.device_pointer, 0, mem.memory_size());
+ }
+}
+
+void CPUDevice::mem_free(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ }
+ else if (mem.device_pointer) {
+ if (mem.type == MEM_DEVICE_ONLY) {
+ util_aligned_free((void *)mem.device_pointer);
+ }
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+}
+
+device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+{
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+}
+
+void CPUDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+#ifdef WITH_EMBREE
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+    /* Update the scene handle (it is different for each device in a multi-device render). */
+ KernelData *const data = (KernelData *)host;
+ data->bvh.scene = embree_scene;
+ }
+#endif
+ kernel_const_copy(&kernel_globals, name, host, size);
+}
+
+void CPUDevice::global_alloc(device_memory &mem)
+{
+ VLOG(1) << "Global memory allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
+
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+}
+
+void CPUDevice::global_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+}
+
+void CPUDevice::tex_alloc(device_texture &mem)
+{
+ VLOG(1) << "Texture allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+    /* Allocate some slots in advance, to reduce the number of re-allocations. */
+ texture_info.resize(slot + 128);
+ }
+
+ texture_info[slot] = mem.info;
+ texture_info[slot].data = (uint64_t)mem.host_pointer;
+ need_texture_info = true;
+}
+
+void CPUDevice::tex_free(device_texture &mem)
+{
+ if (mem.device_pointer) {
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ need_texture_info = true;
+ }
+}
+
+void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+#ifdef WITH_EMBREE
+ if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE ||
+ bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) {
+ BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
+ if (refit) {
+ bvh_embree->refit(progress);
+ }
+ else {
+ bvh_embree->build(progress, &stats, embree_device);
+ }
+
+ if (bvh->params.top_level) {
+ embree_scene = bvh_embree->scene;
+ }
+ }
+ else
+#endif
+ Device::build_bvh(bvh, progress, refit);
+}
+
+#if 0
+void CPUDevice::render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
+{
+ const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
+
+ scoped_timer timer(&tile.buffers->render_time);
+
+ Coverage coverage(kg, tile);
+ if (use_coverage) {
+ coverage.init_path_trace();
+ }
+
+ float *render_buffer = (float *)tile.buffer;
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
+
+ /* Needed for Embree. */
+ SIMD_SET_FLUSH_TO_ZERO;
+
+ for (int sample = start_sample; sample < end_sample; sample++) {
+ if (task.get_cancel() || TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+
+ if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
+ tile.stealing_state = RenderTile::WAS_STOLEN;
+ break;
+ }
+
+ if (tile.task == RenderTile::PATH_TRACE) {
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ if (use_coverage) {
+ coverage.init_pixel(x, y);
+ }
+ kernels.path_trace(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
+ }
+ }
+ }
+ else {
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ kernels.bake(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
+ }
+ }
+ }
+ tile.sample = sample + 1;
+
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
+ const bool stop = adaptive_sampling_filter(kg, tile, sample);
+ if (stop) {
+ const int num_progress_samples = end_sample - sample;
+ tile.sample = end_sample;
+ task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
+ break;
+ }
+ }
+
+ task.update_progress(&tile, tile.w * tile.h);
+ }
+ if (use_coverage) {
+ coverage.finalize();
+ }
+
+ if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
+ adaptive_sampling_post(tile, kg);
+ }
+}
+
+void CPUDevice::thread_render(DeviceTask &task)
+{
+ if (TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ return;
+ }
+
+ /* allocate buffer for kernel globals */
+ CPUKernelThreadGlobals kg(kernel_globals, get_cpu_osl_memory());
+
+ profiler.add_state(&kg.profiler);
+
+ /* NLM denoiser. */
+ DenoisingTask *denoising = NULL;
+
+ /* OpenImageDenoise: we can only denoise with one thread at a time, so to
+ * avoid waiting with mutex locks in the denoiser, we let only a single
+ * thread acquire denoising tiles. */
+ uint tile_types = task.tile_types;
+ bool hold_denoise_lock = false;
+ if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+ if (!oidn_task_lock.try_lock()) {
+ tile_types &= ~RenderTile::DENOISE;
+ hold_denoise_lock = true;
+ }
+ }
+
+ RenderTile tile;
+ while (task.acquire_tile(this, tile, tile_types)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ render(task, tile, &kg);
+ }
+ else if (tile.task == RenderTile::BAKE) {
+ render(task, tile, &kg);
+ }
+ else if (tile.task == RenderTile::DENOISE) {
+ denoise_openimagedenoise(task, tile);
+ task.update_progress(&tile, tile.w * tile.h);
+ }
+
+ task.release_tile(tile);
+
+ if (TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ if (hold_denoise_lock) {
+ oidn_task_lock.unlock();
+ }
+
+ profiler.remove_state(&kg.profiler);
+
+ delete denoising;
+}
+
+void CPUDevice::thread_denoise(DeviceTask &task)
+{
+ RenderTile tile;
+ tile.x = task.x;
+ tile.y = task.y;
+ tile.w = task.w;
+ tile.h = task.h;
+ tile.buffer = task.buffer;
+ tile.sample = task.sample + task.num_samples;
+ tile.num_samples = task.num_samples;
+ tile.start_sample = task.sample;
+ tile.offset = task.offset;
+ tile.stride = task.stride;
+ tile.buffers = task.buffers;
+
+ denoise_openimagedenoise(task, tile);
+
+ task.update_progress(&tile, tile.w * tile.h);
+}
+#endif
+
+const CPUKernels *CPUDevice::get_cpu_kernels() const
+{
+ return &kernels;
+}
+
+void CPUDevice::get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+{
+ /* Ensure latest texture info is loaded into kernel globals before returning. */
+ load_texture_info();
+
+ kernel_thread_globals.clear();
+ void *osl_memory = get_cpu_osl_memory();
+ for (int i = 0; i < info.cpu_threads; i++) {
+ kernel_thread_globals.emplace_back(kernel_globals, osl_memory, profiler);
+ }
+}
+
+void *CPUDevice::get_cpu_osl_memory()
+{
+#ifdef WITH_OSL
+ return &osl_globals;
+#else
+ return NULL;
+#endif
+}
+
+bool CPUDevice::load_kernels(const uint /*kernel_features*/)
+{
+ return true;
+}
+
+CCL_NAMESPACE_END
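
A sketch of how the per-thread globals interface above might be consumed (an
assumption, not from the patch; `parallel_for` stands in for whatever task
scheduler drives the worker threads, and `device` is a CPUDevice pointer):

/* Each worker thread takes its own CPUKernelThreadGlobals, so state which is
 * not thread-safe (e.g. OSL) is never shared between threads. */
vector<CPUKernelThreadGlobals> thread_globals;
device->get_cpu_kernel_thread_globals(thread_globals);

parallel_for(0, device->info.cpu_threads, [&](int thread_index) {
  CPUKernelThreadGlobals *kg = &thread_globals[thread_index];
  kg->start_profiling();
  /* ... run integrator kernels against kg ... */
  kg->stop_profiling();
});
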
diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h
new file mode 100644
index 00000000000..7d222808652
--- /dev/null
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* So ImathMath is included before our kernel_cpu_compat. */
+#ifdef WITH_OSL
+/* So no context pollution happens from indirectly included windows.h */
+# include "util/util_windows.h"
+# include <OSL/oslexec.h>
+#endif
+
+#ifdef WITH_EMBREE
+# include <embree3/rtcore.h>
+#endif
+
+#include "device/cpu/kernel.h"
+#include "device/device.h"
+#include "device/device_memory.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/kernel.h"
+#include "kernel/device/cpu/globals.h"
+
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+class CPUDevice : public Device {
+ public:
+ KernelGlobals kernel_globals;
+
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+
+#ifdef WITH_OSL
+ OSLGlobals osl_globals;
+#endif
+#ifdef WITH_EMBREE
+ RTCScene embree_scene = NULL;
+ RTCDevice embree_device;
+#endif
+
+ CPUKernels kernels;
+
+ CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
+ ~CPUDevice();
+
+ virtual bool show_samples() const override;
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+ /* Returns true if the texture info was copied to the device (meaning, some more
+ * re-initialization might be needed). */
+ bool load_texture_info();
+
+ virtual void mem_alloc(device_memory &mem) override;
+ virtual void mem_copy_to(device_memory &mem) override;
+ virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+ virtual void mem_zero(device_memory &mem) override;
+ virtual void mem_free(device_memory &mem) override;
+ virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void global_alloc(device_memory &mem);
+ void global_free(device_memory &mem);
+
+ void tex_alloc(device_texture &mem);
+ void tex_free(device_texture &mem);
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
+
+ virtual const CPUKernels *get_cpu_kernels() const override;
+ virtual void get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
+ virtual void *get_cpu_osl_memory() override;
+
+ protected:
+ virtual bool load_kernels(uint /*kernel_features*/) override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp
new file mode 100644
index 00000000000..91282390e27
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/kernel.h"
+
+#include "kernel/device/cpu/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_FUNCTIONS(name) \
+ KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
+ KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \
+ KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
+
+#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
+
+CPUKernels::CPUKernels()
+ : /* Integrator. */
+ REGISTER_KERNEL(integrator_init_from_camera),
+ REGISTER_KERNEL(integrator_init_from_bake),
+ REGISTER_KERNEL(integrator_intersect_closest),
+ REGISTER_KERNEL(integrator_intersect_shadow),
+ REGISTER_KERNEL(integrator_intersect_subsurface),
+ REGISTER_KERNEL(integrator_intersect_volume_stack),
+ REGISTER_KERNEL(integrator_shade_background),
+ REGISTER_KERNEL(integrator_shade_light),
+ REGISTER_KERNEL(integrator_shade_shadow),
+ REGISTER_KERNEL(integrator_shade_surface),
+ REGISTER_KERNEL(integrator_shade_volume),
+ REGISTER_KERNEL(integrator_megakernel),
+ /* Shader evaluation. */
+ REGISTER_KERNEL(shader_eval_displace),
+ REGISTER_KERNEL(shader_eval_background),
+ /* Adaptive sampling. */
+ REGISTER_KERNEL(adaptive_sampling_convergence_check),
+ REGISTER_KERNEL(adaptive_sampling_filter_x),
+ REGISTER_KERNEL(adaptive_sampling_filter_y),
+ /* Cryptomatte. */
+ REGISTER_KERNEL(cryptomatte_postprocess),
+ /* Bake. */
+ REGISTER_KERNEL(bake)
+{
+}
+
+#undef REGISTER_KERNEL
+#undef KERNEL_FUNCTIONS
+
+CCL_NAMESPACE_END
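
For reference, once both macros are applied, a single member initializer expands
to roughly the following (`KERNEL_NAME_EVAL` generates the per-microarchitecture
entry point names):

/* REGISTER_KERNEL(integrator_megakernel) becomes, approximately: */
integrator_megakernel(kernel_cpu_integrator_megakernel,
                      kernel_cpu_sse2_integrator_megakernel,
                      kernel_cpu_sse3_integrator_megakernel,
                      kernel_cpu_sse41_integrator_megakernel,
                      kernel_cpu_avx_integrator_megakernel,
                      kernel_cpu_avx2_integrator_megakernel)
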
diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h
new file mode 100644
index 00000000000..54b18308544
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/cpu/kernel_function.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelGlobals;
+struct IntegratorStateCPU;
+struct TileInfo;
+
+class CPUKernels {
+ public:
+ /* Integrator. */
+
+ using IntegratorFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg, IntegratorStateCPU *state)>;
+ using IntegratorShadeFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer)>;
+ using IntegratorInitFunction = CPUKernelFunction<bool (*)(const KernelGlobals *kg,
+ IntegratorStateCPU *state,
+ KernelWorkTile *tile,
+ ccl_global float *render_buffer)>;
+
+ IntegratorInitFunction integrator_init_from_camera;
+ IntegratorInitFunction integrator_init_from_bake;
+ IntegratorFunction integrator_intersect_closest;
+ IntegratorFunction integrator_intersect_shadow;
+ IntegratorFunction integrator_intersect_subsurface;
+ IntegratorFunction integrator_intersect_volume_stack;
+ IntegratorShadeFunction integrator_shade_background;
+ IntegratorShadeFunction integrator_shade_light;
+ IntegratorShadeFunction integrator_shade_shadow;
+ IntegratorShadeFunction integrator_shade_surface;
+ IntegratorShadeFunction integrator_shade_volume;
+ IntegratorShadeFunction integrator_megakernel;
+
+ /* Shader evaluation. */
+
+ using ShaderEvalFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, const KernelShaderEvalInput *, float4 *, const int)>;
+
+ ShaderEvalFunction shader_eval_displace;
+ ShaderEvalFunction shader_eval_background;
+
+ /* Adaptive stopping. */
+
+ using AdaptiveSamplingConvergenceCheckFunction =
+ CPUKernelFunction<bool (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)>;
+
+ using AdaptiveSamplingFilterXFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)>;
+
+ using AdaptiveSamplingFilterYFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)>;
+
+ AdaptiveSamplingConvergenceCheckFunction adaptive_sampling_convergence_check;
+
+ AdaptiveSamplingFilterXFunction adaptive_sampling_filter_x;
+ AdaptiveSamplingFilterYFunction adaptive_sampling_filter_y;
+
+ /* Cryptomatte. */
+
+ using CryptomattePostprocessFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, ccl_global float *render_buffer, int pixel_index)>;
+
+ CryptomattePostprocessFunction cryptomatte_postprocess;
+
+ /* Bake. */
+
+ CPUKernelFunction<void (*)(const KernelGlobals *, float *, int, int, int, int, int)> bake;
+
+ CPUKernels();
+};
+
+CCL_NAMESPACE_END
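
A hypothetical call site for these function objects (a sketch under the
assumption that `kg`, `state` and `render_buffer` were set up elsewhere; in
practice the integrator scheduler decides which kernel to run next based on
the queued path state):

/* The CPUKernelFunction call operator forwards the arguments to the
 * microarchitecture variant that was selected at construction time. */
void integrate_one_path(const CPUKernels &kernels,
                        const KernelGlobals *kg,
                        IntegratorStateCPU *state,
                        float *render_buffer)
{
  kernels.integrator_intersect_closest(kg, state);
  kernels.integrator_shade_surface(kg, state, render_buffer);
}
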
diff --git a/intern/cycles/device/cpu/kernel_function.h b/intern/cycles/device/cpu/kernel_function.h
new file mode 100644
index 00000000000..aa18720cc24
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_function.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_debug.h"
+#include "util/util_system.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* A wrapper around per-microarchitecture variant of a kernel function.
+ *
+ * Provides a function-call-like API which gets routed to the most suitable implementation.
+ *
+ * For example, on a computer that only supports SSE4.1, the kernel_sse41 variant will be used. */
+template<typename FunctionType> class CPUKernelFunction {
+ public:
+ CPUKernelFunction(FunctionType kernel_default,
+ FunctionType kernel_sse2,
+ FunctionType kernel_sse3,
+ FunctionType kernel_sse41,
+ FunctionType kernel_avx,
+ FunctionType kernel_avx2)
+ {
+ kernel_info_ = get_best_kernel_info(
+ kernel_default, kernel_sse2, kernel_sse3, kernel_sse41, kernel_avx, kernel_avx2);
+ }
+
+ template<typename... Args> inline auto operator()(Args... args) const
+ {
+ assert(kernel_info_.kernel);
+
+ return kernel_info_.kernel(args...);
+ }
+
+ const char *get_uarch_name() const
+ {
+ return kernel_info_.uarch_name;
+ }
+
+ protected:
+  /* Helper class which allows passing a human-readable microarchitecture name together with a
+   * function pointer. */
+ class KernelInfo {
+ public:
+ KernelInfo() : KernelInfo("", nullptr)
+ {
+ }
+
+    /* TODO(sergey): Use a string view, to get higher-level functionality (e.g. comparison)
+     * without memory allocation. */
+ KernelInfo(const char *uarch_name, FunctionType kernel)
+ : uarch_name(uarch_name), kernel(kernel)
+ {
+ }
+
+ const char *uarch_name;
+ FunctionType kernel;
+ };
+
+ KernelInfo get_best_kernel_info(FunctionType kernel_default,
+ FunctionType kernel_sse2,
+ FunctionType kernel_sse3,
+ FunctionType kernel_sse41,
+ FunctionType kernel_avx,
+ FunctionType kernel_avx2)
+ {
+ /* Silence warnings about unused variables when compiling without some architectures. */
+ (void)kernel_sse2;
+ (void)kernel_sse3;
+ (void)kernel_sse41;
+ (void)kernel_avx;
+ (void)kernel_avx2;
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
+ return KernelInfo("AVX2", kernel_avx2);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+ if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
+ return KernelInfo("AVX", kernel_avx);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+ if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
+ return KernelInfo("SSE4.1", kernel_sse41);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+ if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
+ return KernelInfo("SSE3", kernel_sse3);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
+ return KernelInfo("SSE2", kernel_sse2);
+ }
+#endif
+
+ return KernelInfo("default", kernel_default);
+ }
+
+ KernelInfo kernel_info_;
+};
+
+CCL_NAMESPACE_END
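
The same dispatch pattern, reduced to a stand-alone example (not Cycles code;
`sum_scalar` and `sum_avx2` are hypothetical kernels, with the AVX2 variant
compiled in a separate translation unit using the matching compiler flags):

float sum_scalar(const float *v, int n); /* default variant */
float sum_avx2(const float *v, int n);   /* compiled with AVX2 flags */

/* The best variant is picked once, at construction time, so every call below
 * costs a single indirect call with no per-call feature detection. */
static CPUKernelFunction<float (*)(const float *, int)> sum_dispatch(
    sum_scalar, sum_scalar, sum_scalar, sum_scalar, sum_scalar, sum_avx2);

float sum(const float *samples, int num_samples)
{
  return sum_dispatch(samples, num_samples);
}
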
diff --git a/intern/cycles/device/cpu/kernel_thread_globals.cpp b/intern/cycles/device/cpu/kernel_thread_globals.cpp
new file mode 100644
index 00000000000..988b00cd1f0
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_thread_globals.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/kernel_thread_globals.h"
+
+// clang-format off
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+#include "util/util_profiling.h"
+
+CCL_NAMESPACE_BEGIN
+
+CPUKernelThreadGlobals::CPUKernelThreadGlobals(const KernelGlobals &kernel_globals,
+ void *osl_globals_memory,
+ Profiler &cpu_profiler)
+ : KernelGlobals(kernel_globals), cpu_profiler_(cpu_profiler)
+{
+ reset_runtime_memory();
+
+#ifdef WITH_OSL
+ OSLShader::thread_init(this, reinterpret_cast<OSLGlobals *>(osl_globals_memory));
+#else
+ (void)osl_globals_memory;
+#endif
+}
+
+CPUKernelThreadGlobals::CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept
+ : KernelGlobals(std::move(other)), cpu_profiler_(other.cpu_profiler_)
+{
+ other.reset_runtime_memory();
+}
+
+CPUKernelThreadGlobals::~CPUKernelThreadGlobals()
+{
+#ifdef WITH_OSL
+ OSLShader::thread_free(this);
+#endif
+}
+
+CPUKernelThreadGlobals &CPUKernelThreadGlobals::operator=(CPUKernelThreadGlobals &&other)
+{
+ if (this == &other) {
+ return *this;
+ }
+
+ *static_cast<KernelGlobals *>(this) = *static_cast<KernelGlobals *>(&other);
+
+ other.reset_runtime_memory();
+
+ return *this;
+}
+
+void CPUKernelThreadGlobals::reset_runtime_memory()
+{
+#ifdef WITH_OSL
+ osl = nullptr;
+#endif
+}
+
+void CPUKernelThreadGlobals::start_profiling()
+{
+ cpu_profiler_.add_state(&profiler);
+}
+
+void CPUKernelThreadGlobals::stop_profiling()
+{
+ cpu_profiler_.remove_state(&profiler);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel_thread_globals.h b/intern/cycles/device/cpu/kernel_thread_globals.h
new file mode 100644
index 00000000000..d005c3bb56c
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_thread_globals.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Profiler;
+
+/* A special class which extends `KernelGlobals`, taking over ownership of any resource that is
+ * not thread-safe for concurrent access. Every worker thread which needs to operate on
+ * `KernelGlobals` needs to initialize its own copy of this object.
+ *
+ * NOTE: Only a minimal subset of objects is copied: `KernelData` is never copied. This means
+ * that no unnecessary data duplication happens when using this object. */
+class CPUKernelThreadGlobals : public KernelGlobals {
+ public:
+  /* TODO(sergey): Would be nice to have a properly typed OSLGlobals even when building without
+   * OSL support. That would avoid the need for these unnamed pointers and casts. */
+ CPUKernelThreadGlobals(const KernelGlobals &kernel_globals,
+ void *osl_globals_memory,
+ Profiler &cpu_profiler);
+
+ ~CPUKernelThreadGlobals();
+
+ CPUKernelThreadGlobals(const CPUKernelThreadGlobals &other) = delete;
+ CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept;
+
+ CPUKernelThreadGlobals &operator=(const CPUKernelThreadGlobals &other) = delete;
+ CPUKernelThreadGlobals &operator=(CPUKernelThreadGlobals &&other);
+
+ void start_profiling();
+ void stop_profiling();
+
+ protected:
+ void reset_runtime_memory();
+
+ Profiler &cpu_profiler_;
+};
+
+CCL_NAMESPACE_END
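
Because the copy operations are deleted, containers of these objects must
construct elements in place and may only move them. A minimal sketch, mirroring
`CPUDevice::get_cpu_kernel_thread_globals()` above (`num_threads`,
`kernel_globals`, `osl_memory` and `profiler` are assumed to come from the
surrounding device):

vector<CPUKernelThreadGlobals> per_thread;
/* Reserving up front avoids move-construction during vector growth. */
per_thread.reserve(num_threads);
for (int i = 0; i < num_threads; i++) {
  per_thread.emplace_back(kernel_globals, osl_memory, profiler);
}
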
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/cuda/device.cpp
index 2e225ecfaf8..84becd6d081 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/cuda/device.cpp
@@ -14,21 +14,25 @@
* limitations under the License.
*/
-#ifdef WITH_CUDA
+#include "device/cuda/device.h"
+
+#include "util/util_logging.h"
-# include "device/cuda/device_cuda.h"
+#ifdef WITH_CUDA
+# include "device/cuda/device_impl.h"
# include "device/device.h"
-# include "device/device_intern.h"
-# include "util/util_logging.h"
# include "util/util_string.h"
# include "util/util_windows.h"
+#endif /* WITH_CUDA */
CCL_NAMESPACE_BEGIN
bool device_cuda_init()
{
-# ifdef WITH_CUDA_DYNLOAD
+#if !defined(WITH_CUDA)
+ return false;
+#elif defined(WITH_CUDA_DYNLOAD)
static bool initialized = false;
static bool result = false;
@@ -59,16 +63,27 @@ bool device_cuda_init()
}
return result;
-# else /* WITH_CUDA_DYNLOAD */
+#else /* WITH_CUDA_DYNLOAD */
return true;
-# endif /* WITH_CUDA_DYNLOAD */
+#endif /* WITH_CUDA_DYNLOAD */
}
-Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
- return new CUDADevice(info, stats, profiler, background);
+#ifdef WITH_CUDA
+ return new CUDADevice(info, stats, profiler);
+#else
+ (void)info;
+ (void)stats;
+ (void)profiler;
+
+ LOG(FATAL) << "Request to create CUDA device without compiled-in support. Should never happen.";
+
+ return nullptr;
+#endif
}
+#ifdef WITH_CUDA
static CUresult device_cuda_safe_init()
{
# ifdef _WIN32
@@ -86,9 +101,11 @@ static CUresult device_cuda_safe_init()
return cuInit(0);
# endif
}
+#endif /* WITH_CUDA */
void device_cuda_info(vector<DeviceInfo> &devices)
{
+#ifdef WITH_CUDA
CUresult result = device_cuda_safe_init();
if (result != CUDA_SUCCESS) {
if (result != CUDA_ERROR_NO_DEVICE)
@@ -129,9 +146,9 @@ void device_cuda_info(vector<DeviceInfo> &devices)
info.has_half_images = (major >= 3);
info.has_nanovdb = true;
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.denoisers = DENOISER_NLM;
+ info.denoisers = 0;
+
+ info.has_gpu_queue = true;
/* Check if the device has P2P access to any other device in the system. */
for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
@@ -182,10 +199,14 @@ void device_cuda_info(vector<DeviceInfo> &devices)
if (!display_devices.empty())
devices.insert(devices.end(), display_devices.begin(), display_devices.end());
+#else /* WITH_CUDA */
+ (void)devices;
+#endif /* WITH_CUDA */
}
string device_cuda_capabilities()
{
+#ifdef WITH_CUDA
CUresult result = device_cuda_safe_init();
if (result != CUDA_SUCCESS) {
if (result != CUDA_ERROR_NO_DEVICE) {
@@ -310,8 +331,10 @@ string device_cuda_capabilities()
}
return capabilities;
+
+#else /* WITH_CUDA */
+ return "";
+#endif /* WITH_CUDA */
}
CCL_NAMESPACE_END
-
-#endif
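
The guard structure used throughout this file, reduced to a skeleton (an
illustration, not part of the patch; `WITH_FOO`, `device_foo_init` and
`FooDevice` are placeholder names). The entry points are now always compiled
and merely degrade to stubs when the backend is disabled, so callers no longer
need to repeat the #ifdef:

bool device_foo_init()
{
#if !defined(WITH_FOO)
  return false; /* Backend compiled out; the API still exists. */
#else
  /* ... real one-time initialization ... */
  return true;
#endif
}

Device *device_foo_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
#ifdef WITH_FOO
  return new FooDevice(info, stats, profiler);
#else
  (void)info;
  (void)stats;
  (void)profiler;
  LOG(FATAL) << "Request to create FOO device without compiled-in support.";
  return nullptr;
#endif
}
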
diff --git a/intern/cycles/device/cuda/device.h b/intern/cycles/device/cuda/device.h
new file mode 100644
index 00000000000..b0484904d1a
--- /dev/null
+++ b/intern/cycles/device/cuda/device.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_cuda_init();
+
+Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_cuda_info(vector<DeviceInfo> &devices);
+
+string device_cuda_capabilities();
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
deleted file mode 100644
index c3271c3cfcf..00000000000
--- a/intern/cycles/device/cuda/device_cuda.h
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_CUDA
-
-# include "device/device.h"
-# include "device/device_denoising.h"
-# include "device/device_split_kernel.h"
-
-# include "util/util_map.h"
-# include "util/util_task.h"
-
-# ifdef WITH_CUDA_DYNLOAD
-# include "cuew.h"
-# else
-# include "util/util_opengl.h"
-# include <cuda.h>
-# include <cudaGL.h>
-# endif
-
-CCL_NAMESPACE_BEGIN
-
-class CUDASplitKernel;
-
-class CUDADevice : public Device {
-
- friend class CUDASplitKernelFunction;
- friend class CUDASplitKernel;
- friend class CUDAContextScope;
-
- public:
- DedicatedTaskPool task_pool;
- CUdevice cuDevice;
- CUcontext cuContext;
- CUmodule cuModule, cuFilterModule;
- size_t device_texture_headroom;
- size_t device_working_headroom;
- bool move_texture_to_host;
- size_t map_host_used;
- size_t map_host_limit;
- int can_map_host;
- int pitch_alignment;
- int cuDevId;
- int cuDevArchitecture;
- bool first_error;
- CUDASplitKernel *split_kernel;
-
- struct CUDAMem {
- CUDAMem() : texobject(0), array(0), use_mapped_host(false)
- {
- }
-
- CUtexObject texobject;
- CUarray array;
-
- /* If true, a mapped host memory in shared_pointer is being used. */
- bool use_mapped_host;
- };
- typedef map<device_memory *, CUDAMem> CUDAMemMap;
- CUDAMemMap cuda_mem_map;
- thread_mutex cuda_mem_map_mutex;
-
- struct PixelMem {
- GLuint cuPBO;
- CUgraphicsResource cuPBOresource;
- GLuint cuTexId;
- int w, h;
- };
- map<device_ptr, PixelMem> pixel_mem_map;
-
- /* Bindless Textures */
- device_vector<TextureInfo> texture_info;
- bool need_texture_info;
-
- /* Kernels */
- struct {
- bool loaded;
-
- CUfunction adaptive_stopping;
- CUfunction adaptive_filter_x;
- CUfunction adaptive_filter_y;
- CUfunction adaptive_scale_samples;
- int adaptive_num_threads_per_block;
- } functions;
-
- static bool have_precompiled_kernels();
-
- virtual bool show_samples() const override;
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override;
-
- void set_error(const string &error) override;
-
- CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_);
-
- virtual ~CUDADevice();
-
- bool support_device(const DeviceRequestedFeatures & /*requested_features*/);
-
- bool check_peer_access(Device *peer_device) override;
-
- bool use_adaptive_compilation();
-
- bool use_split_kernel();
-
- virtual string compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures &requested_features, bool filter = false, bool split = false);
-
- string compile_kernel(const DeviceRequestedFeatures &requested_features,
- const char *name,
- const char *base = "cuda",
- bool force_ptx = false);
-
- virtual bool load_kernels(const DeviceRequestedFeatures &requested_features) override;
-
- void load_functions();
-
- void reserve_local_memory(const DeviceRequestedFeatures &requested_features);
-
- void init_host_memory();
-
- void load_texture_info();
-
- void move_textures_to_host(size_t size, bool for_texture);
-
- CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
-
- void generic_copy_to(device_memory &mem);
-
- void generic_free(device_memory &mem);
-
- void mem_alloc(device_memory &mem) override;
-
- void mem_copy_to(device_memory &mem) override;
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
-
- void mem_zero(device_memory &mem) override;
-
- void mem_free(device_memory &mem) override;
-
- device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
-
- virtual void const_copy_to(const char *name, void *host, size_t size) override;
-
- void global_alloc(device_memory &mem);
-
- void global_free(device_memory &mem);
-
- void tex_alloc(device_texture &mem);
-
- void tex_free(device_texture &mem);
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task);
-
- bool denoising_construct_transform(DenoisingTask *task);
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task);
-
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task);
-
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task);
-
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task);
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task);
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task);
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task);
-
- void denoise(RenderTile &rtile, DenoisingTask &denoising);
-
- void adaptive_sampling_filter(uint filter_sample,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream = 0);
- void adaptive_sampling_post(RenderTile &rtile,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream = 0);
-
- void render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles);
-
- void film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half);
-
- void shader(DeviceTask &task);
-
- CUdeviceptr map_pixels(device_ptr mem);
-
- void unmap_pixels(device_ptr mem);
-
- void pixels_alloc(device_memory &mem);
-
- void pixels_copy_from(device_memory &mem, int y, int w, int h);
-
- void pixels_free(device_memory &mem);
-
- void draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params) override;
-
- void thread_run(DeviceTask &task);
-
- virtual void task_add(DeviceTask &task) override;
-
- virtual void task_wait() override;
-
- virtual void task_cancel() override;
-};
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
deleted file mode 100644
index 2d2fcb38705..00000000000
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ /dev/null
@@ -1,2714 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_CUDA
-
-# include <climits>
-# include <limits.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <string.h>
-
-# include "device/cuda/device_cuda.h"
-# include "device/device_intern.h"
-# include "device/device_split_kernel.h"
-
-# include "render/buffers.h"
-
-# include "kernel/filter/filter_defines.h"
-
-# include "util/util_debug.h"
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_map.h"
-# include "util/util_md5.h"
-# include "util/util_opengl.h"
-# include "util/util_path.h"
-# include "util/util_string.h"
-# include "util/util_system.h"
-# include "util/util_time.h"
-# include "util/util_types.h"
-# include "util/util_windows.h"
-
-# include "kernel/split/kernel_split_data_types.h"
-
-CCL_NAMESPACE_BEGIN
-
-# ifndef WITH_CUDA_DYNLOAD
-
-/* Transparently implement some functions, so majority of the file does not need
- * to worry about difference between dynamically loaded and linked CUDA at all.
- */
-
-namespace {
-
-const char *cuewErrorString(CUresult result)
-{
- /* We can only give error code here without major code duplication, that
- * should be enough since dynamic loading is only being disabled by folks
- * who knows what they're doing anyway.
- *
- * NOTE: Avoid call from several threads.
- */
- static string error;
- error = string_printf("%d", result);
- return error.c_str();
-}
-
-const char *cuewCompilerPath()
-{
- return CYCLES_CUDA_NVCC_EXECUTABLE;
-}
-
-int cuewCompilerVersion()
-{
- return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
-}
-
-} /* namespace */
-# endif /* WITH_CUDA_DYNLOAD */
-
-class CUDADevice;
-
-class CUDASplitKernel : public DeviceSplitKernel {
- CUDADevice *device;
-
- public:
- explicit CUDASplitKernel(CUDADevice *device);
-
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs);
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
-};
-
-/* Utility to push/pop CUDA context. */
-class CUDAContextScope {
- public:
- CUDAContextScope(CUDADevice *device);
- ~CUDAContextScope();
-
- private:
- CUDADevice *device;
-};
-
-bool CUDADevice::have_precompiled_kernels()
-{
- string cubins_path = path_get("lib");
- return path_exists(cubins_path);
-}
-
-bool CUDADevice::show_samples() const
-{
- /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
- return true;
-}
-
-BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
-{
- return BVH_LAYOUT_BVH2;
-}
-
-void CUDADevice::set_error(const string &error)
-{
- Device::set_error(error);
-
- if (first_error) {
- fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
- fprintf(stderr,
- "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
- first_error = false;
- }
-}
-
-# define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
- if (result != CUDA_SUCCESS) { \
- const char *name = cuewErrorString(result); \
- set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
- } \
- } \
- (void)0
-
-CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
- : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL)
-{
- first_error = true;
- background = background_;
-
- cuDevId = info.num;
- cuDevice = 0;
- cuContext = 0;
-
- cuModule = 0;
- cuFilterModule = 0;
-
- split_kernel = NULL;
-
- need_texture_info = false;
-
- device_texture_headroom = 0;
- device_working_headroom = 0;
- move_texture_to_host = false;
- map_host_limit = 0;
- map_host_used = 0;
- can_map_host = 0;
- pitch_alignment = 0;
-
- functions.loaded = false;
-
- /* Initialize CUDA. */
- CUresult result = cuInit(0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
- return;
- }
-
- /* Setup device and context. */
- result = cuDeviceGet(&cuDevice, cuDevId);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
- cuewErrorString(result)));
- return;
- }
-
- /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
- * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
- * so we can predict which memory to map to host. */
- cuda_assert(
- cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
-
- cuda_assert(cuDeviceGetAttribute(
- &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
-
- unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
- if (can_map_host) {
- ctx_flags |= CU_CTX_MAP_HOST;
- init_host_memory();
- }
-
- /* Create context. */
- if (background) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- }
- else {
- result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
-
- if (result != CUDA_SUCCESS) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- background = true;
- }
- }
-
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
- return;
- }
-
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
- cuDevArchitecture = major * 100 + minor * 10;
-
- /* Pop context set by cuCtxCreate. */
- cuCtxPopCurrent(NULL);
-}
-
-CUDADevice::~CUDADevice()
-{
- task_pool.cancel();
-
- delete split_kernel;
-
- texture_info.free();
-
- cuda_assert(cuCtxDestroy(cuContext));
-}
-
-bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/)
-{
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* We only support sm_30 and above. */
- if (major < 3) {
- set_error(string_printf(
- "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
- return false;
- }
-
- return true;
-}
-
-bool CUDADevice::check_peer_access(Device *peer_device)
-{
- if (peer_device == this) {
- return false;
- }
- if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
- return false;
- }
-
- CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
-
- int can_access = 0;
- cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
- if (can_access == 0) {
- return false;
- }
-
- /* Ensure array access over the link is possible as well (for 3D textures). */
- cuda_assert(cuDeviceGetP2PAttribute(&can_access,
- CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED,
- cuDevice,
- peer_device_cuda->cuDevice));
- if (can_access == 0) {
- return false;
- }
-
- /* Enable peer access in both directions. */
- {
- const CUDAContextScope scope(this);
- CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
- cuewErrorString(result)));
- return false;
- }
- }
- {
- const CUDAContextScope scope(peer_device_cuda);
- CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
- cuewErrorString(result)));
- return false;
- }
- }
-
- return true;
-}
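-
-/* Note that cuCtxEnablePeerAccess is one-directional, which is why the two
- * scoped blocks above enable access separately: first from this context to
- * the peer context, then from the peer context back to this one. */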
-
-bool CUDADevice::use_adaptive_compilation()
-{
- return DebugFlags().cuda.adaptive_compile;
-}
-
-bool CUDADevice::use_split_kernel()
-{
- return DebugFlags().cuda.split_kernel;
-}
-
-/* Common NVCC flags which stay the same regardless of shading model or
- * kernel source MD5, and depend only on the compiler and compilation settings.
- */
-string CUDADevice::compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures &requested_features, bool filter, bool split)
-{
- const int machine = system_cpu_bits();
- const string source_path = path_get("source");
- const string include_path = source_path;
- string cflags = string_printf(
- "-m%d "
- "--ptxas-options=\"-v\" "
- "--use_fast_math "
- "-DNVCC "
- "-I\"%s\"",
- machine,
- include_path.c_str());
- if (!filter && use_adaptive_compilation()) {
- cflags += " " + requested_features.get_build_options();
- }
- const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
- if (extra_cflags) {
- cflags += string(" ") + string(extra_cflags);
- }
-
- if (split) {
- cflags += " -D__SPLIT__";
- }
-
-# ifdef WITH_NANOVDB
- cflags += " -DWITH_NANOVDB";
-# endif
-
- return cflags;
-}
-
-string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features,
- const char *name,
- const char *base,
- bool force_ptx)
-{
- /* Compute kernel name. */
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* Attempt to use kernel provided with Blender. */
- if (!use_adaptive_compilation()) {
- if (!force_ptx) {
- const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
- VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
- if (path_exists(cubin)) {
- VLOG(1) << "Using precompiled kernel.";
- return cubin;
- }
- }
-
- /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
- int ptx_major = major, ptx_minor = minor;
- while (ptx_major >= 3) {
- const string ptx = path_get(
- string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
- VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
- if (path_exists(ptx)) {
- VLOG(1) << "Using precompiled kernel.";
- return ptx;
- }
-
- if (ptx_minor > 0) {
- ptx_minor--;
- }
- else {
- ptx_major--;
- ptx_minor = 9;
- }
- }
- }
-
- /* Try to use locally compiled kernel. */
- string source_path = path_get("source");
- const string source_md5 = path_files_md5_hash(source_path);
-
- /* We include the cflags in the MD5 hash, so that changing the CUDA toolkit
- * or other compiler command line arguments triggers a rebuild of the cubin.
- */
- string common_cflags = compile_kernel_get_common_cflags(
- requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL);
- const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
-
- const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
- const char *const kernel_arch = force_ptx ? "compute" : "sm";
- const string cubin_file = string_printf(
- "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
- const string cubin = path_cache_get(path_join("kernels", cubin_file));
- VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
- if (path_exists(cubin)) {
- VLOG(1) << "Using locally compiled kernel.";
- return cubin;
- }
-
-# ifdef _WIN32
- if (!use_adaptive_compilation() && have_precompiled_kernels()) {
- if (major < 3) {
- set_error(
- string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
- "Your GPU is not supported.",
- major,
- minor));
- }
- else {
- set_error(
- string_printf("CUDA binary kernel for this graphics card compute "
- "capability (%d.%d) not found.",
- major,
- minor));
- }
- return string();
- }
-# endif
-
- /* Compile. */
- const char *const nvcc = cuewCompilerPath();
- if (nvcc == NULL) {
- set_error(
- "CUDA nvcc compiler not found. "
- "Install CUDA toolkit in default location.");
- return string();
- }
-
- const int nvcc_cuda_version = cuewCompilerVersion();
- VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
- if (nvcc_cuda_version < 101) {
- printf(
- "Unsupported CUDA version %d.%d detected, "
- "you need CUDA 10.1 or newer.\n",
- nvcc_cuda_version / 10,
- nvcc_cuda_version % 10);
- return string();
- }
- else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 ||
- nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) {
- printf(
- "CUDA version %d.%d detected, build may succeed but only "
- "CUDA 10.1 to 11.4 are officially supported.\n",
- nvcc_cuda_version / 10,
- nvcc_cuda_version % 10);
- }
-
- double starttime = time_dt();
-
- path_create_directories(cubin);
-
- source_path = path_join(path_join(source_path, "kernel"),
- path_join("kernels", path_join(base, string_printf("%s.cu", name))));
-
- string command = string_printf(
- "\"%s\" "
- "-arch=%s_%d%d "
- "--%s \"%s\" "
- "-o \"%s\" "
- "%s",
- nvcc,
- kernel_arch,
- major,
- minor,
- kernel_ext,
- source_path.c_str(),
- cubin.c_str(),
- common_cflags.c_str());
-
- printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
-
-# ifdef _WIN32
- command = "call " + command;
-# endif
- if (system(command.c_str()) != 0) {
- set_error(
- "Failed to execute compilation command, "
- "see console for details.");
- return string();
- }
-
- /* Verify that compilation succeeded. */
- if (!path_exists(cubin)) {
- set_error(
- "CUDA kernel compilation failed, "
- "see console for details.");
- return string();
- }
-
- printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
-
- return cubin;
-}
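-
-/* For illustration, cuewCompilerVersion() used above packs major and minor
- * into a single integer: CUDA 10.1 is reported as 101 and CUDA 11.4 as 114,
- * and the version checks recover major and minor as version / 10 and
- * version % 10 respectively. */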
-
-bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
- /* TODO(sergey): Support kernels re-load for CUDA devices.
- *
- * Currently re-loading kernel will invalidate memory pointers,
- * causing problems in cuCtxSynchronize.
- */
- if (cuFilterModule && cuModule) {
- VLOG(1) << "Skipping kernel reload, not currently supported.";
- return true;
- }
-
- /* Check if CUDA initialization succeeded. */
- if (cuContext == 0)
- return false;
-
- /* Check if the GPU is supported. */
- if (!support_device(requested_features))
- return false;
-
- /* Get kernel. */
- const char *kernel_name = use_split_kernel() ? "kernel_split" : "kernel";
- string cubin = compile_kernel(requested_features, kernel_name);
- if (cubin.empty())
- return false;
-
- const char *filter_name = "filter";
- string filter_cubin = compile_kernel(requested_features, filter_name);
- if (filter_cubin.empty())
- return false;
-
- /* Open module. */
- CUDAContextScope scope(this);
-
- string cubin_data;
- CUresult result;
-
- if (path_read_text(cubin, cubin_data))
- result = cuModuleLoadData(&cuModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if (result != CUDA_SUCCESS)
- set_error(string_printf(
- "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
-
- if (path_read_text(filter_cubin, cubin_data))
- result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if (result != CUDA_SUCCESS)
- set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)",
- filter_cubin.c_str(),
- cuewErrorString(result)));
-
- if (result == CUDA_SUCCESS) {
- reserve_local_memory(requested_features);
- }
-
- load_functions();
-
- return (result == CUDA_SUCCESS);
-}
-
-void CUDADevice::load_functions()
-{
- /* TODO: load all functions here. */
- if (functions.loaded) {
- return;
- }
- functions.loaded = true;
-
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples"));
-
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1));
-
- int unused_min_blocks;
- cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks,
- &functions.adaptive_num_threads_per_block,
- functions.adaptive_scale_samples,
- NULL,
- 0,
- 0));
-}
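-
-/* cuOccupancyMaxPotentialBlockSize suggests the block size that maximizes
- * occupancy for the given kernel; only the block size is kept above for the
- * adaptive sampling launches, while the suggested minimal grid size is
- * discarded. */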
-
-void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features)
-{
- if (use_split_kernel()) {
- /* The split kernel mostly uses global memory and adaptive compilation,
- * so it is currently difficult to predict how much local memory is needed. */
- return;
- }
-
- /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
- * needed for kernel launches, so that we can reliably figure out when
- * to allocate scene data in mapped host memory. */
- CUDAContextScope scope(this);
-
- size_t total = 0, free_before = 0, free_after = 0;
- cuMemGetInfo(&free_before, &total);
-
- /* Get kernel function. */
- CUfunction cuRender;
-
- if (requested_features.use_baking) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
- }
- else if (requested_features.use_integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
-
- int min_blocks, num_threads_per_block;
- cuda_assert(
- cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
-
- /* Launch the kernel; using just 1 block appears sufficient to reserve
- * local memory for all multiprocessors. It would still be good to do this
- * in parallel for the multi-GPU case, to make it faster. */
- CUdeviceptr d_work_tiles = 0;
- uint total_work_size = 0;
-
- void *args[] = {&d_work_tiles, &total_work_size};
-
- cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- cuda_assert(cuCtxSynchronize());
-
- cuMemGetInfo(&free_after, &total);
- VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
- << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
-
-# if 0
- /* For testing mapped host memory, fill up device memory. */
- const size_t keep_mb = 1024;
-
- while (free_after > keep_mb * 1024 * 1024LL) {
- CUdeviceptr tmp;
- cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
- cuMemGetInfo(&free_after, &total);
- }
-# endif
-}
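-
-/* The dummy launch above works because CU_CTX_LMEM_RESIZE_TO_MAX (set at
- * context creation) keeps the local memory allocation at its maximum instead
- * of shrinking it again, so the free memory reported afterwards reflects what
- * will actually be available during rendering. */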
-
-void CUDADevice::init_host_memory()
-{
- /* Limit amount of host mapped memory, because allocating too much can
- * cause system instability. Leave at least half or 4 GB of system
- * memory free, whichever is smaller. */
- size_t default_limit = 4 * 1024 * 1024 * 1024LL;
- size_t system_ram = system_physical_ram();
-
- if (system_ram > 0) {
- if (system_ram / 2 > default_limit) {
- map_host_limit = system_ram - default_limit;
- }
- else {
- map_host_limit = system_ram / 2;
- }
- }
- else {
- VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
- map_host_limit = 0;
- }
-
- /* Amount of device memory to keep free after texture memory
- * and working memory allocations, respectively. The working
- * memory headroom is set lower so that some space is still left
- * after all texture memory allocations. */
- device_working_headroom = 32 * 1024 * 1024LL; // 32MB
- device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
-
- VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
- << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
-}
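-
-/* Worked example of the limit above: with 16 GiB of system RAM, half (8 GiB)
- * exceeds the 4 GiB default, so the limit is 16 - 4 = 12 GiB; with 6 GiB of
- * RAM, half (3 GiB) is below 4 GiB, so the limit is 3 GiB. */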
-
-void CUDADevice::load_texture_info()
-{
- if (need_texture_info) {
- /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
- * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
- need_texture_info = false;
- texture_info.copy_to_device();
- }
-}
-
-void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
-{
- /* Break out of recursive call, which can happen when moving memory on a multi device. */
- static bool any_device_moving_textures_to_host = false;
- if (any_device_moving_textures_to_host) {
- return;
- }
-
- /* Signal to reallocate textures in host memory only. */
- move_texture_to_host = true;
-
- while (size > 0) {
- /* Find suitable memory allocation to move. */
- device_memory *max_mem = NULL;
- size_t max_size = 0;
- bool max_is_image = false;
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
- device_memory &mem = *pair.first;
- CUDAMem *cmem = &pair.second;
-
- /* Can only move textures allocated on this device (and not those from peer
- * devices), and we need to ignore memory that is already on the host. */
- if (!mem.is_resident(this) || cmem->use_mapped_host) {
- continue;
- }
-
- bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
- (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- /* Can't move this type of memory. */
- if (!is_texture || cmem->array) {
- continue;
- }
-
- /* For other textures, only move image textures. */
- if (for_texture && !is_image) {
- continue;
- }
-
- /* Try to move largest allocation, prefer moving images. */
- if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
- max_is_image = is_image;
- max_size = mem.device_size;
- max_mem = &mem;
- }
- }
- lock.unlock();
-
- /* Move to host memory. This part is mutex protected since
- * multiple CUDA devices could be moving the memory. The
- * first one will do it, and the rest will adopt the pointer. */
- if (max_mem) {
- VLOG(1) << "Move memory from device to host: " << max_mem->name;
-
- static thread_mutex move_mutex;
- thread_scoped_lock lock(move_mutex);
-
- any_device_moving_textures_to_host = true;
-
- /* Potentially need to call back into the multi device, so that the pointer
- * mapping and peer devices are updated. This is also necessary since the
- * device pointer may just be a key here, so it cannot be accessed and freed
- * directly. Unfortunately it does mean that memory is reallocated on all
- * other devices as well, which is potentially dangerous when still in use
- * (since a thread rendering on another device would only be caught in this
- * mutex if it happens to do an allocation at the same time as well). */
- max_mem->device_copy_to();
- size = (max_size >= size) ? 0 : size - max_size;
-
- any_device_moving_textures_to_host = false;
- }
- else {
- break;
- }
- }
-
- /* Unset flag before texture info is reloaded, since it should stay in device memory. */
- move_texture_to_host = false;
-
- /* Update texture info array with new pointers. */
- load_texture_info();
-}
-
-CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
-{
- CUDAContextScope scope(this);
-
- CUdeviceptr device_pointer = 0;
- size_t size = mem.memory_size() + pitch_padding;
-
- CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
- const char *status = "";
-
- /* First try allocating in device memory, respecting headroom. We make
- * an exception for texture info. It is small and frequently accessed,
- * so treat it as working memory.
- *
- * If there is not enough room for working memory, we will try to move
- * textures to host memory, assuming the performance impact would have
- * been worse for working memory. */
- bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
- size_t total = 0, free = 0;
- cuMemGetInfo(&free, &total);
-
- /* Move textures to host memory if needed. */
- if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
- move_textures_to_host(size + headroom - free, is_texture);
- cuMemGetInfo(&free, &total);
- }
-
- /* Allocate in device memory. */
- if (!move_texture_to_host && (size + headroom) < free) {
- mem_alloc_result = cuMemAlloc(&device_pointer, size);
- if (mem_alloc_result == CUDA_SUCCESS) {
- status = " in device memory";
- }
- }
-
- /* Fall back to mapped host memory if needed and possible. */
-
- void *shared_pointer = 0;
-
- if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
- if (mem.shared_pointer) {
- /* Another device already allocated host memory. */
- mem_alloc_result = CUDA_SUCCESS;
- shared_pointer = mem.shared_pointer;
- }
- else if (map_host_used + size < map_host_limit) {
- /* Allocate host memory ourselves. */
- mem_alloc_result = cuMemHostAlloc(
- &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-
- assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
- (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
- }
-
- if (mem_alloc_result == CUDA_SUCCESS) {
- cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
- map_host_used += size;
- status = " in host memory";
- }
- }
-
- if (mem_alloc_result != CUDA_SUCCESS) {
- if (mem.type == MEM_DEVICE_ONLY) {
- status = " failed, out of device memory";
- set_error("System is out of GPU memory");
- }
- else {
- status = " failed, out of device and host memory";
- set_error("System is out of GPU and shared host memory");
- }
- }
-
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")" << status;
- }
-
- mem.device_pointer = (device_ptr)device_pointer;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- if (!mem.device_pointer) {
- return NULL;
- }
-
- /* Insert into map of allocations. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- CUDAMem *cmem = &cuda_mem_map[&mem];
- if (shared_pointer != 0) {
- /* Replace host pointer with our host allocation. Only works if
- * CUDA memory layout is the same and has no pitch padding. Also
- * does not work if we move textures to host during a render,
- * since other devices might be using the memory. */
-
- if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
- mem.host_pointer != shared_pointer) {
- memcpy(shared_pointer, mem.host_pointer, size);
-
- /* A call to device_memory::host_free() should be preceded by
- * a call to device_memory::device_free() for host memory
- * allocated by a device to be handled properly. Two exceptions
- * are here and a call in OptiXDevice::generic_alloc(), where
- * the current host memory can be assumed to be allocated by
- * device_memory::host_alloc(), not by a device. */
-
- mem.host_free();
- mem.host_pointer = shared_pointer;
- }
- mem.shared_pointer = shared_pointer;
- mem.shared_counter++;
- cmem->use_mapped_host = true;
- }
- else {
- cmem->use_mapped_host = false;
- }
-
- return cmem;
-}
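-
-/* For illustration with hypothetical sizes: with 256 MB free and the 32 MB
- * working headroom, a 200 MB working allocation passes the (size + headroom)
- * < free check and lands in device memory, while a 240 MB allocation first
- * asks move_textures_to_host() to free 240 + 32 - 256 = 16 MB. */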
-
-void CUDADevice::generic_copy_to(device_memory &mem)
-{
- if (!mem.host_pointer || !mem.device_pointer) {
- return;
- }
-
- /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
- * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
- * mem.host_pointer. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(
- cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
- }
-}
-
-void CUDADevice::generic_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
- thread_scoped_lock lock(cuda_mem_map_mutex);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- /* If cmem.use_mapped_host is true, reference counting is used
- * to safely free mapped host memory. */
-
- if (cmem.use_mapped_host) {
- assert(mem.shared_pointer);
- if (mem.shared_pointer) {
- assert(mem.shared_counter > 0);
- if (--mem.shared_counter == 0) {
- if (mem.host_pointer == mem.shared_pointer) {
- mem.host_pointer = 0;
- }
- cuMemFreeHost(mem.shared_pointer);
- mem.shared_pointer = 0;
- }
- }
- map_host_used -= mem.device_size;
- }
- else {
- /* Free device memory. */
- cuda_assert(cuMemFree(mem.device_pointer));
- }
-
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
-}
-
-void CUDADevice::mem_alloc(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- }
- else if (mem.type == MEM_GLOBAL) {
- assert(!"mem_alloc not supported for global memory.");
- }
- else {
- generic_alloc(mem);
- }
-}
-
-void CUDADevice::mem_copy_to(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else {
- if (!mem.device_pointer) {
- generic_alloc(mem);
- }
- generic_copy_to(mem);
- }
-}
-
-void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_copy_from(mem, y, w, h);
- }
- else if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
- assert(!"mem_copy_from not supported for textures.");
- }
- else if (mem.host_pointer) {
- const size_t size = elem * w * h;
- const size_t offset = elem * y * w;
-
- if (mem.device_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(cuMemcpyDtoH(
- (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
- }
- else {
- memset((char *)mem.host_pointer + offset, 0, size);
- }
- }
-}
-
-void CUDADevice::mem_zero(device_memory &mem)
-{
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
- if (!mem.device_pointer) {
- return;
- }
-
- /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
- * regardless of mem.host_pointer and mem.shared_pointer. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
- }
- else if (mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
-}
-
-void CUDADevice::mem_free(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_free(mem);
- }
- else if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else {
- generic_free(mem);
- }
-}
-
-device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
-{
- return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
-}
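-
-/* For illustration, assuming memory_elements_size(offset) returns the byte
- * size of 'offset' elements: for a float4 buffer, a sub-pointer at offset 10
- * is device_pointer + 10 * sizeof(float4). */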
-
-void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
-{
- CUDAContextScope scope(this);
- CUdeviceptr mem;
- size_t bytes;
-
- cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
- // assert(bytes == size);
- cuda_assert(cuMemcpyHtoD(mem, host, size));
-}
-
-void CUDADevice::global_alloc(device_memory &mem)
-{
- if (mem.is_resident(this)) {
- generic_alloc(mem);
- generic_copy_to(mem);
- }
-
- const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
-}
-
-void CUDADevice::global_free(device_memory &mem)
-{
- if (mem.is_resident(this) && mem.device_pointer) {
- generic_free(mem);
- }
-}
-
-void CUDADevice::tex_alloc(device_texture &mem)
-{
- CUDAContextScope scope(this);
-
- /* General variables. */
- string bind_name = mem.name;
- size_t dsize = datatype_size(mem.data_type);
- size_t size = mem.memory_size();
-
- CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
- switch (mem.info.extension) {
- case EXTENSION_REPEAT:
- address_mode = CU_TR_ADDRESS_MODE_WRAP;
- break;
- case EXTENSION_EXTEND:
- address_mode = CU_TR_ADDRESS_MODE_CLAMP;
- break;
- case EXTENSION_CLIP:
- address_mode = CU_TR_ADDRESS_MODE_BORDER;
- break;
- default:
- assert(0);
- break;
- }
-
- CUfilter_mode filter_mode;
- if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
- filter_mode = CU_TR_FILTER_MODE_POINT;
- }
- else {
- filter_mode = CU_TR_FILTER_MODE_LINEAR;
- }
-
- /* Image texture storage. */
- CUarray_format_enum format;
- switch (mem.data_type) {
- case TYPE_UCHAR:
- format = CU_AD_FORMAT_UNSIGNED_INT8;
- break;
- case TYPE_UINT16:
- format = CU_AD_FORMAT_UNSIGNED_INT16;
- break;
- case TYPE_UINT:
- format = CU_AD_FORMAT_UNSIGNED_INT32;
- break;
- case TYPE_INT:
- format = CU_AD_FORMAT_SIGNED_INT32;
- break;
- case TYPE_FLOAT:
- format = CU_AD_FORMAT_FLOAT;
- break;
- case TYPE_HALF:
- format = CU_AD_FORMAT_HALF;
- break;
- default:
- assert(0);
- return;
- }
-
- CUDAMem *cmem = NULL;
- CUarray array_3d = NULL;
- size_t src_pitch = mem.data_width * dsize * mem.data_elements;
- size_t dst_pitch = src_pitch;
-
- if (!mem.is_resident(this)) {
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
- cmem->texobject = 0;
-
- if (mem.data_depth > 1) {
- array_3d = (CUarray)mem.device_pointer;
- cmem->array = array_3d;
- }
- else if (mem.data_height > 0) {
- dst_pitch = align_up(src_pitch, pitch_alignment);
- }
- }
- else if (mem.data_depth > 1) {
- /* 3D texture using array, there is no API for linear memory. */
- CUDA_ARRAY3D_DESCRIPTOR desc;
-
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Depth = mem.data_depth;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
- desc.Flags = 0;
-
- VLOG(1) << "Array 3D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- cuda_assert(cuArray3DCreate(&array_3d, &desc));
-
- if (!array_3d) {
- return;
- }
-
- CUDA_MEMCPY3D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = array_3d;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
- param.Depth = mem.data_depth;
-
- cuda_assert(cuMemcpy3D(&param));
-
- mem.device_pointer = (device_ptr)array_3d;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
- cmem->texobject = 0;
- cmem->array = array_3d;
- }
- else if (mem.data_height > 0) {
- /* 2D texture, using pitch aligned linear memory. */
- dst_pitch = align_up(src_pitch, pitch_alignment);
- size_t dst_size = dst_pitch * mem.data_height;
-
- cmem = generic_alloc(mem, dst_size - mem.memory_size());
- if (!cmem) {
- return;
- }
-
- CUDA_MEMCPY2D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
- param.dstDevice = mem.device_pointer;
- param.dstPitch = dst_pitch;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
-
- cuda_assert(cuMemcpy2DUnaligned(&param));
- }
- else {
- /* 1D texture, using linear memory. */
- cmem = generic_alloc(mem);
- if (!cmem) {
- return;
- }
-
- cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
- }
-
- /* Resize once. */
- const uint slot = mem.slot;
- if (slot >= texture_info.size()) {
- /* Allocate some slots in advance, to reduce the
- * number of re-allocations. */
- texture_info.resize(slot + 128);
- }
-
- /* Set mapping and tag that we need to (re-)upload to the device. */
- texture_info[slot] = mem.info;
- need_texture_info = true;
-
- if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
- mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
- /* Kepler+, bindless textures. */
- CUDA_RESOURCE_DESC resDesc;
- memset(&resDesc, 0, sizeof(resDesc));
-
- if (array_3d) {
- resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
- resDesc.res.array.hArray = array_3d;
- resDesc.flags = 0;
- }
- else if (mem.data_height > 0) {
- resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
- resDesc.res.pitch2D.devPtr = mem.device_pointer;
- resDesc.res.pitch2D.format = format;
- resDesc.res.pitch2D.numChannels = mem.data_elements;
- resDesc.res.pitch2D.height = mem.data_height;
- resDesc.res.pitch2D.width = mem.data_width;
- resDesc.res.pitch2D.pitchInBytes = dst_pitch;
- }
- else {
- resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
- resDesc.res.linear.devPtr = mem.device_pointer;
- resDesc.res.linear.format = format;
- resDesc.res.linear.numChannels = mem.data_elements;
- resDesc.res.linear.sizeInBytes = mem.device_size;
- }
-
- CUDA_TEXTURE_DESC texDesc;
- memset(&texDesc, 0, sizeof(texDesc));
- texDesc.addressMode[0] = address_mode;
- texDesc.addressMode[1] = address_mode;
- texDesc.addressMode[2] = address_mode;
- texDesc.filterMode = filter_mode;
- texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
-
- cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
-
- texture_info[slot].data = (uint64_t)cmem->texobject;
- }
- else {
- texture_info[slot].data = (uint64_t)mem.device_pointer;
- }
-}
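-
-/* Pitch example for the 2D case above: a float4 texture 125 texels wide has
- * src_pitch = 125 * 4 * 4 = 2000 bytes, which with a pitch_alignment of,
- * say, 32 is padded to dst_pitch = align_up(2000, 32) = 2016 bytes per row. */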
-
-void CUDADevice::tex_free(device_texture &mem)
-{
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
- thread_scoped_lock lock(cuda_mem_map_mutex);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- if (cmem.texobject) {
- /* Free bindless texture. */
- cuTexObjectDestroy(cmem.texobject);
- }
-
- if (!mem.is_resident(this)) {
- /* Do not free memory here, since it was allocated on a different device. */
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- else if (cmem.array) {
- /* Free array. */
- cuArrayDestroy(cmem.array);
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- else {
- lock.unlock();
- generic_free(mem);
- }
- }
-}
-
-# define CUDA_GET_BLOCKSIZE(func, w, h) \
- int threads_per_block; \
- cuda_assert( \
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int threads = (int)sqrt((float)threads_per_block); \
- int xblocks = ((w) + threads - 1) / threads; \
- int yblocks = ((h) + threads - 1) / threads;
-
-# define CUDA_LAUNCH_KERNEL(func, args) \
- cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
-
-/* Similar to the above, but for 1-dimensional blocks. */
-# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
- int threads_per_block; \
- cuda_assert( \
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
- int yblocks = h;
-
-# define CUDA_LAUNCH_KERNEL_1D(func, args) \
- cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
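-
-/* For example, CUDA_GET_BLOCKSIZE_1D(func, w * h, num_shifts) below computes
- * xblocks = divide_up(w * h, threads_per_block) and yblocks = num_shifts,
- * after which CUDA_LAUNCH_KERNEL_1D launches an (xblocks, yblocks, 1) grid
- * of (threads_per_block, 1, 1) blocks. */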
-
-bool CUDADevice::denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
- int frame_offset = 0;
-
- if (have_error())
- return false;
-
- CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr scale_ptr = 0;
-
- cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
- cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
-
- {
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
-
- void *calc_difference_args[] = {&guide_ptr,
- &variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &channel_offset,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {
- &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *update_output_args[] = {&blurDifference,
- &image_ptr,
- &out_ptr,
- &weightAccum,
- &w,
- &h,
- &stride,
- &pass_stride,
- &channel_offset,
- &r,
- &f};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
- }
-
- {
- CUfunction cuNLMNormalize;
- cuda_assert(
- cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
- cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
- void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
- CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
- CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
- cuda_assert(cuCtxSynchronize());
- }
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_construct_transform(DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterConstructTransform;
- cuda_assert(cuModuleGetFunction(
- &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
- CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
-
- void *args[] = {&task->buffer.mem.device_pointer,
- &task->tile_info_mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->filter_area,
- &task->rect,
- &task->radius,
- &task->pca_threshold,
- &task->buffer.pass_stride,
- &task->buffer.frame_stride,
- &task->buffer.use_time};
- CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int r = task->radius;
- int f = 4;
- float a = 1.0f;
- float k_2 = task->nlm_k_2;
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
-
- if (have_error())
- return false;
-
- CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
-
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(
- cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
- task->reconstruction_state.source_w * task->reconstruction_state.source_h,
- num_shifts);
-
- void *calc_difference_args[] = {&color_ptr,
- &color_variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &pass_stride,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *construct_gramian_args[] = {&t,
- &blurDifference,
- &task->buffer.mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->reconstruction_state.filter_window,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &f,
- &frame_offset,
- &task->buffer.use_time};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
-{
- CUfunction cuFinalize;
- cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
- cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
- void *finalize_args[] = {&output_ptr,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->filter_area,
- &task->reconstruction_state.buffer_params.x,
- &task->render_buffer.samples};
- CUDA_GET_BLOCKSIZE(
- cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
- CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterCombineHalves;
- cuda_assert(cuModuleGetFunction(
- &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
- CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDivideShadow;
- cuda_assert(cuModuleGetFunction(
- &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &a_ptr,
- &b_ptr,
- &sample_variance_ptr,
- &sv_variance_ptr,
- &buffer_variance_ptr,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterGetFeature;
- cuda_assert(
- cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &mean_offset,
- &variance_offset,
- &mean_ptr,
- &variance_ptr,
- &scale,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterWriteFeature;
- cuda_assert(cuModuleGetFunction(
- &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
- void *args[] = {&task->render_buffer.samples,
- &task->reconstruction_state.buffer_params,
- &task->filter_area,
- &from_ptr,
- &buffer_ptr,
- &out_offset,
- &task->rect};
- CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDetectOutliers;
- cuda_assert(cuModuleGetFunction(
- &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {
- &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride};
-
- CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
-{
- denoising.functions.construct_transform = function_bind(
- &CUDADevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = rtile.sample;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(rtile);
-}
-
-void CUDADevice::adaptive_sampling_filter(uint filter_sample,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream)
-{
- const int num_threads_per_block = functions.adaptive_num_threads_per_block;
-
- /* These are a series of tiny kernels because there is no grid-wide
- * synchronization from within a kernel, so multiple kernel launches are
- * needed instead. */
- uint total_work_size = wtile->h * wtile->w;
- void *args2[] = {&d_wtile, &filter_sample, &total_work_size};
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_stopping,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
- total_work_size = wtile->h;
- num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_filter_x,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
- total_work_size = wtile->w;
- num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_filter_y,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
-}
-
-void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream)
-{
- const int num_threads_per_block = functions.adaptive_num_threads_per_block;
- uint total_work_size = wtile->h * wtile->w;
-
- void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size};
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args,
- 0));
-}
-
-void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
-{
- scoped_timer timer(&rtile.buffers->render_time);
-
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
- CUfunction cuRender;
-
- /* Get kernel function. */
- if (rtile.task == RenderTile::BAKE) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
- }
- else if (task.integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
- }
-
- if (have_error()) {
- return;
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
-
- /* Allocate work tile. */
- work_tiles.alloc(1);
-
- WorkTile *wtile = work_tiles.data();
- wtile->x = rtile.x;
- wtile->y = rtile.y;
- wtile->w = rtile.w;
- wtile->h = rtile.h;
- wtile->offset = rtile.offset;
- wtile->stride = rtile.stride;
- wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
-
- /* Prepare work size. More step samples render faster, but for now we
- * remain conservative for GPUs connected to a display to avoid driver
- * timeouts and display freezing. */
- int min_blocks, num_threads_per_block;
- cuda_assert(
- cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
- if (!info.display_device) {
- min_blocks *= 8;
- }
-
- uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
- /* Render all samples. */
- int start_sample = rtile.start_sample;
- int end_sample = rtile.start_sample + rtile.num_samples;
-
- for (int sample = start_sample; sample < end_sample;) {
- /* Set up and copy work tile to device. */
- wtile->start_sample = sample;
- wtile->num_samples = step_samples;
- if (task.adaptive_sampling.use) {
- wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
- }
- wtile->num_samples = min(wtile->num_samples, end_sample - sample);
- work_tiles.copy_to_device();
-
- CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
- uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
- /* Launch kernel. */
- void *args[] = {&d_work_tiles, &total_work_size};
-
- cuda_assert(
- cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
- uint filter_sample = sample + wtile->num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
- }
-
- cuda_assert(cuCtxSynchronize());
-
- /* Update progress. */
- sample += wtile->num_samples;
- rtile.sample = sample;
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- /* Finalize adaptive sampling. */
- if (task.adaptive_sampling.use) {
- CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
- adaptive_sampling_post(rtile, wtile, d_work_tiles);
- cuda_assert(cuCtxSynchronize());
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
- }
-}
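-
-/* Step-size illustration with hypothetical occupancy numbers: if the kernel
- * reports min_blocks = 40 and num_threads_per_block = 256 on a non-display
- * device (min_blocks scaled to 320), a 256x256 tile gets step_samples =
- * divide_up(320 * 256, 256 * 256) = 2 samples per launch. */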
-
-void CUDADevice::film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half)
-{
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilmConvert;
- CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
- CUdeviceptr d_buffer = (CUdeviceptr)buffer;
-
- /* Get kernel function. */
- if (rgba_half) {
- cuda_assert(
- cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
- }
-
- float sample_scale = 1.0f / (task.sample + 1);
-
- /* Pass in parameters. */
- void *args[] = {&d_rgba,
- &d_buffer,
- &sample_scale,
- &task.x,
- &task.y,
- &task.w,
- &task.h,
- &task.offset,
- &task.stride};
-
- /* Launch kernel. */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
-
- int xthreads = (int)sqrt(threads_per_block);
- int ythreads = (int)sqrt(threads_per_block);
- int xblocks = (task.w + xthreads - 1) / xthreads;
- int yblocks = (task.h + ythreads - 1) / ythreads;
-
- cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(cuFilmConvert,
- xblocks,
- yblocks,
- 1, /* blocks */
- xthreads,
- ythreads,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
-
- cuda_assert(cuCtxSynchronize());
-}
-
-void CUDADevice::shader(DeviceTask &task)
-{
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuShader;
- CUdeviceptr d_input = (CUdeviceptr)task.shader_input;
- CUdeviceptr d_output = (CUdeviceptr)task.shader_output;
-
- /* Get kernel function. */
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
- }
-
- /* Do tasks in smaller chunks, so we can cancel them. */
- const int shader_chunk_size = 65536;
- const int start = task.shader_x;
- const int end = task.shader_x + task.shader_w;
- int offset = task.offset;
-
- bool canceled = false;
- for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
- for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
- int shader_w = min(shader_chunk_size, end - shader_x);
-
- /* Pass in parameters. */
- void *args[8];
- int arg = 0;
- args[arg++] = &d_input;
- args[arg++] = &d_output;
- args[arg++] = &task.shader_eval_type;
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- args[arg++] = &task.shader_filter;
- }
- args[arg++] = &shader_x;
- args[arg++] = &shader_w;
- args[arg++] = &offset;
- args[arg++] = &sample;
-
- /* Launch kernel. */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
-
- int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuLaunchKernel(cuShader,
- xblocks,
- 1,
- 1, /* blocks */
- threads_per_block,
- 1,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- cuda_assert(cuCtxSynchronize());
-
- if (task.get_cancel()) {
- canceled = true;
- break;
- }
- }
-
- task.update_progress(NULL);
- }
-}
-
-CUdeviceptr CUDADevice::map_pixels(device_ptr mem)
-{
- if (!background) {
- PixelMem pmem = pixel_mem_map[mem];
- CUdeviceptr buffer;
-
- size_t bytes;
- cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
- cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
-
- return buffer;
- }
-
- return (CUdeviceptr)mem;
-}
-
-void CUDADevice::unmap_pixels(device_ptr mem)
-{
- if (!background) {
- PixelMem pmem = pixel_mem_map[mem];
-
- cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
- }
-}
-
-void CUDADevice::pixels_alloc(device_memory &mem)
-{
- PixelMem pmem;
-
- pmem.w = mem.data_width;
- pmem.h = mem.data_height;
-
- CUDAContextScope scope(this);
-
- glGenBuffers(1, &pmem.cuPBO);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- if (mem.data_type == TYPE_HALF)
- glBufferData(
- GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
- else
- glBufferData(
- GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- glActiveTexture(GL_TEXTURE0);
- glGenTextures(1, &pmem.cuTexId);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if (mem.data_type == TYPE_HALF)
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
- else
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
- glBindTexture(GL_TEXTURE_2D, 0);
-
- CUresult result = cuGraphicsGLRegisterBuffer(
- &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
-
- if (result == CUDA_SUCCESS) {
- mem.device_pointer = pmem.cuTexId;
- pixel_mem_map[mem.device_pointer] = pmem;
-
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
-
- return;
- }
- else {
- /* failed to register buffer, fallback to no interop */
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- background = true;
- }
-}
-
-void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h)
-{
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
- size_t offset = sizeof(uchar) * 4 * y * w;
- memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
- glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-}
-
-void CUDADevice::pixels_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
- mem.device_pointer = 0;
-
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
-}
-
-void CUDADevice::draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params)
-{
- assert(mem.type == MEM_PIXELS);
-
- if (!background) {
- const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
- float *vpointer;
-
- CUDAContextScope scope(this);
-
-    /* for multi devices, this assumes the inefficient approach of allocating
-     * all pixels on the device even though we only render to a subset */
- size_t offset = 4 * y * w;
-
- if (mem.data_type == TYPE_HALF)
- offset *= sizeof(GLhalf);
- else
- offset *= sizeof(uint8_t);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- glActiveTexture(GL_TEXTURE0);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if (mem.data_type == TYPE_HALF) {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
- }
- else {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
- }
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- if (transparent) {
- glEnable(GL_BLEND);
- glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
- }
-
- GLint shader_program;
- if (use_fallback_shader) {
- if (!bind_fallback_display_space_shader(dw, dh)) {
- return;
- }
- shader_program = fallback_shader_program;
- }
- else {
- draw_params.bind_display_space_shader_cb();
- glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
- }
-
- if (!vertex_buffer) {
- glGenBuffers(1, &vertex_buffer);
- }
-
- glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents -
- * avoids stalling if buffer is still waiting in queue to be rendered */
- glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
- vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
- if (vpointer) {
- /* texture coordinate - vertex pair */
- vpointer[0] = 0.0f;
- vpointer[1] = 0.0f;
- vpointer[2] = dx;
- vpointer[3] = dy;
-
- vpointer[4] = (float)w / (float)pmem.w;
- vpointer[5] = 0.0f;
- vpointer[6] = (float)width + dx;
- vpointer[7] = dy;
-
- vpointer[8] = (float)w / (float)pmem.w;
- vpointer[9] = (float)h / (float)pmem.h;
- vpointer[10] = (float)width + dx;
- vpointer[11] = (float)height + dy;
-
- vpointer[12] = 0.0f;
- vpointer[13] = (float)h / (float)pmem.h;
- vpointer[14] = dx;
- vpointer[15] = (float)height + dy;
-
- glUnmapBuffer(GL_ARRAY_BUFFER);
- }
-
- GLuint vertex_array_object;
- GLuint position_attribute, texcoord_attribute;
-
- glGenVertexArrays(1, &vertex_array_object);
- glBindVertexArray(vertex_array_object);
-
- texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
- position_attribute = glGetAttribLocation(shader_program, "pos");
-
- glEnableVertexAttribArray(texcoord_attribute);
- glEnableVertexAttribArray(position_attribute);
-
- glVertexAttribPointer(
- texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
- glVertexAttribPointer(position_attribute,
- 2,
- GL_FLOAT,
- GL_FALSE,
- 4 * sizeof(float),
- (const GLvoid *)(sizeof(float) * 2));
-
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
- if (use_fallback_shader) {
- glUseProgram(0);
- }
- else {
- draw_params.unbind_display_space_shader_cb();
- }
-
- if (transparent) {
- glDisable(GL_BLEND);
- }
-
- glBindTexture(GL_TEXTURE_2D, 0);
-
- return;
- }
-
- Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
-}
-
-void CUDADevice::thread_run(DeviceTask &task)
-{
- CUDAContextScope scope(this);
-
- if (task.type == DeviceTask::RENDER) {
- DeviceRequestedFeatures requested_features;
- if (use_split_kernel()) {
- if (split_kernel == NULL) {
- split_kernel = new CUDASplitKernel(this);
- split_kernel->load_kernels(requested_features);
- }
- }
-
- device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
- /* keep rendering tiles until done */
- RenderTile tile;
- DenoisingTask denoising(this, task);
-
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- if (use_split_kernel()) {
- device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(task, tile, void_buffer, void_buffer);
- }
- else {
- render(task, tile, work_tiles);
- }
- }
- else if (tile.task == RenderTile::BAKE) {
- render(task, tile, work_tiles);
- }
- else if (tile.task == RenderTile::DENOISE) {
- tile.sample = tile.start_sample + tile.num_samples;
-
- denoise(tile, denoising);
-
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- work_tiles.free();
- }
- else if (task.type == DeviceTask::SHADER) {
- shader(task);
-
- cuda_assert(cuCtxSynchronize());
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- DenoisingTask denoising(this, task);
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-}
-
-void CUDADevice::task_add(DeviceTask &task)
-{
- CUDAContextScope scope(this);
-
- /* Load texture info. */
- load_texture_info();
-
- /* Synchronize all memory copies before executing task. */
- cuda_assert(cuCtxSynchronize());
-
- if (task.type == DeviceTask::FILM_CONVERT) {
- /* must be done in main thread due to opengl access */
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- }
- else {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
-}
-
-void CUDADevice::task_wait()
-{
- task_pool.wait();
-}
-
-void CUDADevice::task_cancel()
-{
- task_pool.cancel();
-}
-
-/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
- * now that the definition of that class is complete
- */
-# undef cuda_assert
-# define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
- if (result != CUDA_SUCCESS) { \
- const char *name = cuewErrorString(result); \
- device->set_error( \
- string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
- } \
- } \
- (void)0
-
-/* CUDA context scope. */
-
-CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
-{
- cuda_assert(cuCtxPushCurrent(device->cuContext));
-}
-
-CUDAContextScope::~CUDAContextScope()
-{
- cuda_assert(cuCtxPopCurrent(NULL));
-}
-
-/* split kernel */
-
-class CUDASplitKernelFunction : public SplitKernelFunction {
- CUDADevice *device;
- CUfunction func;
-
- public:
- CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
- {
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
- {
- return enqueue(dim, NULL);
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, void *args[])
- {
- if (device->have_error())
- return false;
-
- CUDAContextScope scope(device);
-
- /* we ignore dim.local_size for now, as this is faster */
- int threads_per_block;
- cuda_assert(
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
-
- int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
- threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(func,
- xblocks,
- 1,
- 1, /* blocks */
- threads_per_block,
- 1,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- return !device->have_error();
- }
-};
-
-CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
- device_memory & /*data*/,
- size_t num_threads)
-{
- CUDAContextScope scope(device);
-
- device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
- size_buffer.alloc(1);
- size_buffer.zero_to_device();
-
- uint threads = num_threads;
- CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer;
-
- struct args_t {
- uint *num_threads;
- CUdeviceptr *size;
- };
-
- args_t args = {&threads, &d_size};
-
- CUfunction state_buffer_size;
- cuda_assert(
- cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
-
- cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
-
- size_buffer.copy_from_device(0, 1, 1);
- size_t size = size_buffer[0];
- size_buffer.free();
-
- return size;
-}
-
-bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory & /*kernel_globals*/,
- device_memory & /*kernel_data*/,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs)
-{
- CUDAContextScope scope(device);
-
- CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer;
- CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer;
- CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer;
- CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer;
- CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer;
-
- CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer;
-
- int end_sample = rtile.start_sample + rtile.num_samples;
- int queue_size = dim.global_size[0] * dim.global_size[1];
-
- struct args_t {
- CUdeviceptr *split_data_buffer;
- int *num_elements;
- CUdeviceptr *ray_state;
- int *start_sample;
- int *end_sample;
- int *sx;
- int *sy;
- int *sw;
- int *sh;
- int *offset;
- int *stride;
- CUdeviceptr *queue_index;
- int *queuesize;
- CUdeviceptr *use_queues_flag;
- CUdeviceptr *work_pool_wgs;
- int *num_samples;
- CUdeviceptr *buffer;
- };
-
- args_t args = {&d_split_data,
- &num_global_elements,
- &d_ray_state,
- &rtile.start_sample,
- &end_sample,
- &rtile.x,
- &rtile.y,
- &rtile.w,
- &rtile.h,
- &rtile.offset,
- &rtile.stride,
- &d_queue_index,
- &queue_size,
- &d_use_queues_flag,
- &d_work_pool_wgs,
- &rtile.num_samples,
- &d_buffer};
-
- CUfunction data_init;
- cuda_assert(
- cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
- if (device->have_error()) {
- return false;
- }
-
- CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
-
- return !device->have_error();
-}
-
-SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &)
-{
- const CUDAContextScope scope(device);
-
- CUfunction func;
- const CUresult result = cuModuleGetFunction(
- &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data());
- if (result != CUDA_SUCCESS) {
- device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)",
- kernel_name.data(),
- cuewErrorString(result)));
- return NULL;
- }
-
- return new CUDASplitKernelFunction(device, func);
-}
-
-int2 CUDASplitKernel::split_kernel_local_size()
-{
- return make_int2(32, 1);
-}
-
-int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask & /*task*/)
-{
- CUDAContextScope scope(device);
- size_t free;
- size_t total;
-
- cuda_assert(cuMemGetInfo(&free, &total));
-
- VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
- << " bytes. (" << string_human_readable_size(free) << ").";
-
- size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
- size_t side = round_down((int)sqrt(num_elements), 32);
- int2 global_size = make_int2(side, round_down(num_elements / side, 16));
- VLOG(1) << "Global size: " << global_size << ".";
- return global_size;
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
new file mode 100644
index 00000000000..37fab8f8293
--- /dev/null
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -0,0 +1,1370 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include <climits>
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+# include "device/cuda/device_impl.h"
+
+# include "render/buffers.h"
+
+# include "util/util_debug.h"
+# include "util/util_foreach.h"
+# include "util/util_logging.h"
+# include "util/util_map.h"
+# include "util/util_md5.h"
+# include "util/util_opengl.h"
+# include "util/util_path.h"
+# include "util/util_string.h"
+# include "util/util_system.h"
+# include "util/util_time.h"
+# include "util/util_types.h"
+# include "util/util_windows.h"
+
+CCL_NAMESPACE_BEGIN
+
+bool CUDADevice::have_precompiled_kernels()
+{
+ string cubins_path = path_get("lib");
+ return path_exists(cubins_path);
+}
+
+bool CUDADevice::show_samples() const
+{
+ /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
+ return true;
+}
+
+BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
+{
+ return BVH_LAYOUT_BVH2;
+}
+
+void CUDADevice::set_error(const string &error)
+{
+ Device::set_error(error);
+
+ if (first_error) {
+ fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
+ fprintf(stderr,
+ "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
+ first_error = false;
+ }
+}
+
+CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+ first_error = true;
+
+ cuDevId = info.num;
+ cuDevice = 0;
+ cuContext = 0;
+
+ cuModule = 0;
+
+ need_texture_info = false;
+
+ device_texture_headroom = 0;
+ device_working_headroom = 0;
+ move_texture_to_host = false;
+ map_host_limit = 0;
+ map_host_used = 0;
+ can_map_host = 0;
+ pitch_alignment = 0;
+
+ /* Initialize CUDA. */
+ CUresult result = cuInit(0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
+ return;
+ }
+
+ /* Setup device and context. */
+ result = cuDeviceGet(&cuDevice, cuDevId);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
+ cuewErrorString(result)));
+ return;
+ }
+
+ /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+ * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+ * so we can predict which memory to map to host. */
+ cuda_assert(
+ cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+ cuda_assert(cuDeviceGetAttribute(
+ &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
+
+ unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+ if (can_map_host) {
+ ctx_flags |= CU_CTX_MAP_HOST;
+ init_host_memory();
+ }
+
+ /* Create context. */
+ result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
+ return;
+ }
+
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+ cuDevArchitecture = major * 100 + minor * 10;
+
+ /* Pop context set by cuCtxCreate. */
+ cuCtxPopCurrent(NULL);
+}
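+
+/* As a worked example of the encoding above: a compute capability 6.1 device
+ * yields cuDevArchitecture = 6 * 100 + 1 * 10 = 610. */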
+
+CUDADevice::~CUDADevice()
+{
+ texture_info.free();
+
+ cuda_assert(cuCtxDestroy(cuContext));
+}
+
+bool CUDADevice::support_device(const uint /*kernel_features*/)
+{
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* We only support sm_30 and above */
+ if (major < 3) {
+ set_error(string_printf(
+ "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
+ return false;
+ }
+
+ return true;
+}
+
+bool CUDADevice::check_peer_access(Device *peer_device)
+{
+ if (peer_device == this) {
+ return false;
+ }
+ if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
+ return false;
+ }
+
+ CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
+
+ int can_access = 0;
+ cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+  /* Ensure array access over the link is possible as well (for 3D textures). */
+ cuda_assert(cuDeviceGetP2PAttribute(&can_access,
+ CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
+ cuDevice,
+ peer_device_cuda->cuDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+  /* Enable peer access in both directions. */
+ {
+ const CUDAContextScope scope(this);
+ CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
+ cuewErrorString(result)));
+ return false;
+ }
+ }
+ {
+ const CUDAContextScope scope(peer_device_cuda);
+ CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
+ cuewErrorString(result)));
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool CUDADevice::use_adaptive_compilation()
+{
+ return DebugFlags().cuda.adaptive_compile;
+}
+
+/* Common NVCC flags which stay the same regardless of shading model and
+ * kernel sources MD5, and depend only on the compiler or compilation settings.
+ */
+string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
+{
+ const int machine = system_cpu_bits();
+ const string source_path = path_get("source");
+ const string include_path = source_path;
+ string cflags = string_printf(
+ "-m%d "
+ "--ptxas-options=\"-v\" "
+ "--use_fast_math "
+ "-DNVCC "
+ "-I\"%s\"",
+ machine,
+ include_path.c_str());
+ if (use_adaptive_compilation()) {
+ cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
+ }
+ const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
+ if (extra_cflags) {
+ cflags += string(" ") + string(extra_cflags);
+ }
+
+# ifdef WITH_NANOVDB
+ cflags += " -DWITH_NANOVDB";
+# endif
+
+ return cflags;
+}
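+
+/* For illustration: on a 64-bit build with adaptive compilation enabled, the
+ * flags assembled above amount to something like
+ *
+ *   -m64 --ptxas-options="-v" --use_fast_math -DNVCC -I"<source>"
+ *       -D__KERNEL_FEATURES__=<features>
+ *
+ * plus " -DWITH_NANOVDB" when NanoVDB support is compiled in, and whatever is
+ * set in the CYCLES_CUDA_EXTRA_CFLAGS environment variable. */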
+
+string CUDADevice::compile_kernel(const uint kernel_features,
+ const char *name,
+ const char *base,
+ bool force_ptx)
+{
+ /* Compute kernel name. */
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* Attempt to use kernel provided with Blender. */
+ if (!use_adaptive_compilation()) {
+ if (!force_ptx) {
+ const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return cubin;
+ }
+ }
+
+ /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
+ int ptx_major = major, ptx_minor = minor;
+ while (ptx_major >= 3) {
+ const string ptx = path_get(
+ string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
+ if (path_exists(ptx)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return ptx;
+ }
+
+ if (ptx_minor > 0) {
+ ptx_minor--;
+ }
+ else {
+ ptx_major--;
+ ptx_minor = 9;
+ }
+ }
+ }
+
+ /* Try to use locally compiled kernel. */
+ string source_path = path_get("source");
+ const string source_md5 = path_files_md5_hash(source_path);
+
+  /* We include cflags in the MD5 hash, so that changing the CUDA toolkit or
+   * other compiler command line arguments ensures the cubin gets rebuilt.
+   */
+ string common_cflags = compile_kernel_get_common_cflags(kernel_features);
+ const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
+
+ const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
+ const char *const kernel_arch = force_ptx ? "compute" : "sm";
+ const string cubin_file = string_printf(
+ "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
+ const string cubin = path_cache_get(path_join("kernels", cubin_file));
+ VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using locally compiled kernel.";
+ return cubin;
+ }
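+
+  /* For example, a kernel named "kernel" built for sm_75 resolves to a cache
+   * path of the form (hash shortened for illustration):
+   *
+   *   <cache>/kernels/cycles_kernel_sm_75_<md5>.cubin
+   *
+   * while a force_ptx build would look for cycles_kernel_compute_75_<md5>.ptx. */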
+
+# ifdef _WIN32
+ if (!use_adaptive_compilation() && have_precompiled_kernels()) {
+ if (major < 3) {
+ set_error(
+ string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
+ "Your GPU is not supported.",
+ major,
+ minor));
+ }
+ else {
+ set_error(
+ string_printf("CUDA binary kernel for this graphics card compute "
+ "capability (%d.%d) not found.",
+ major,
+ minor));
+ }
+ return string();
+ }
+# endif
+
+ /* Compile. */
+ const char *const nvcc = cuewCompilerPath();
+ if (nvcc == NULL) {
+ set_error(
+ "CUDA nvcc compiler not found. "
+ "Install CUDA toolkit in default location.");
+ return string();
+ }
+
+ const int nvcc_cuda_version = cuewCompilerVersion();
+ VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
+ if (nvcc_cuda_version < 101) {
+ printf(
+ "Unsupported CUDA version %d.%d detected, "
+ "you need CUDA 10.1 or newer.\n",
+ nvcc_cuda_version / 10,
+ nvcc_cuda_version % 10);
+ return string();
+ }
+ else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 ||
+ nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) {
+ printf(
+ "CUDA version %d.%d detected, build may succeed but only "
+ "CUDA 10.1 to 11.4 are officially supported.\n",
+ nvcc_cuda_version / 10,
+ nvcc_cuda_version % 10);
+ }
+
+ double starttime = time_dt();
+
+ path_create_directories(cubin);
+
+ source_path = path_join(path_join(source_path, "kernel"),
+ path_join("device", path_join(base, string_printf("%s.cu", name))));
+
+ string command = string_printf(
+ "\"%s\" "
+ "-arch=%s_%d%d "
+ "--%s \"%s\" "
+ "-o \"%s\" "
+ "%s",
+ nvcc,
+ kernel_arch,
+ major,
+ minor,
+ kernel_ext,
+ source_path.c_str(),
+ cubin.c_str(),
+ common_cflags.c_str());
+
+ printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
+
+# ifdef _WIN32
+ command = "call " + command;
+# endif
+ if (system(command.c_str()) != 0) {
+ set_error(
+ "Failed to execute compilation command, "
+ "see console for details.");
+ return string();
+ }
+
+ /* Verify if compilation succeeded */
+ if (!path_exists(cubin)) {
+ set_error(
+ "CUDA kernel compilation failed, "
+ "see console for details.");
+ return string();
+ }
+
+ printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
+
+ return cubin;
+}
+
+bool CUDADevice::load_kernels(const uint kernel_features)
+{
+ /* TODO(sergey): Support kernels re-load for CUDA devices.
+ *
+ * Currently re-loading kernel will invalidate memory pointers,
+ * causing problems in cuCtxSynchronize.
+ */
+ if (cuModule) {
+ VLOG(1) << "Skipping kernel reload, not currently supported.";
+ return true;
+ }
+
+ /* check if cuda init succeeded */
+ if (cuContext == 0)
+ return false;
+
+ /* check if GPU is supported */
+ if (!support_device(kernel_features))
+ return false;
+
+ /* get kernel */
+ const char *kernel_name = "kernel";
+ string cubin = compile_kernel(kernel_features, kernel_name);
+ if (cubin.empty())
+ return false;
+
+ /* open module */
+ CUDAContextScope scope(this);
+
+ string cubin_data;
+ CUresult result;
+
+ if (path_read_text(cubin, cubin_data))
+ result = cuModuleLoadData(&cuModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if (result != CUDA_SUCCESS)
+ set_error(string_printf(
+ "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
+
+ if (result == CUDA_SUCCESS) {
+ kernels.load(this);
+ reserve_local_memory(kernel_features);
+ }
+
+ return (result == CUDA_SUCCESS);
+}
+
+void CUDADevice::reserve_local_memory(const uint /* kernel_features */)
+{
+ /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
+ * needed for kernel launches, so that we can reliably figure out when
+ * to allocate scene data in mapped host memory. */
+ size_t total = 0, free_before = 0, free_after = 0;
+
+ {
+ CUDAContextScope scope(this);
+ cuMemGetInfo(&free_before, &total);
+ }
+
+ {
+ /* Use the biggest kernel for estimation. */
+ const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE;
+
+    /* Launch the kernel; using just 1 block appears sufficient to reserve memory
+     * for all multiprocessors. It would still be good to do this in parallel for
+     * the multi-GPU case, to make it faster. */
+ CUDADeviceQueue queue(this);
+
+ void *d_path_index = nullptr;
+ void *d_render_buffer = nullptr;
+ int d_work_size = 0;
+ void *args[] = {&d_path_index, &d_render_buffer, &d_work_size};
+
+ queue.init_execution();
+ queue.enqueue(test_kernel, 1, args);
+ queue.synchronize();
+ }
+
+ {
+ CUDAContextScope scope(this);
+ cuMemGetInfo(&free_after, &total);
+ }
+
+ VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
+ << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
+
+# if 0
+ /* For testing mapped host memory, fill up device memory. */
+ const size_t keep_mb = 1024;
+
+ while (free_after > keep_mb * 1024 * 1024LL) {
+ CUdeviceptr tmp;
+ cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+ cuMemGetInfo(&free_after, &total);
+ }
+# endif
+}
+
+void CUDADevice::init_host_memory()
+{
+ /* Limit amount of host mapped memory, because allocating too much can
+ * cause system instability. Leave at least half or 4 GB of system
+ * memory free, whichever is smaller. */
+ size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+ size_t system_ram = system_physical_ram();
+
+ if (system_ram > 0) {
+ if (system_ram / 2 > default_limit) {
+ map_host_limit = system_ram - default_limit;
+ }
+ else {
+ map_host_limit = system_ram / 2;
+ }
+ }
+ else {
+ VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+ map_host_limit = 0;
+ }
+
+  /* Amount of device memory to keep free after texture memory and working
+   * memory allocations respectively. We set the working memory headroom
+   * lower so that some space is left after all texture memory allocations. */
+ device_working_headroom = 32 * 1024 * 1024LL; // 32MB
+ device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
+
+ VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+ << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
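+
+/* Worked examples of the limit above: with 16 GB of system RAM, half (8 GB)
+ * exceeds the 4 GB default, so map_host_limit = 16 GB - 4 GB = 12 GB; with
+ * 6 GB of RAM, half (3 GB) is below 4 GB, so map_host_limit = 3 GB. */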
+
+void CUDADevice::load_texture_info()
+{
+ if (need_texture_info) {
+ /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
+ * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
+ need_texture_info = false;
+ texture_info.copy_to_device();
+ }
+}
+
+void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
+{
+ /* Break out of recursive call, which can happen when moving memory on a multi device. */
+ static bool any_device_moving_textures_to_host = false;
+ if (any_device_moving_textures_to_host) {
+ return;
+ }
+
+ /* Signal to reallocate textures in host memory only. */
+ move_texture_to_host = true;
+
+ while (size > 0) {
+ /* Find suitable memory allocation to move. */
+ device_memory *max_mem = NULL;
+ size_t max_size = 0;
+ bool max_is_image = false;
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
+ device_memory &mem = *pair.first;
+ CUDAMem *cmem = &pair.second;
+
+      /* Can only move textures allocated on this device (and not those from
+       * peer devices), and we need to ignore memory that is already on the host. */
+ if (!mem.is_resident(this) || cmem->use_mapped_host) {
+ continue;
+ }
+
+ bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
+ (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ /* Can't move this type of memory. */
+ if (!is_texture || cmem->array) {
+ continue;
+ }
+
+ /* For other textures, only move image textures. */
+ if (for_texture && !is_image) {
+ continue;
+ }
+
+ /* Try to move largest allocation, prefer moving images. */
+ if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+ max_is_image = is_image;
+ max_size = mem.device_size;
+ max_mem = &mem;
+ }
+ }
+ lock.unlock();
+
+ /* Move to host memory. This part is mutex protected since
+ * multiple CUDA devices could be moving the memory. The
+ * first one will do it, and the rest will adopt the pointer. */
+ if (max_mem) {
+ VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+ static thread_mutex move_mutex;
+ thread_scoped_lock lock(move_mutex);
+
+ any_device_moving_textures_to_host = true;
+
+      /* Potentially need to call back into multi device, so pointer mapping
+       * and peer devices are updated. This is also necessary since the device
+       * pointer may just be a key here, so cannot be accessed and freed directly.
+       * Unfortunately it does mean that memory is reallocated on all other
+       * devices as well, which is potentially dangerous when still in use (since
+       * a thread rendering on another device would only be caught in this mutex
+       * if it happens to do an allocation at the same time as well). */
+ max_mem->device_copy_to();
+ size = (max_size >= size) ? 0 : size - max_size;
+
+ any_device_moving_textures_to_host = false;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+ move_texture_to_host = false;
+
+ /* Update texture info array with new pointers. */
+ load_texture_info();
+}
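+
+/* Note on the heuristic above: each iteration moves at most one allocation,
+ * preferring image textures over 1D data textures and, within each class,
+ * the largest allocation, until enough memory has been freed or no suitable
+ * allocation remains. */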
+
+CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+{
+ CUDAContextScope scope(this);
+
+ CUdeviceptr device_pointer = 0;
+ size_t size = mem.memory_size() + pitch_padding;
+
+ CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+ const char *status = "";
+
+ /* First try allocating in device memory, respecting headroom. We make
+ * an exception for texture info. It is small and frequently accessed,
+ * so treat it as working memory.
+ *
+ * If there is not enough room for working memory, we will try to move
+ * textures to host memory, assuming the performance impact would have
+ * been worse for working memory. */
+ bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+ size_t total = 0, free = 0;
+ cuMemGetInfo(&free, &total);
+
+ /* Move textures to host memory if needed. */
+ if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
+ move_textures_to_host(size + headroom - free, is_texture);
+ cuMemGetInfo(&free, &total);
+ }
+
+ /* Allocate in device memory. */
+ if (!move_texture_to_host && (size + headroom) < free) {
+ mem_alloc_result = cuMemAlloc(&device_pointer, size);
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ status = " in device memory";
+ }
+ }
+
+ /* Fall back to mapped host memory if needed and possible. */
+
+ void *shared_pointer = 0;
+
+ if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
+ if (mem.shared_pointer) {
+ /* Another device already allocated host memory. */
+ mem_alloc_result = CUDA_SUCCESS;
+ shared_pointer = mem.shared_pointer;
+ }
+ else if (map_host_used + size < map_host_limit) {
+ /* Allocate host memory ourselves. */
+ mem_alloc_result = cuMemHostAlloc(
+ &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+
+ assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
+ (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
+ }
+
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
+ map_host_used += size;
+ status = " in host memory";
+ }
+ }
+
+ if (mem_alloc_result != CUDA_SUCCESS) {
+ status = " failed, out of device and host memory";
+ set_error("System is out of GPU and shared host memory");
+ }
+
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")" << status;
+ }
+
+ mem.device_pointer = (device_ptr)device_pointer;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ if (!mem.device_pointer) {
+ return NULL;
+ }
+
+ /* Insert into map of allocations. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ CUDAMem *cmem = &cuda_mem_map[&mem];
+ if (shared_pointer != 0) {
+ /* Replace host pointer with our host allocation. Only works if
+ * CUDA memory layout is the same and has no pitch padding. Also
+ * does not work if we move textures to host during a render,
+ * since other devices might be using the memory. */
+
+ if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+ mem.host_pointer != shared_pointer) {
+ memcpy(shared_pointer, mem.host_pointer, size);
+
+      /* A call to device_memory::host_free() should be preceded by
+       * a call to device_memory::device_free() for host memory
+       * allocated by a device to be handled properly. Two exceptions
+       * are here and a call in OptiXDevice::generic_alloc(), where
+       * the current host memory can be assumed to be allocated by
+       * device_memory::host_alloc(), not by a device. */
+
+ mem.host_free();
+ mem.host_pointer = shared_pointer;
+ }
+ mem.shared_pointer = shared_pointer;
+ mem.shared_counter++;
+ cmem->use_mapped_host = true;
+ }
+ else {
+ cmem->use_mapped_host = false;
+ }
+
+ return cmem;
+}
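+
+/* Summary of the fallback order in generic_alloc():
+ *   1. device memory via cuMemAlloc, when size + headroom fits in free VRAM
+ *      (possibly after moving textures to host memory to make room);
+ *   2. mapped host memory via cuMemHostAlloc, shared between devices and
+ *      bounded by map_host_limit;
+ *   3. otherwise fail and report that GPU and shared host memory are full. */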
+
+void CUDADevice::generic_copy_to(device_memory &mem)
+{
+ if (!mem.host_pointer || !mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
+ * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
+ * mem.host_pointer. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(
+ cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
+ }
+}
+
+void CUDADevice::generic_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ /* If cmem.use_mapped_host is true, reference counting is used
+ * to safely free a mapped host memory. */
+
+ if (cmem.use_mapped_host) {
+ assert(mem.shared_pointer);
+ if (mem.shared_pointer) {
+ assert(mem.shared_counter > 0);
+ if (--mem.shared_counter == 0) {
+ if (mem.host_pointer == mem.shared_pointer) {
+ mem.host_pointer = 0;
+ }
+ cuMemFreeHost(mem.shared_pointer);
+ mem.shared_pointer = 0;
+ }
+ }
+ map_host_used -= mem.device_size;
+ }
+ else {
+ /* Free device memory. */
+ cuda_assert(cuMemFree(mem.device_pointer));
+ }
+
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+}
+
+void CUDADevice::mem_alloc(device_memory &mem)
+{
+ if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ assert(!"mem_alloc not supported for global memory.");
+ }
+ else {
+ generic_alloc(mem);
+ }
+}
+
+void CUDADevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ generic_alloc(mem);
+ }
+ generic_copy_to(mem);
+ }
+}
+
+void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+{
+ if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
+ assert(!"mem_copy_from not supported for textures.");
+ }
+ else if (mem.host_pointer) {
+ const size_t size = elem * w * h;
+ const size_t offset = elem * y * w;
+
+ if (mem.device_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(cuMemcpyDtoH(
+ (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
+ }
+ else {
+ memset((char *)mem.host_pointer + offset, 0, size);
+ }
+ }
+}
+
+void CUDADevice::mem_zero(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+ if (!mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
+ * regardless of mem.host_pointer and mem.shared_pointer. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
+ }
+ else if (mem.host_pointer) {
+ memset(mem.host_pointer, 0, mem.memory_size());
+ }
+}
+
+void CUDADevice::mem_free(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ }
+ else {
+ generic_free(mem);
+ }
+}
+
+device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+{
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+}
+
+void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ CUDAContextScope scope(this);
+ CUdeviceptr mem;
+ size_t bytes;
+
+ cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
+ // assert(bytes == size);
+ cuda_assert(cuMemcpyHtoD(mem, host, size));
+}
+
+void CUDADevice::global_alloc(device_memory &mem)
+{
+ if (mem.is_resident(this)) {
+ generic_alloc(mem);
+ generic_copy_to(mem);
+ }
+
+ const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
+}
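+
+/* Global memory is thus bound by name: the device pointer is written into the
+ * module-level symbol matching mem.name, e.g. an allocation named
+ * "__texture_info" fills in the __texture_info global seen by the kernels. */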
+
+void CUDADevice::global_free(device_memory &mem)
+{
+ if (mem.is_resident(this) && mem.device_pointer) {
+ generic_free(mem);
+ }
+}
+
+void CUDADevice::tex_alloc(device_texture &mem)
+{
+ CUDAContextScope scope(this);
+
+ /* General variables for both architectures */
+ string bind_name = mem.name;
+ size_t dsize = datatype_size(mem.data_type);
+ size_t size = mem.memory_size();
+
+ CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ switch (mem.info.extension) {
+ case EXTENSION_REPEAT:
+ address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ break;
+ case EXTENSION_EXTEND:
+ address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+ break;
+ case EXTENSION_CLIP:
+ address_mode = CU_TR_ADDRESS_MODE_BORDER;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ CUfilter_mode filter_mode;
+ if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
+ filter_mode = CU_TR_FILTER_MODE_POINT;
+ }
+ else {
+ filter_mode = CU_TR_FILTER_MODE_LINEAR;
+ }
+
+ /* Image Texture Storage */
+ CUarray_format_enum format;
+ switch (mem.data_type) {
+ case TYPE_UCHAR:
+ format = CU_AD_FORMAT_UNSIGNED_INT8;
+ break;
+ case TYPE_UINT16:
+ format = CU_AD_FORMAT_UNSIGNED_INT16;
+ break;
+ case TYPE_UINT:
+ format = CU_AD_FORMAT_UNSIGNED_INT32;
+ break;
+ case TYPE_INT:
+ format = CU_AD_FORMAT_SIGNED_INT32;
+ break;
+ case TYPE_FLOAT:
+ format = CU_AD_FORMAT_FLOAT;
+ break;
+ case TYPE_HALF:
+ format = CU_AD_FORMAT_HALF;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ CUDAMem *cmem = NULL;
+ CUarray array_3d = NULL;
+ size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+ size_t dst_pitch = src_pitch;
+
+ if (!mem.is_resident(this)) {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+
+ if (mem.data_depth > 1) {
+ array_3d = (CUarray)mem.device_pointer;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ }
+ }
+ else if (mem.data_depth > 1) {
+    /* 3D texture using array, since there is no API for 3D linear memory. */
+ CUDA_ARRAY3D_DESCRIPTOR desc;
+
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Depth = mem.data_depth;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+ desc.Flags = 0;
+
+ VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ cuda_assert(cuArray3DCreate(&array_3d, &desc));
+
+ if (!array_3d) {
+ return;
+ }
+
+ CUDA_MEMCPY3D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = array_3d;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+ param.Depth = mem.data_depth;
+
+ cuda_assert(cuMemcpy3D(&param));
+
+ mem.device_pointer = (device_ptr)array_3d;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ /* 2D texture, using pitch aligned linear memory. */
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ size_t dst_size = dst_pitch * mem.data_height;
+
+ cmem = generic_alloc(mem, dst_size - mem.memory_size());
+ if (!cmem) {
+ return;
+ }
+
+ CUDA_MEMCPY2D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+ param.dstDevice = mem.device_pointer;
+ param.dstPitch = dst_pitch;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+
+ cuda_assert(cuMemcpy2DUnaligned(&param));
+ }
+ else {
+ /* 1D texture, using linear memory. */
+ cmem = generic_alloc(mem);
+ if (!cmem) {
+ return;
+ }
+
+ cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+ }
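+
+  /* Pitch arithmetic example (illustrative numbers, assuming a 32-byte pitch
+   * alignment): a 2D float4 texture of width 1000 has
+   * src_pitch = 1000 * 4 * 4 = 16000 bytes, which is already aligned, whereas
+   * width 1001 gives 16016 bytes, padded up to dst_pitch = 16032. */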
+
+ /* Resize once */
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+ /* Allocate some slots in advance, to reduce amount
+ * of re-allocations. */
+ texture_info.resize(slot + 128);
+ }
+
+  /* Set mapping and tag that we need to (re-)upload to the device. */
+ texture_info[slot] = mem.info;
+ need_texture_info = true;
+
+ if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
+ mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+ /* Kepler+, bindless textures. */
+ CUDA_RESOURCE_DESC resDesc;
+ memset(&resDesc, 0, sizeof(resDesc));
+
+ if (array_3d) {
+ resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+ resDesc.res.array.hArray = array_3d;
+ resDesc.flags = 0;
+ }
+ else if (mem.data_height > 0) {
+ resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+ resDesc.res.pitch2D.devPtr = mem.device_pointer;
+ resDesc.res.pitch2D.format = format;
+ resDesc.res.pitch2D.numChannels = mem.data_elements;
+ resDesc.res.pitch2D.height = mem.data_height;
+ resDesc.res.pitch2D.width = mem.data_width;
+ resDesc.res.pitch2D.pitchInBytes = dst_pitch;
+ }
+ else {
+ resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+ resDesc.res.linear.devPtr = mem.device_pointer;
+ resDesc.res.linear.format = format;
+ resDesc.res.linear.numChannels = mem.data_elements;
+ resDesc.res.linear.sizeInBytes = mem.device_size;
+ }
+
+ CUDA_TEXTURE_DESC texDesc;
+ memset(&texDesc, 0, sizeof(texDesc));
+ texDesc.addressMode[0] = address_mode;
+ texDesc.addressMode[1] = address_mode;
+ texDesc.addressMode[2] = address_mode;
+ texDesc.filterMode = filter_mode;
+ texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+
+ cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
+
+ texture_info[slot].data = (uint64_t)cmem->texobject;
+ }
+ else {
+ texture_info[slot].data = (uint64_t)mem.device_pointer;
+ }
+}
+
+void CUDADevice::tex_free(device_texture &mem)
+{
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ if (cmem.texobject) {
+ /* Free bindless texture. */
+ cuTexObjectDestroy(cmem.texobject);
+ }
+
+ if (!mem.is_resident(this)) {
+ /* Do not free memory here, since it was allocated on a different device. */
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else if (cmem.array) {
+ /* Free array. */
+ cuArrayDestroy(cmem.array);
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else {
+ lock.unlock();
+ generic_free(mem);
+ }
+ }
+}
+
+# if 0
+void CUDADevice::render(DeviceTask &task,
+ RenderTile &rtile,
+ device_vector<KernelWorkTile> &work_tiles)
+{
+ scoped_timer timer(&rtile.buffers->render_time);
+
+ if (have_error())
+ return;
+
+ CUDAContextScope scope(this);
+ CUfunction cuRender;
+
+ /* Get kernel function. */
+ if (rtile.task == RenderTile::BAKE) {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
+ }
+
+ if (have_error()) {
+ return;
+ }
+
+ cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
+
+ /* Allocate work tile. */
+ work_tiles.alloc(1);
+
+ KernelWorkTile *wtile = work_tiles.data();
+ wtile->x = rtile.x;
+ wtile->y = rtile.y;
+ wtile->w = rtile.w;
+ wtile->h = rtile.h;
+ wtile->offset = rtile.offset;
+ wtile->stride = rtile.stride;
+ wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
+
+ /* Prepare work size. More step samples render faster, but for now we
+ * remain conservative for GPUs connected to a display to avoid driver
+ * timeouts and display freezing. */
+ int min_blocks, num_threads_per_block;
+ cuda_assert(
+ cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
+ if (!info.display_device) {
+ min_blocks *= 8;
+ }
+
+ uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
+
+ /* Render all samples. */
+ uint start_sample = rtile.start_sample;
+ uint end_sample = rtile.start_sample + rtile.num_samples;
+
+ for (int sample = start_sample; sample < end_sample;) {
+ /* Setup and copy work tile to device. */
+ wtile->start_sample = sample;
+ wtile->num_samples = step_samples;
+ if (task.adaptive_sampling.use) {
+ wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
+ }
+ wtile->num_samples = min(wtile->num_samples, end_sample - sample);
+ work_tiles.copy_to_device();
+
+ CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+ uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles, &total_work_size};
+
+ cuda_assert(
+ cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+ /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
+ uint filter_sample = sample + wtile->num_samples - 1;
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+ adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
+ }
+
+ cuda_assert(cuCtxSynchronize());
+
+ /* Update progress. */
+ sample += wtile->num_samples;
+ rtile.sample = sample;
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ /* Finalize adaptive sampling. */
+ if (task.adaptive_sampling.use) {
+ CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+ adaptive_sampling_post(rtile, wtile, d_work_tiles);
+ cuda_assert(cuCtxSynchronize());
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+ }
+}
+
+void CUDADevice::thread_run(DeviceTask &task)
+{
+ CUDAContextScope scope(this);
+
+ if (task.type == DeviceTask::RENDER) {
+ device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
+
+ /* keep rendering tiles until done */
+ RenderTile tile;
+ DenoisingTask denoising(this, task);
+
+ while (task.acquire_tile(this, tile, task.tile_types)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ render(task, tile, work_tiles);
+ }
+ else if (tile.task == RenderTile::BAKE) {
+ render(task, tile, work_tiles);
+ }
+
+ task.release_tile(tile);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ work_tiles.free();
+ }
+}
+# endif
+
+unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
+{
+ return make_unique<CUDADeviceQueue>(this);
+}
+
+bool CUDADevice::should_use_graphics_interop()
+{
+  /* Check whether this device is part of the OpenGL context.
+   *
+   * Using a CUDA device that is not part of the OpenGL context for graphics
+   * interoperability is possible, but empirical measurements show it can be
+   * considerably slower than a naive pixel copy. */
+
+ CUDAContextScope scope(this);
+
+ int num_all_devices = 0;
+ cuda_assert(cuDeviceGetCount(&num_all_devices));
+
+ if (num_all_devices == 0) {
+ return false;
+ }
+
+ vector<CUdevice> gl_devices(num_all_devices);
+ uint num_gl_devices;
+ cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);
+
+ for (CUdevice gl_device : gl_devices) {
+ if (gl_device == cuDevice) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+int CUDADevice::get_num_multiprocessors()
+{
+ return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
+}
+
+int CUDADevice::get_max_num_threads_per_multiprocessor()
+{
+ return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0);
+}
+
+bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value)
+{
+ CUDAContextScope scope(this);
+
+ return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS;
+}
+
+int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int default_value)
+{
+ int value = 0;
+ if (!get_device_attribute(attribute, &value)) {
+ return default_value;
+ }
+ return value;
+}
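+
+/* A hypothetical caller could combine the two queries above to size
+ * per-device work pools, for example:
+ *
+ *   const int max_num_threads = device->get_num_multiprocessors() *
+ *                               device->get_max_num_threads_per_multiprocessor();
+ */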
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h
new file mode 100644
index 00000000000..6b27db54ab4
--- /dev/null
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/kernel.h"
+# include "device/cuda/queue.h"
+# include "device/cuda/util.h"
+# include "device/device.h"
+
+# include "util/util_map.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include "util/util_opengl.h"
+# include <cuda.h>
+# include <cudaGL.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+class CUDADevice : public Device {
+
+ friend class CUDAContextScope;
+
+ public:
+ CUdevice cuDevice;
+ CUcontext cuContext;
+ CUmodule cuModule;
+ size_t device_texture_headroom;
+ size_t device_working_headroom;
+ bool move_texture_to_host;
+ size_t map_host_used;
+ size_t map_host_limit;
+ int can_map_host;
+ int pitch_alignment;
+ int cuDevId;
+ int cuDevArchitecture;
+ bool first_error;
+
+ struct CUDAMem {
+ CUDAMem() : texobject(0), array(0), use_mapped_host(false)
+ {
+ }
+
+ CUtexObject texobject;
+ CUarray array;
+
+ /* If true, a mapped host memory in shared_pointer is being used. */
+ bool use_mapped_host;
+ };
+ typedef map<device_memory *, CUDAMem> CUDAMemMap;
+ CUDAMemMap cuda_mem_map;
+ thread_mutex cuda_mem_map_mutex;
+
+ /* Bindless Textures */
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+
+ CUDADeviceKernels kernels;
+
+ static bool have_precompiled_kernels();
+
+ virtual bool show_samples() const override;
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+ void set_error(const string &error) override;
+
+ CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+ virtual ~CUDADevice();
+
+ bool support_device(const uint /*kernel_features*/);
+
+ bool check_peer_access(Device *peer_device) override;
+
+ bool use_adaptive_compilation();
+
+ virtual string compile_kernel_get_common_cflags(const uint kernel_features);
+
+ string compile_kernel(const uint kernel_features,
+ const char *name,
+ const char *base = "cuda",
+ bool force_ptx = false);
+
+ virtual bool load_kernels(const uint kernel_features) override;
+
+ void reserve_local_memory(const uint kernel_features);
+
+ void init_host_memory();
+
+ void load_texture_info();
+
+ void move_textures_to_host(size_t size, bool for_texture);
+
+ CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+
+ void generic_copy_to(device_memory &mem);
+
+ void generic_free(device_memory &mem);
+
+ void mem_alloc(device_memory &mem) override;
+
+ void mem_copy_to(device_memory &mem) override;
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+
+ void mem_zero(device_memory &mem) override;
+
+ void mem_free(device_memory &mem) override;
+
+ device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void global_alloc(device_memory &mem);
+
+ void global_free(device_memory &mem);
+
+ void tex_alloc(device_texture &mem);
+
+ void tex_free(device_texture &mem);
+
+ virtual bool should_use_graphics_interop() override;
+
+ virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+ int get_num_multiprocessors();
+ int get_max_num_threads_per_multiprocessor();
+
+ protected:
+ bool get_device_attribute(CUdevice_attribute attribute, int *value);
+ int get_device_default_attribute(CUdevice_attribute attribute, int default_value);
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/graphics_interop.cpp b/intern/cycles/device/cuda/graphics_interop.cpp
new file mode 100644
index 00000000000..e8ca8b90eae
--- /dev/null
+++ b/intern/cycles/device/cuda/graphics_interop.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/graphics_interop.h"
+
+# include "device/cuda/device_impl.h"
+# include "device/cuda/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+CUDADeviceGraphicsInterop::CUDADeviceGraphicsInterop(CUDADeviceQueue *queue)
+ : queue_(queue), device_(static_cast<CUDADevice *>(queue->device))
+{
+}
+
+CUDADeviceGraphicsInterop::~CUDADeviceGraphicsInterop()
+{
+ CUDAContextScope scope(device_);
+
+ if (cu_graphics_resource_) {
+ cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_));
+ }
+}
+
+void CUDADeviceGraphicsInterop::set_destination(
+ const DeviceGraphicsInteropDestination &destination)
+{
+ const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height;
+
+ need_clear_ = destination.need_clear;
+
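+  /* Re-use the already registered graphics resource when the PBO and its size are
+   * unchanged; re-registering the buffer on every update would be needlessly expensive. */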
+ if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) {
+ return;
+ }
+
+ CUDAContextScope scope(device_);
+
+ if (cu_graphics_resource_) {
+ cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_));
+ }
+
+ const CUresult result = cuGraphicsGLRegisterBuffer(
+ &cu_graphics_resource_, destination.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+ if (result != CUDA_SUCCESS) {
+ LOG(ERROR) << "Error registering OpenGL buffer: " << cuewErrorString(result);
+ }
+
+ opengl_pbo_id_ = destination.opengl_pbo_id;
+ buffer_area_ = new_buffer_area;
+}
+
+device_ptr CUDADeviceGraphicsInterop::map()
+{
+ if (!cu_graphics_resource_) {
+ return 0;
+ }
+
+ CUDAContextScope scope(device_);
+
+ CUdeviceptr cu_buffer;
+ size_t bytes;
+
+ cuda_device_assert(device_, cuGraphicsMapResources(1, &cu_graphics_resource_, queue_->stream()));
+ cuda_device_assert(
+ device_, cuGraphicsResourceGetMappedPointer(&cu_buffer, &bytes, cu_graphics_resource_));
+
+ if (need_clear_) {
+ cuda_device_assert(
+ device_, cuMemsetD8Async(static_cast<CUdeviceptr>(cu_buffer), 0, bytes, queue_->stream()));
+
+ need_clear_ = false;
+ }
+
+ return static_cast<device_ptr>(cu_buffer);
+}
+
+void CUDADeviceGraphicsInterop::unmap()
+{
+ CUDAContextScope scope(device_);
+
+ cuda_device_assert(device_,
+ cuGraphicsUnmapResources(1, &cu_graphics_resource_, queue_->stream()));
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/graphics_interop.h b/intern/cycles/device/cuda/graphics_interop.h
new file mode 100644
index 00000000000..8a70c8aa71d
--- /dev/null
+++ b/intern/cycles/device/cuda/graphics_interop.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/device_graphics_interop.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+class CUDADeviceQueue;
+
+class CUDADeviceGraphicsInterop : public DeviceGraphicsInterop {
+ public:
+ explicit CUDADeviceGraphicsInterop(CUDADeviceQueue *queue);
+
+ CUDADeviceGraphicsInterop(const CUDADeviceGraphicsInterop &other) = delete;
+ CUDADeviceGraphicsInterop(CUDADeviceGraphicsInterop &&other) noexcept = delete;
+
+ ~CUDADeviceGraphicsInterop();
+
+ CUDADeviceGraphicsInterop &operator=(const CUDADeviceGraphicsInterop &other) = delete;
+ CUDADeviceGraphicsInterop &operator=(CUDADeviceGraphicsInterop &&other) = delete;
+
+ virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override;
+
+ virtual device_ptr map() override;
+ virtual void unmap() override;
+
+ protected:
+ CUDADeviceQueue *queue_ = nullptr;
+ CUDADevice *device_ = nullptr;
+
+ /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */
+ uint opengl_pbo_id_ = 0;
+ /* Buffer area in pixels of the corresponding PBO. */
+ int64_t buffer_area_ = 0;
+
+ /* The destination was requested to be cleared. */
+ bool need_clear_ = false;
+
+ CUgraphicsResource cu_graphics_resource_ = nullptr;
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/kernel.cpp b/intern/cycles/device/cuda/kernel.cpp
new file mode 100644
index 00000000000..a4a7bfabce0
--- /dev/null
+++ b/intern/cycles/device/cuda/kernel.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/kernel.h"
+# include "device/cuda/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+void CUDADeviceKernels::load(CUDADevice *device)
+{
+ CUmodule cuModule = device->cuModule;
+
+ for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
+ CUDADeviceKernel &kernel = kernels_[i];
+
+ /* No mega-kernel used for GPU. */
+ if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ continue;
+ }
+
+ const std::string function_name = std::string("kernel_gpu_") +
+ device_kernel_as_string((DeviceKernel)i);
+ cuda_device_assert(device,
+ cuModuleGetFunction(&kernel.function, cuModule, function_name.c_str()));
+
+ if (kernel.function) {
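+      /* These kernels use little or no shared memory, so prefer a larger L1 cache. */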
+ cuda_device_assert(device, cuFuncSetCacheConfig(kernel.function, CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_device_assert(
+ device,
+ cuOccupancyMaxPotentialBlockSize(
+ &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, NULL, 0, 0));
+ }
+ else {
+ LOG(ERROR) << "Unable to load kernel " << function_name;
+ }
+ }
+
+ loaded = true;
+}
+
+const CUDADeviceKernel &CUDADeviceKernels::get(DeviceKernel kernel) const
+{
+ return kernels_[(int)kernel];
+}
+
+bool CUDADeviceKernels::available(DeviceKernel kernel) const
+{
+ return kernels_[(int)kernel].function != nullptr;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/kernel.h b/intern/cycles/device/cuda/kernel.h
new file mode 100644
index 00000000000..b489547a350
--- /dev/null
+++ b/intern/cycles/device/cuda/kernel.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# include "device/device_kernel.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+
+/* CUDA kernel and associated occupancy information. */
+class CUDADeviceKernel {
+ public:
+ CUfunction function = nullptr;
+
+ int num_threads_per_block = 0;
+ int min_blocks = 0;
+};
+
+/* Cache of CUDA kernels for each DeviceKernel. */
+class CUDADeviceKernels {
+ public:
+ void load(CUDADevice *device);
+ const CUDADeviceKernel &get(DeviceKernel kernel) const;
+ bool available(DeviceKernel kernel) const;
+
+ protected:
+ CUDADeviceKernel kernels_[DEVICE_KERNEL_NUM];
+ bool loaded = false;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
new file mode 100644
index 00000000000..b7f86c10553
--- /dev/null
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/queue.h"
+
+# include "device/cuda/device_impl.h"
+# include "device/cuda/graphics_interop.h"
+# include "device/cuda/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* CUDADeviceQueue */
+
+CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device)
+ : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr)
+{
+ const CUDAContextScope scope(cuda_device_);
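+  /* A non-blocking stream avoids implicit synchronization with the legacy default stream. */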
+ cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING));
+}
+
+CUDADeviceQueue::~CUDADeviceQueue()
+{
+ const CUDAContextScope scope(cuda_device_);
+ cuStreamDestroy(cuda_stream_);
+}
+
+int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
+{
+ int num_states = max(cuda_device_->get_num_multiprocessors() *
+ cuda_device_->get_max_num_threads_per_multiprocessor() * 16,
+ 1048576);
+
+ const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
+ if (factor_str) {
+ num_states = max((int)(num_states * atof(factor_str)), 1024);
+ }
+
+ VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
+ << string_human_readable_size(num_states * state_size);
+
+ return num_states;
+}
+
+int CUDADeviceQueue::num_concurrent_busy_states() const
+{
+ const int max_num_threads = cuda_device_->get_num_multiprocessors() *
+ cuda_device_->get_max_num_threads_per_multiprocessor();
+
+ if (max_num_threads == 0) {
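+    /* Device attributes were unavailable; fall back to a conservative default. */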
+ return 65536;
+ }
+
+ return 4 * max_num_threads;
+}
+
+void CUDADeviceQueue::init_execution()
+{
+  /* Synchronize all textures and memory copies before executing the task. */
+ CUDAContextScope scope(cuda_device_);
+ cuda_device_->load_texture_info();
+ cuda_device_assert(cuda_device_, cuCtxSynchronize());
+
+ debug_init_execution();
+}
+
+bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const
+{
+ return cuda_device_->kernels.available(kernel);
+}
+
+bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ debug_enqueue(kernel, work_size);
+
+ const CUDAContextScope scope(cuda_device_);
+ const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);
+
+ /* Compute kernel launch parameters. */
+ const int num_threads_per_block = cuda_kernel.num_threads_per_block;
+ const int num_blocks = divide_up(work_size, num_threads_per_block);
+
+ int shared_mem_bytes = 0;
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+      /* See parallel_active_index.h for why this amount of shared memory is needed. */
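+      /* (One int of scratch per thread plus one aggregate slot for the block-wide
+       * prefix sum over the active flags.) */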
+ shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
+ break;
+
+ default:
+ break;
+ }
+
+ /* Launch kernel. */
+ cuda_device_assert(cuda_device_,
+ cuLaunchKernel(cuda_kernel.function,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ shared_mem_bytes,
+ cuda_stream_,
+ args,
+ 0));
+
+ return !(cuda_device_->have_error());
+}
+
+bool CUDADeviceQueue::synchronize()
+{
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+ debug_synchronize();
+
+ return !(cuda_device_->have_error());
+}
+
+void CUDADeviceQueue::zero_to_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ /* Allocate on demand. */
+ if (mem.device_pointer == 0) {
+ cuda_device_->mem_alloc(mem);
+ }
+
+ /* Zero memory on device. */
+ assert(mem.device_pointer != 0);
+
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
+}
+
+void CUDADeviceQueue::copy_to_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ /* Allocate on demand. */
+ if (mem.device_pointer == 0) {
+ cuda_device_->mem_alloc(mem);
+ }
+
+ assert(mem.device_pointer != 0);
+ assert(mem.host_pointer != nullptr);
+
+ /* Copy memory to device. */
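+  /* Note: the copy is asynchronous on this queue's stream, so the host buffer must
+   * stay valid until synchronize() has been called. */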
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(
+ (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
+}
+
+void CUDADeviceQueue::copy_from_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ assert(mem.device_pointer != 0);
+ assert(mem.host_pointer != nullptr);
+
+ /* Copy memory from device. */
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyDtoHAsync(
+ mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
+}
+
+unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
+{
+ return make_unique<CUDADeviceGraphicsInterop>(this);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
new file mode 100644
index 00000000000..62e3aa3d6c2
--- /dev/null
+++ b/intern/cycles/device/cuda/queue.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# include "device/device_kernel.h"
+# include "device/device_memory.h"
+# include "device/device_queue.h"
+
+# include "device/cuda/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+class device_memory;
+
+/* Base class for CUDA queues. */
+class CUDADeviceQueue : public DeviceQueue {
+ public:
+ CUDADeviceQueue(CUDADevice *device);
+ ~CUDADeviceQueue();
+
+ virtual int num_concurrent_states(const size_t state_size) const override;
+ virtual int num_concurrent_busy_states() const override;
+
+ virtual void init_execution() override;
+
+ virtual bool kernel_available(DeviceKernel kernel) const override;
+
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+
+ virtual bool synchronize() override;
+
+ virtual void zero_to_device(device_memory &mem) override;
+ virtual void copy_to_device(device_memory &mem) override;
+ virtual void copy_from_device(device_memory &mem) override;
+
+ virtual CUstream stream()
+ {
+ return cuda_stream_;
+ }
+
+ virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
+
+ protected:
+ CUDADevice *cuda_device_;
+ CUstream cuda_stream_;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/util.cpp b/intern/cycles/device/cuda/util.cpp
new file mode 100644
index 00000000000..8f657cc10fe
--- /dev/null
+++ b/intern/cycles/device/cuda/util.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/util.h"
+# include "device/cuda/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
+{
+ cuda_device_assert(device, cuCtxPushCurrent(device->cuContext));
+}
+
+CUDAContextScope::~CUDAContextScope()
+{
+ cuda_device_assert(device, cuCtxPopCurrent(NULL));
+}
+
+# ifndef WITH_CUDA_DYNLOAD
+const char *cuewErrorString(CUresult result)
+{
+  /* We can only give the error code here without major code duplication, but
+   * that should be enough since dynamic loading is only disabled by folks who
+   * know what they're doing anyway.
+   *
+   * NOTE: Avoid calling this from several threads at once.
+ */
+ static string error;
+ error = string_printf("%d", result);
+ return error.c_str();
+}
+
+const char *cuewCompilerPath()
+{
+ return CYCLES_CUDA_NVCC_EXECUTABLE;
+}
+
+int cuewCompilerVersion()
+{
+ return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
+}
+# endif
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/util.h b/intern/cycles/device/cuda/util.h
new file mode 100644
index 00000000000..a0898094c08
--- /dev/null
+++ b/intern/cycles/device/cuda/util.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+
+/* Utility to push/pop CUDA context. */
+class CUDAContextScope {
+ public:
+ CUDAContextScope(CUDADevice *device);
+ ~CUDAContextScope();
+
+ private:
+ CUDADevice *device;
+};
+
+/* Utility for checking return values of CUDA function calls. */
+# define cuda_device_assert(cuda_device, stmt) \
+ { \
+ CUresult result = stmt; \
+ if (result != CUDA_SUCCESS) { \
+ const char *name = cuewErrorString(result); \
+ cuda_device->set_error( \
+ string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
+ } \
+ } \
+ (void)0
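+/* The trailing (void)0 makes the macro expansion require a terminating semicolon,
+ * so invocations read like ordinary function calls. */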
+
+# define cuda_assert(stmt) cuda_device_assert(this, stmt)
+
+# ifndef WITH_CUDA_DYNLOAD
+/* Transparently implement some functions, so the majority of the file does not need
+ * to worry about the difference between dynamically loaded and directly linked CUDA. */
+const char *cuewErrorString(CUresult result);
+const char *cuewCompilerPath();
+int cuewCompilerVersion();
+# endif /* WITH_CUDA_DYNLOAD */
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index ed53fbb54ae..6ccedcf54ef 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -20,7 +20,13 @@
#include "bvh/bvh2.h"
#include "device/device.h"
-#include "device/device_intern.h"
+#include "device/device_queue.h"
+
+#include "device/cpu/device.h"
+#include "device/cuda/device.h"
+#include "device/dummy/device.h"
+#include "device/multi/device.h"
+#include "device/optix/device.h"
#include "util/util_foreach.h"
#include "util/util_half.h"
@@ -38,332 +44,15 @@ CCL_NAMESPACE_BEGIN
bool Device::need_types_update = true;
bool Device::need_devices_update = true;
thread_mutex Device::device_mutex;
-vector<DeviceInfo> Device::opencl_devices;
vector<DeviceInfo> Device::cuda_devices;
vector<DeviceInfo> Device::optix_devices;
vector<DeviceInfo> Device::cpu_devices;
-vector<DeviceInfo> Device::network_devices;
uint Device::devices_initialized_mask = 0;
-/* Device Requested Features */
-
-std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features)
-{
- os << "Experimental features: " << (requested_features.experimental ? "On" : "Off") << std::endl;
- os << "Max nodes group: " << requested_features.max_nodes_group << std::endl;
- /* TODO(sergey): Decode bitflag into list of names. */
- os << "Nodes features: " << requested_features.nodes_features << std::endl;
- os << "Use Hair: " << string_from_bool(requested_features.use_hair) << std::endl;
- os << "Use Object Motion: " << string_from_bool(requested_features.use_object_motion)
- << std::endl;
- os << "Use Camera Motion: " << string_from_bool(requested_features.use_camera_motion)
- << std::endl;
- os << "Use Baking: " << string_from_bool(requested_features.use_baking) << std::endl;
- os << "Use Subsurface: " << string_from_bool(requested_features.use_subsurface) << std::endl;
- os << "Use Volume: " << string_from_bool(requested_features.use_volume) << std::endl;
- os << "Use Branched Integrator: " << string_from_bool(requested_features.use_integrator_branched)
- << std::endl;
- os << "Use Patch Evaluation: " << string_from_bool(requested_features.use_patch_evaluation)
- << std::endl;
- os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent)
- << std::endl;
- os << "Use Principled BSDF: " << string_from_bool(requested_features.use_principled)
- << std::endl;
- os << "Use Denoising: " << string_from_bool(requested_features.use_denoising) << std::endl;
- os << "Use Displacement: " << string_from_bool(requested_features.use_true_displacement)
- << std::endl;
- os << "Use Background Light: " << string_from_bool(requested_features.use_background_light)
- << std::endl;
- return os;
-}
-
/* Device */
Device::~Device() noexcept(false)
{
- if (!background) {
- if (vertex_buffer != 0) {
- glDeleteBuffers(1, &vertex_buffer);
- }
- if (fallback_shader_program != 0) {
- glDeleteProgram(fallback_shader_program);
- }
- }
-}
-
-/* TODO move shaders to standalone .glsl file. */
-const char *FALLBACK_VERTEX_SHADER =
- "#version 330\n"
- "uniform vec2 fullscreen;\n"
- "in vec2 texCoord;\n"
- "in vec2 pos;\n"
- "out vec2 texCoord_interp;\n"
- "\n"
- "vec2 normalize_coordinates()\n"
- "{\n"
- " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n"
- "}\n"
- "\n"
- "void main()\n"
- "{\n"
- " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n"
- " texCoord_interp = texCoord;\n"
- "}\n\0";
-
-const char *FALLBACK_FRAGMENT_SHADER =
- "#version 330\n"
- "uniform sampler2D image_texture;\n"
- "in vec2 texCoord_interp;\n"
- "out vec4 fragColor;\n"
- "\n"
- "void main()\n"
- "{\n"
- " fragColor = texture(image_texture, texCoord_interp);\n"
- "}\n\0";
-
-static void shader_print_errors(const char *task, const char *log, const char *code)
-{
- LOG(ERROR) << "Shader: " << task << " error:";
- LOG(ERROR) << "===== shader string ====";
-
- stringstream stream(code);
- string partial;
-
- int line = 1;
- while (getline(stream, partial, '\n')) {
- if (line < 10) {
- LOG(ERROR) << " " << line << " " << partial;
- }
- else {
- LOG(ERROR) << line << " " << partial;
- }
- line++;
- }
- LOG(ERROR) << log;
-}
-
-static int bind_fallback_shader(void)
-{
- GLint status;
- GLchar log[5000];
- GLsizei length = 0;
- GLuint program = 0;
-
- struct Shader {
- const char *source;
- GLenum type;
- } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER},
- {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}};
-
- program = glCreateProgram();
-
- for (int i = 0; i < 2; i++) {
- GLuint shader = glCreateShader(shaders[i].type);
-
- string source_str = shaders[i].source;
- const char *c_str = source_str.c_str();
-
- glShaderSource(shader, 1, &c_str, NULL);
- glCompileShader(shader);
-
- glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
-
- if (!status) {
- glGetShaderInfoLog(shader, sizeof(log), &length, log);
- shader_print_errors("compile", log, c_str);
- return 0;
- }
-
- glAttachShader(program, shader);
- }
-
- /* Link output. */
- glBindFragDataLocation(program, 0, "fragColor");
-
- /* Link and error check. */
- glLinkProgram(program);
-
- glGetProgramiv(program, GL_LINK_STATUS, &status);
- if (!status) {
- glGetShaderInfoLog(program, sizeof(log), &length, log);
- shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER);
- shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER);
- return 0;
- }
-
- return program;
-}
-
-bool Device::bind_fallback_display_space_shader(const float width, const float height)
-{
- if (fallback_status == FALLBACK_SHADER_STATUS_ERROR) {
- return false;
- }
-
- if (fallback_status == FALLBACK_SHADER_STATUS_NONE) {
- fallback_shader_program = bind_fallback_shader();
- fallback_status = FALLBACK_SHADER_STATUS_ERROR;
-
- if (fallback_shader_program == 0) {
- return false;
- }
-
- glUseProgram(fallback_shader_program);
- image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture");
- if (image_texture_location < 0) {
- LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform.";
- return false;
- }
-
- fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen");
- if (fullscreen_location < 0) {
- LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform.";
- return false;
- }
-
- fallback_status = FALLBACK_SHADER_STATUS_SUCCESS;
- }
-
- /* Run this every time. */
- glUseProgram(fallback_shader_program);
- glUniform1i(image_texture_location, 0);
- glUniform2f(fullscreen_location, width, height);
- return true;
-}
-
-void Device::draw_pixels(device_memory &rgba,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params)
-{
- const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
-
- assert(rgba.type == MEM_PIXELS);
- mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1));
-
- GLuint texid;
- glActiveTexture(GL_TEXTURE0);
- glGenTextures(1, &texid);
- glBindTexture(GL_TEXTURE_2D, texid);
-
- if (rgba.data_type == TYPE_HALF) {
- GLhalf *data_pointer = (GLhalf *)rgba.host_pointer;
- data_pointer += 4 * y * w;
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer);
- }
- else {
- uint8_t *data_pointer = (uint8_t *)rgba.host_pointer;
- data_pointer += 4 * y * w;
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer);
- }
-
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-
- if (transparent) {
- glEnable(GL_BLEND);
- glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
- }
-
- GLint shader_program;
- if (use_fallback_shader) {
- if (!bind_fallback_display_space_shader(dw, dh)) {
- return;
- }
- shader_program = fallback_shader_program;
- }
- else {
- draw_params.bind_display_space_shader_cb();
- glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
- }
-
- if (!vertex_buffer) {
- glGenBuffers(1, &vertex_buffer);
- }
-
- glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered
- */
- glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
- float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
- if (vpointer) {
- /* texture coordinate - vertex pair */
- vpointer[0] = 0.0f;
- vpointer[1] = 0.0f;
- vpointer[2] = dx;
- vpointer[3] = dy;
-
- vpointer[4] = 1.0f;
- vpointer[5] = 0.0f;
- vpointer[6] = (float)width + dx;
- vpointer[7] = dy;
-
- vpointer[8] = 1.0f;
- vpointer[9] = 1.0f;
- vpointer[10] = (float)width + dx;
- vpointer[11] = (float)height + dy;
-
- vpointer[12] = 0.0f;
- vpointer[13] = 1.0f;
- vpointer[14] = dx;
- vpointer[15] = (float)height + dy;
-
- if (vertex_buffer) {
- glUnmapBuffer(GL_ARRAY_BUFFER);
- }
- }
-
- GLuint vertex_array_object;
- GLuint position_attribute, texcoord_attribute;
-
- glGenVertexArrays(1, &vertex_array_object);
- glBindVertexArray(vertex_array_object);
-
- texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
- position_attribute = glGetAttribLocation(shader_program, "pos");
-
- glEnableVertexAttribArray(texcoord_attribute);
- glEnableVertexAttribArray(position_attribute);
-
- glVertexAttribPointer(
- texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
- glVertexAttribPointer(position_attribute,
- 2,
- GL_FLOAT,
- GL_FALSE,
- 4 * sizeof(float),
- (const GLvoid *)(sizeof(float) * 2));
-
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
- if (vertex_buffer) {
- glBindBuffer(GL_ARRAY_BUFFER, 0);
- }
-
- if (use_fallback_shader) {
- glUseProgram(0);
- }
- else {
- draw_params.unbind_display_space_shader_cb();
- }
-
- glDeleteVertexArrays(1, &vertex_array_object);
- glBindTexture(GL_TEXTURE_2D, 0);
- glDeleteTextures(1, &texid);
-
- if (transparent) {
- glDisable(GL_BLEND);
- }
}
void Device::build_bvh(BVH *bvh, Progress &progress, bool refit)
@@ -379,14 +68,14 @@ void Device::build_bvh(BVH *bvh, Progress &progress, bool refit)
}
}
-Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
#ifdef WITH_MULTI
if (!info.multi_devices.empty()) {
/* Always create a multi device when info contains multiple devices.
* This is done so that the type can still be e.g. DEVICE_CPU to indicate
* that it is a homogeneous collection of devices, which simplifies checks. */
- return device_multi_create(info, stats, profiler, background);
+ return device_multi_create(info, stats, profiler);
}
#endif
@@ -394,29 +83,18 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
switch (info.type) {
case DEVICE_CPU:
- device = device_cpu_create(info, stats, profiler, background);
+ device = device_cpu_create(info, stats, profiler);
break;
#ifdef WITH_CUDA
case DEVICE_CUDA:
if (device_cuda_init())
- device = device_cuda_create(info, stats, profiler, background);
+ device = device_cuda_create(info, stats, profiler);
break;
#endif
#ifdef WITH_OPTIX
case DEVICE_OPTIX:
if (device_optix_init())
- device = device_optix_create(info, stats, profiler, background);
- break;
-#endif
-#ifdef WITH_NETWORK
- case DEVICE_NETWORK:
- device = device_network_create(info, stats, profiler, "127.0.0.1");
- break;
-#endif
-#ifdef WITH_OPENCL
- case DEVICE_OPENCL:
- if (device_opencl_init())
- device = device_opencl_create(info, stats, profiler, background);
+ device = device_optix_create(info, stats, profiler);
break;
#endif
default:
@@ -424,7 +102,7 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
}
if (device == NULL) {
- device = device_dummy_create(info, stats, profiler, background);
+ device = device_dummy_create(info, stats, profiler);
}
return device;
@@ -438,10 +116,6 @@ DeviceType Device::type_from_string(const char *name)
return DEVICE_CUDA;
else if (strcmp(name, "OPTIX") == 0)
return DEVICE_OPTIX;
- else if (strcmp(name, "OPENCL") == 0)
- return DEVICE_OPENCL;
- else if (strcmp(name, "NETWORK") == 0)
- return DEVICE_NETWORK;
else if (strcmp(name, "MULTI") == 0)
return DEVICE_MULTI;
@@ -456,10 +130,6 @@ string Device::string_from_type(DeviceType type)
return "CUDA";
else if (type == DEVICE_OPTIX)
return "OPTIX";
- else if (type == DEVICE_OPENCL)
- return "OPENCL";
- else if (type == DEVICE_NETWORK)
- return "NETWORK";
else if (type == DEVICE_MULTI)
return "MULTI";
@@ -476,12 +146,6 @@ vector<DeviceType> Device::available_types()
#ifdef WITH_OPTIX
types.push_back(DEVICE_OPTIX);
#endif
-#ifdef WITH_OPENCL
- types.push_back(DEVICE_OPENCL);
-#endif
-#ifdef WITH_NETWORK
- types.push_back(DEVICE_NETWORK);
-#endif
return types;
}
@@ -493,20 +157,6 @@ vector<DeviceInfo> Device::available_devices(uint mask)
thread_scoped_lock lock(device_mutex);
vector<DeviceInfo> devices;
-#ifdef WITH_OPENCL
- if (mask & DEVICE_MASK_OPENCL) {
- if (!(devices_initialized_mask & DEVICE_MASK_OPENCL)) {
- if (device_opencl_init()) {
- device_opencl_info(opencl_devices);
- }
- devices_initialized_mask |= DEVICE_MASK_OPENCL;
- }
- foreach (DeviceInfo &info, opencl_devices) {
- devices.push_back(info);
- }
- }
-#endif
-
#if defined(WITH_CUDA) || defined(WITH_OPTIX)
if (mask & (DEVICE_MASK_CUDA | DEVICE_MASK_OPTIX)) {
if (!(devices_initialized_mask & DEVICE_MASK_CUDA)) {
@@ -547,18 +197,6 @@ vector<DeviceInfo> Device::available_devices(uint mask)
}
}
-#ifdef WITH_NETWORK
- if (mask & DEVICE_MASK_NETWORK) {
- if (!(devices_initialized_mask & DEVICE_MASK_NETWORK)) {
- device_network_info(network_devices);
- devices_initialized_mask |= DEVICE_MASK_NETWORK;
- }
- foreach (DeviceInfo &info, network_devices) {
- devices.push_back(info);
- }
- }
-#endif
-
return devices;
}
@@ -580,15 +218,6 @@ string Device::device_capabilities(uint mask)
capabilities += device_cpu_capabilities() + "\n";
}
-#ifdef WITH_OPENCL
- if (mask & DEVICE_MASK_OPENCL) {
- if (device_opencl_init()) {
- capabilities += "\nOpenCL device capabilities:\n";
- capabilities += device_opencl_capabilities();
- }
- }
-#endif
-
#ifdef WITH_CUDA
if (mask & DEVICE_MASK_CUDA) {
if (device_cuda_init()) {
@@ -613,16 +242,13 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
}
DeviceInfo info;
- info.type = subdevices.front().type;
+ info.type = DEVICE_NONE;
info.id = "MULTI";
info.description = "Multi Device";
info.num = 0;
info.has_half_images = true;
info.has_nanovdb = true;
- info.has_volume_decoupled = true;
- info.has_branched_path = true;
- info.has_adaptive_stop_per_sample = true;
info.has_osl = true;
info.has_profiling = true;
info.has_peer_memory = false;
@@ -660,16 +286,16 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.id += device.id;
/* Set device type to MULTI if subdevices are not of a common type. */
- if (device.type != info.type) {
+ if (info.type == DEVICE_NONE) {
+ info.type = device.type;
+ }
+ else if (device.type != info.type) {
info.type = DEVICE_MULTI;
}
/* Accumulate device info. */
info.has_half_images &= device.has_half_images;
info.has_nanovdb &= device.has_nanovdb;
- info.has_volume_decoupled &= device.has_volume_decoupled;
- info.has_branched_path &= device.has_branched_path;
- info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample;
info.has_osl &= device.has_osl;
info.has_profiling &= device.has_profiling;
info.has_peer_memory |= device.has_peer_memory;
@@ -689,60 +315,32 @@ void Device::free_memory()
devices_initialized_mask = 0;
cuda_devices.free_memory();
optix_devices.free_memory();
- opencl_devices.free_memory();
cpu_devices.free_memory();
- network_devices.free_memory();
}
-/* DeviceInfo */
-
-void DeviceInfo::add_denoising_devices(DenoiserType denoiser_type)
+unique_ptr<DeviceQueue> Device::gpu_queue_create()
{
- assert(denoising_devices.empty());
-
- if (denoiser_type == DENOISER_OPTIX && type != DEVICE_OPTIX) {
- vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX);
- if (!optix_devices.empty()) {
- /* Convert to a special multi device with separate denoising devices. */
- if (multi_devices.empty()) {
- multi_devices.push_back(*this);
- }
-
- /* Try to use the same physical devices for denoising. */
- for (const DeviceInfo &cuda_device : multi_devices) {
- if (cuda_device.type == DEVICE_CUDA) {
- for (const DeviceInfo &optix_device : optix_devices) {
- if (cuda_device.num == optix_device.num) {
- id += optix_device.id;
- denoising_devices.push_back(optix_device);
- break;
- }
- }
- }
- }
-
- if (denoising_devices.empty()) {
- /* Simply use the first available OptiX device. */
- const DeviceInfo optix_device = optix_devices.front();
- id += optix_device.id; /* Uniquely identify this special multi device. */
- denoising_devices.push_back(optix_device);
- }
+ LOG(FATAL) << "Device does not support queues.";
+ return nullptr;
+}
- denoisers = denoiser_type;
- }
- }
- else if (denoiser_type == DENOISER_OPENIMAGEDENOISE && type != DEVICE_CPU) {
- /* Convert to a special multi device with separate denoising devices. */
- if (multi_devices.empty()) {
- multi_devices.push_back(*this);
- }
+const CPUKernels *Device::get_cpu_kernels() const
+{
+ LOG(FATAL) << "Device does not support CPU kernels.";
+ return nullptr;
+}
- /* Add CPU denoising devices. */
- DeviceInfo cpu_device = Device::available_devices(DEVICE_MASK_CPU).front();
- denoising_devices.push_back(cpu_device);
+void Device::get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/)
+{
+ LOG(FATAL) << "Device does not support CPU kernels.";
+}
- denoisers = denoiser_type;
- }
+void *Device::get_cpu_osl_memory()
+{
+ return nullptr;
}
+/* DeviceInfo */
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index ecf79bcdfa6..399d5eb91df 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -21,31 +21,34 @@
#include "bvh/bvh_params.h"
+#include "device/device_denoise.h"
#include "device/device_memory.h"
-#include "device/device_task.h"
+#include "util/util_function.h"
#include "util/util_list.h"
+#include "util/util_logging.h"
#include "util/util_stats.h"
#include "util/util_string.h"
#include "util/util_texture.h"
#include "util/util_thread.h"
#include "util/util_types.h"
+#include "util/util_unique_ptr.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
class BVH;
+class DeviceQueue;
class Progress;
-class RenderTile;
+class CPUKernels;
+class CPUKernelThreadGlobals;
/* Device Types */
enum DeviceType {
DEVICE_NONE = 0,
DEVICE_CPU,
- DEVICE_OPENCL,
DEVICE_CUDA,
- DEVICE_NETWORK,
DEVICE_MULTI,
DEVICE_OPTIX,
DEVICE_DUMMY,
@@ -53,20 +56,11 @@ enum DeviceType {
enum DeviceTypeMask {
DEVICE_MASK_CPU = (1 << DEVICE_CPU),
- DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL),
DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
- DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK),
DEVICE_MASK_ALL = ~0
};
-enum DeviceKernelStatus {
- DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE,
- DEVICE_KERNEL_USING_FEATURE_KERNEL,
- DEVICE_KERNEL_FEATURE_KERNEL_INVALID,
- DEVICE_KERNEL_UNKNOWN,
-};
-
#define DEVICE_MASK(type) (DeviceTypeMask)(1 << type)
class DeviceInfo {
@@ -75,20 +69,16 @@ class DeviceInfo {
string description;
string id; /* used for user preferences, should stay fixed with changing hardware config */
int num;
- bool display_device; /* GPU is used as a display device. */
- bool has_half_images; /* Support half-float textures. */
- bool has_nanovdb; /* Support NanoVDB volumes. */
- bool has_volume_decoupled; /* Decoupled volume shading. */
- bool has_branched_path; /* Supports branched path tracing. */
- bool has_adaptive_stop_per_sample; /* Per-sample adaptive sampling stopping. */
- bool has_osl; /* Support Open Shading Language. */
- bool use_split_kernel; /* Use split or mega kernel. */
- bool has_profiling; /* Supports runtime collection of profiling info. */
- bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
- DenoiserTypeMask denoisers; /* Supported denoiser types. */
+ bool display_device; /* GPU is used as a display device. */
+ bool has_nanovdb; /* Support NanoVDB volumes. */
+ bool has_half_images; /* Support half-float textures. */
+ bool has_osl; /* Support Open Shading Language. */
+ bool has_profiling; /* Supports runtime collection of profiling info. */
+ bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
+ bool has_gpu_queue; /* Device supports GPU queue. */
+ DenoiserTypeMask denoisers; /* Supported denoiser types. */
int cpu_threads;
vector<DeviceInfo> multi_devices;
- vector<DeviceInfo> denoising_devices;
string error_msg;
DeviceInfo()
@@ -100,227 +90,35 @@ class DeviceInfo {
display_device = false;
has_half_images = false;
has_nanovdb = false;
- has_volume_decoupled = false;
- has_branched_path = true;
- has_adaptive_stop_per_sample = false;
has_osl = false;
- use_split_kernel = false;
has_profiling = false;
has_peer_memory = false;
+ has_gpu_queue = false;
denoisers = DENOISER_NONE;
}
- bool operator==(const DeviceInfo &info)
+ bool operator==(const DeviceInfo &info) const
{
/* Multiple Devices with the same ID would be very bad. */
assert(id != info.id ||
(type == info.type && num == info.num && description == info.description));
return id == info.id;
}
-
- /* Add additional devices needed for the specified denoiser. */
- void add_denoising_devices(DenoiserType denoiser_type);
-};
-
-class DeviceRequestedFeatures {
- public:
- /* Use experimental feature set. */
- bool experimental;
-
- /* Selective nodes compilation. */
-
- /* Identifier of a node group up to which all the nodes needs to be
- * compiled in. Nodes from higher group indices will be ignores.
- */
- int max_nodes_group;
-
- /* Features bitfield indicating which features from the requested group
- * will be compiled in. Nodes which corresponds to features which are not
- * in this bitfield will be ignored even if they're in the requested group.
- */
- int nodes_features;
-
- /* BVH/sampling kernel features. */
- bool use_hair;
- bool use_hair_thick;
- bool use_object_motion;
- bool use_camera_motion;
-
- /* Denotes whether baking functionality is needed. */
- bool use_baking;
-
- /* Use subsurface scattering materials. */
- bool use_subsurface;
-
- /* Use volume materials. */
- bool use_volume;
-
- /* Use branched integrator. */
- bool use_integrator_branched;
-
- /* Use OpenSubdiv patch evaluation */
- bool use_patch_evaluation;
-
- /* Use Transparent shadows */
- bool use_transparent;
-
- /* Use various shadow tricks, such as shadow catcher. */
- bool use_shadow_tricks;
-
- /* Per-uber shader usage flags. */
- bool use_principled;
-
- /* Denoising features. */
- bool use_denoising;
-
- /* Use raytracing in shaders. */
- bool use_shader_raytrace;
-
- /* Use true displacement */
- bool use_true_displacement;
-
- /* Use background lights */
- bool use_background_light;
-
- DeviceRequestedFeatures()
- {
- /* TODO(sergey): Find more meaningful defaults. */
- max_nodes_group = 0;
- nodes_features = 0;
- use_hair = false;
- use_hair_thick = false;
- use_object_motion = false;
- use_camera_motion = false;
- use_baking = false;
- use_subsurface = false;
- use_volume = false;
- use_integrator_branched = false;
- use_patch_evaluation = false;
- use_transparent = false;
- use_shadow_tricks = false;
- use_principled = false;
- use_denoising = false;
- use_shader_raytrace = false;
- use_true_displacement = false;
- use_background_light = false;
- }
-
- bool modified(const DeviceRequestedFeatures &requested_features)
- {
- return !(max_nodes_group == requested_features.max_nodes_group &&
- nodes_features == requested_features.nodes_features &&
- use_hair == requested_features.use_hair &&
- use_hair_thick == requested_features.use_hair_thick &&
- use_object_motion == requested_features.use_object_motion &&
- use_camera_motion == requested_features.use_camera_motion &&
- use_baking == requested_features.use_baking &&
- use_subsurface == requested_features.use_subsurface &&
- use_volume == requested_features.use_volume &&
- use_integrator_branched == requested_features.use_integrator_branched &&
- use_patch_evaluation == requested_features.use_patch_evaluation &&
- use_transparent == requested_features.use_transparent &&
- use_shadow_tricks == requested_features.use_shadow_tricks &&
- use_principled == requested_features.use_principled &&
- use_denoising == requested_features.use_denoising &&
- use_shader_raytrace == requested_features.use_shader_raytrace &&
- use_true_displacement == requested_features.use_true_displacement &&
- use_background_light == requested_features.use_background_light);
- }
-
- /* Convert the requested features structure to a build options,
- * which could then be passed to compilers.
- */
- string get_build_options() const
- {
- string build_options = "";
- if (experimental) {
- build_options += "-D__KERNEL_EXPERIMENTAL__ ";
- }
- build_options += "-D__NODES_MAX_GROUP__=" + string_printf("%d", max_nodes_group);
- build_options += " -D__NODES_FEATURES__=" + string_printf("%d", nodes_features);
- if (!use_hair) {
- build_options += " -D__NO_HAIR__";
- }
- if (!use_object_motion) {
- build_options += " -D__NO_OBJECT_MOTION__";
- }
- if (!use_camera_motion) {
- build_options += " -D__NO_CAMERA_MOTION__";
- }
- if (!use_baking) {
- build_options += " -D__NO_BAKING__";
- }
- if (!use_volume) {
- build_options += " -D__NO_VOLUME__";
- }
- if (!use_subsurface) {
- build_options += " -D__NO_SUBSURFACE__";
- }
- if (!use_integrator_branched) {
- build_options += " -D__NO_BRANCHED_PATH__";
- }
- if (!use_patch_evaluation) {
- build_options += " -D__NO_PATCH_EVAL__";
- }
- if (!use_transparent && !use_volume) {
- build_options += " -D__NO_TRANSPARENT__";
- }
- if (!use_shadow_tricks) {
- build_options += " -D__NO_SHADOW_TRICKS__";
- }
- if (!use_principled) {
- build_options += " -D__NO_PRINCIPLED__";
- }
- if (!use_denoising) {
- build_options += " -D__NO_DENOISING__";
- }
- if (!use_shader_raytrace) {
- build_options += " -D__NO_SHADER_RAYTRACE__";
- }
- return build_options;
- }
};
-std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features);
-
/* Device */
-struct DeviceDrawParams {
- function<void()> bind_display_space_shader_cb;
- function<void()> unbind_display_space_shader_cb;
-};
-
class Device {
friend class device_sub_ptr;
protected:
- enum {
- FALLBACK_SHADER_STATUS_NONE = 0,
- FALLBACK_SHADER_STATUS_ERROR,
- FALLBACK_SHADER_STATUS_SUCCESS,
- };
-
- Device(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background)
- : background(background),
- vertex_buffer(0),
- fallback_status(FALLBACK_SHADER_STATUS_NONE),
- fallback_shader_program(0),
- info(info_),
- stats(stats_),
- profiler(profiler_)
+ Device(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : info(info_), stats(stats_), profiler(profiler_)
{
}
- bool background;
string error_msg;
- /* used for real time display */
- unsigned int vertex_buffer;
- int fallback_status, fallback_shader_program;
- int image_texture_location, fullscreen_location;
-
- bool bind_fallback_display_space_shader(const float width, const float height);
-
virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/)
{
/* Only required for devices that implement denoising. */
@@ -361,67 +159,31 @@ class Device {
Stats &stats;
Profiler &profiler;
- /* memory alignment */
- virtual int mem_sub_ptr_alignment()
- {
- return MIN_ALIGNMENT_CPU_DATA_TYPES;
- }
-
/* constant memory */
virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
- /* open shading language, only for CPU device */
- virtual void *osl_memory()
- {
- return NULL;
- }
-
/* load/compile kernels, must be called before adding tasks */
- virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/)
+ virtual bool load_kernels(uint /*kernel_features*/)
{
return true;
}
- /* Wait for device to become available to upload data and receive tasks
- * This method is used by the OpenCL device to load the
- * optimized kernels or when not (yet) available load the
- * generic kernels (only during foreground rendering) */
- virtual bool wait_for_availability(const DeviceRequestedFeatures & /*requested_features*/)
- {
- return true;
- }
- /* Check if there are 'better' kernels available to be used
- * We can switch over to these kernels
- * This method is used to determine if we can switch the preview kernels
- * to regular kernels */
- virtual DeviceKernelStatus get_active_kernel_switch_state()
- {
- return DEVICE_KERNEL_USING_FEATURE_KERNEL;
- }
+  /* GPU-device-only functions.
+   * These may not be used on the CPU or on multi-devices. */
- /* tasks */
- virtual int get_split_task_count(DeviceTask &)
- {
- return 1;
- }
+ /* Create new queue for executing kernels in. */
+ virtual unique_ptr<DeviceQueue> gpu_queue_create();
+
+  /* CPU-device-only functions.
+   * These may not be used on the GPU or on multi-devices. */
- virtual void task_add(DeviceTask &task) = 0;
- virtual void task_wait() = 0;
- virtual void task_cancel() = 0;
-
- /* opengl drawing */
- virtual void draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params);
+ /* Get CPU kernel functions for native instruction set. */
+ virtual const CPUKernels *get_cpu_kernels() const;
+ /* Get kernel globals to pass to kernels. */
+ virtual void get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
+ /* Get OpenShadingLanguage memory buffer. */
+ virtual void *get_cpu_osl_memory();
/* acceleration structure building */
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit);
@@ -429,25 +191,11 @@ class Device {
/* OptiX specific destructor. */
virtual void release_optix_bvh(BVH * /*bvh*/){};
-#ifdef WITH_NETWORK
- /* networking */
- void server_run();
-#endif
-
/* multi device */
- virtual void map_tile(Device * /*sub_device*/, RenderTile & /*tile*/)
- {
- }
virtual int device_number(Device * /*sub_device*/)
{
return 0;
}
- virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/)
- {
- }
- virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/)
- {
- }
virtual bool is_resident(device_ptr /*key*/, Device *sub_device)
{
@@ -460,11 +208,47 @@ class Device {
return false;
}
+ /* Graphics resources interoperability.
+ *
+   * Interoperability here means that the device is capable of computing its result
+   * directly into an OpenGL (or other graphics library) buffer. */
+
+  /* Check whether the display is to be updated using graphics interoperability.
+   * Interoperability cannot be used if it is not supported by the device, and the
+   * device might also force-disable it if it detects that it would be slower than
+   * copying pixels from the render buffer. */
+ virtual bool should_use_graphics_interop()
+ {
+ return false;
+ }
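+
+  /* Illustrative sketch (not actual usage in this file), assuming a queue with
+   * graphics interop support:
+   *
+   *   unique_ptr<DeviceGraphicsInterop> interop = queue->graphics_interop_create();
+   *   interop->set_destination(destination);
+   *   device_ptr d_pixels = interop->map();
+   *   // ... enqueue kernels that write into d_pixels ...
+   *   interop->unmap();
+   */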
+
+ /* Buffer denoising. */
+
+ /* Returns true if task is fully handled. */
+ virtual bool denoise_buffer(const DeviceDenoiseTask & /*task*/)
+ {
+ LOG(ERROR) << "Request buffer denoising from a device which does not support it.";
+ return false;
+ }
+
+ virtual DeviceQueue *get_denoise_queue()
+ {
+ LOG(ERROR) << "Request denoising queue from a device which does not support it.";
+ return nullptr;
+ }
+
+ /* Sub-devices */
+
+  /* Run the given callback for every individual device that will be handling rendering.
+   * For a single device the callback is called for the device itself; for a multi-device
+   * it is called only for the sub-devices. */
+ virtual void foreach_device(const function<void(Device *)> &callback)
+ {
+ callback(this);
+ }
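+  /* For example, to count the devices that will participate in rendering:
+   *
+   *   int num_devices = 0;
+   *   device->foreach_device([&num_devices](Device *) { num_devices++; });
+   */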
+
/* static */
- static Device *create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background = true);
+ static Device *create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
static DeviceType type_from_string(const char *name);
static string string_from_type(DeviceType type);
@@ -499,9 +283,7 @@ class Device {
static thread_mutex device_mutex;
static vector<DeviceInfo> cuda_devices;
static vector<DeviceInfo> optix_devices;
- static vector<DeviceInfo> opencl_devices;
static vector<DeviceInfo> cpu_devices;
- static vector<DeviceInfo> network_devices;
static uint devices_initialized_mask;
};
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
deleted file mode 100644
index 4a6e77d6eaa..00000000000
--- a/intern/cycles/device/device_cpu.cpp
+++ /dev/null
@@ -1,1680 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-/* So ImathMath is included before our kernel_cpu_compat. */
-#ifdef WITH_OSL
-/* So no context pollution happens from indirectly included windows.h */
-# include "util/util_windows.h"
-# include <OSL/oslexec.h>
-#endif
-
-#ifdef WITH_EMBREE
-# include <embree3/rtcore.h>
-#endif
-
-#include "device/device.h"
-#include "device/device_denoising.h"
-#include "device/device_intern.h"
-#include "device/device_split_kernel.h"
-
-// clang-format off
-#include "kernel/kernel.h"
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_adaptive_sampling.h"
-
-#include "kernel/filter/filter.h"
-
-#include "kernel/osl/osl_shader.h"
-#include "kernel/osl/osl_globals.h"
-// clang-format on
-
-#include "bvh/bvh_embree.h"
-
-#include "render/buffers.h"
-#include "render/coverage.h"
-
-#include "util/util_debug.h"
-#include "util/util_foreach.h"
-#include "util/util_function.h"
-#include "util/util_logging.h"
-#include "util/util_map.h"
-#include "util/util_opengl.h"
-#include "util/util_openimagedenoise.h"
-#include "util/util_optimization.h"
-#include "util/util_progress.h"
-#include "util/util_system.h"
-#include "util/util_task.h"
-#include "util/util_thread.h"
-
-CCL_NAMESPACE_BEGIN
-
-class CPUDevice;
-
-/* Has to be outside of the class to be shared across template instantiations. */
-static const char *logged_architecture = "";
-
-template<typename F> class KernelFunctions {
- public:
- KernelFunctions()
- {
- kernel = (F)NULL;
- }
-
- KernelFunctions(
- F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2)
- {
- const char *architecture_name = "default";
- kernel = kernel_default;
-
- /* Silence potential warnings about unused variables
- * when compiling without some architectures. */
- (void)kernel_sse2;
- (void)kernel_sse3;
- (void)kernel_sse41;
- (void)kernel_avx;
- (void)kernel_avx2;
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
- architecture_name = "AVX2";
- kernel = kernel_avx2;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
- architecture_name = "AVX";
- kernel = kernel_avx;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
- architecture_name = "SSE4.1";
- kernel = kernel_sse41;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
- architecture_name = "SSE3";
- kernel = kernel_sse3;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
- architecture_name = "SSE2";
- kernel = kernel_sse2;
- }
-#else
- {
-      /* Empty fallback branch, so that the architecture if-chain above stays
-       * well-formed when WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 is not defined. */
- }
-#endif
-
- if (strcmp(architecture_name, logged_architecture) != 0) {
- VLOG(1) << "Will be using " << architecture_name << " kernels.";
- logged_architecture = architecture_name;
- }
- }
-
- inline F operator()() const
- {
- assert(kernel);
- return kernel;
- }
-
- protected:
- F kernel;
-};
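-
-/* Usage example: each kernel entry point is registered once per architecture and
- * the best available implementation is picked at construction time, e.g.
- *
- *   KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)>
- *       path_trace_kernel(KERNEL_FUNCTIONS(path_trace));
- *   path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
- */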
-
-class CPUSplitKernel : public DeviceSplitKernel {
- CPUDevice *device;
-
- public:
- explicit CPUSplitKernel(CPUDevice *device);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs);
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
-};
-
-class CPUDevice : public Device {
- public:
- TaskPool task_pool;
- KernelGlobals kernel_globals;
-
- device_vector<TextureInfo> texture_info;
- bool need_texture_info;
-
-#ifdef WITH_OSL
- OSLGlobals osl_globals;
-#endif
-#ifdef WITH_OPENIMAGEDENOISE
- oidn::DeviceRef oidn_device;
- oidn::FilterRef oidn_filter;
-#endif
- thread_spin_lock oidn_task_lock;
-#ifdef WITH_EMBREE
- RTCScene embree_scene = NULL;
- RTCDevice embree_device;
-#endif
-
- bool use_split_kernel;
-
- DeviceRequestedFeatures requested_features;
-
- KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
- convert_to_half_float_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
- convert_to_byte_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)>
- shader_kernel;
- KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> bake_kernel;
-
- KernelFunctions<void (*)(
- int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)>
- filter_divide_shadow_kernel;
- KernelFunctions<void (*)(
- int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)>
- filter_get_feature_kernel;
- KernelFunctions<void (*)(int, int, int, int *, float *, float *, int, int *)>
- filter_write_feature_kernel;
- KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
- filter_detect_outliers_kernel;
- KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
- filter_combine_halves_kernel;
-
- KernelFunctions<void (*)(
- int, int, float *, float *, float *, float *, int *, int, int, int, float, float)>
- filter_nlm_calc_difference_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel;
- KernelFunctions<void (*)(
- int, int, float *, float *, float *, float *, float *, int *, int, int, int)>
- filter_nlm_update_output_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int)> filter_nlm_normalize_kernel;
-
- KernelFunctions<void (*)(
- float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)>
- filter_construct_transform_kernel;
- KernelFunctions<void (*)(int,
- int,
- int,
- float *,
- float *,
- float *,
- int *,
- float *,
- float3 *,
- int *,
- int *,
- int,
- int,
- int,
- int,
- bool)>
- filter_nlm_construct_gramian_kernel;
- KernelFunctions<void (*)(int, int, int, float *, int *, float *, float3 *, int *, int)>
- filter_finalize_kernel;
-
- KernelFunctions<void (*)(KernelGlobals *,
- ccl_constant KernelData *,
- ccl_global void *,
- int,
- ccl_global char *,
- int,
- int,
- int,
- int,
- int,
- int,
- int,
- int,
- ccl_global int *,
- int,
- ccl_global char *,
- ccl_global unsigned int *,
- unsigned int,
- ccl_global float *)>
- data_init_kernel;
- unordered_map<string, KernelFunctions<void (*)(KernelGlobals *, KernelData *)>> split_kernels;
-
-#define KERNEL_FUNCTIONS(name) \
- KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
- KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \
- KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
-
- CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : Device(info_, stats_, profiler_, background_),
- texture_info(this, "__texture_info", MEM_GLOBAL),
-#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name))
- REGISTER_KERNEL(path_trace),
- REGISTER_KERNEL(convert_to_half_float),
- REGISTER_KERNEL(convert_to_byte),
- REGISTER_KERNEL(shader),
- REGISTER_KERNEL(bake),
- REGISTER_KERNEL(filter_divide_shadow),
- REGISTER_KERNEL(filter_get_feature),
- REGISTER_KERNEL(filter_write_feature),
- REGISTER_KERNEL(filter_detect_outliers),
- REGISTER_KERNEL(filter_combine_halves),
- REGISTER_KERNEL(filter_nlm_calc_difference),
- REGISTER_KERNEL(filter_nlm_blur),
- REGISTER_KERNEL(filter_nlm_calc_weight),
- REGISTER_KERNEL(filter_nlm_update_output),
- REGISTER_KERNEL(filter_nlm_normalize),
- REGISTER_KERNEL(filter_construct_transform),
- REGISTER_KERNEL(filter_nlm_construct_gramian),
- REGISTER_KERNEL(filter_finalize),
- REGISTER_KERNEL(data_init)
-#undef REGISTER_KERNEL
- {
- if (info.cpu_threads == 0) {
- info.cpu_threads = TaskScheduler::num_threads();
- }
-
-#ifdef WITH_OSL
- kernel_globals.osl = &osl_globals;
-#endif
-#ifdef WITH_EMBREE
- embree_device = rtcNewDevice("verbose=0");
-#endif
- use_split_kernel = DebugFlags().cpu.split_kernel;
- if (use_split_kernel) {
- VLOG(1) << "Will be using split kernel.";
- }
- need_texture_info = false;
-
-#define REGISTER_SPLIT_KERNEL(name) \
- split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \
- KERNEL_FUNCTIONS(name))
- REGISTER_SPLIT_KERNEL(path_init);
- REGISTER_SPLIT_KERNEL(scene_intersect);
- REGISTER_SPLIT_KERNEL(lamp_emission);
- REGISTER_SPLIT_KERNEL(do_volume);
- REGISTER_SPLIT_KERNEL(queue_enqueue);
- REGISTER_SPLIT_KERNEL(indirect_background);
- REGISTER_SPLIT_KERNEL(shader_setup);
- REGISTER_SPLIT_KERNEL(shader_sort);
- REGISTER_SPLIT_KERNEL(shader_eval);
- REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
- REGISTER_SPLIT_KERNEL(subsurface_scatter);
- REGISTER_SPLIT_KERNEL(direct_lighting);
- REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
- REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
- REGISTER_SPLIT_KERNEL(enqueue_inactive);
- REGISTER_SPLIT_KERNEL(next_iteration_setup);
- REGISTER_SPLIT_KERNEL(indirect_subsurface);
- REGISTER_SPLIT_KERNEL(buffer_update);
- REGISTER_SPLIT_KERNEL(adaptive_stopping);
- REGISTER_SPLIT_KERNEL(adaptive_filter_x);
- REGISTER_SPLIT_KERNEL(adaptive_filter_y);
- REGISTER_SPLIT_KERNEL(adaptive_adjust_samples);
-#undef REGISTER_SPLIT_KERNEL
-#undef KERNEL_FUNCTIONS
- }
-
- ~CPUDevice()
- {
-#ifdef WITH_EMBREE
- rtcReleaseDevice(embree_device);
-#endif
- task_pool.cancel();
- texture_info.free();
- }
-
- virtual bool show_samples() const override
- {
- return (info.cpu_threads == 1);
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override
- {
- BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
-#ifdef WITH_EMBREE
- bvh_layout_mask |= BVH_LAYOUT_EMBREE;
-#endif /* WITH_EMBREE */
- return bvh_layout_mask;
- }
-
- void load_texture_info()
- {
- if (need_texture_info) {
- texture_info.copy_to_device();
- need_texture_info = false;
- }
- }
-
- virtual void mem_alloc(device_memory &mem) override
- {
- if (mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- }
- else if (mem.type == MEM_GLOBAL) {
- assert(!"mem_alloc not supported for global memory.");
- }
- else {
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
- size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
- void *data = util_aligned_malloc(mem.memory_size(), alignment);
- mem.device_pointer = (device_ptr)data;
- }
- else {
- mem.device_pointer = (device_ptr)mem.host_pointer;
- }
-
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
- }
- }
-
- virtual void mem_copy_to(device_memory &mem) override
- {
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else if (mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- /* copy is no-op */
- }
- }
-
- virtual void mem_copy_from(
- device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) override
- {
- /* no-op */
- }
-
- virtual void mem_zero(device_memory &mem) override
- {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- if (mem.device_pointer) {
- memset((void *)mem.device_pointer, 0, mem.memory_size());
- }
- }
-
- virtual void mem_free(device_memory &mem) override
- {
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else if (mem.device_pointer) {
- if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
- util_aligned_free((void *)mem.device_pointer);
- }
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-
- virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override
- {
- return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
- }
-
- virtual void const_copy_to(const char *name, void *host, size_t size) override
- {
-#ifdef WITH_EMBREE
- if (strcmp(name, "__data") == 0) {
- assert(size <= sizeof(KernelData));
-
-      /* Update scene handle (since it is different for each device on multi devices). */
- KernelData *const data = (KernelData *)host;
- data->bvh.scene = embree_scene;
- }
-#endif
- kernel_const_copy(&kernel_globals, name, host, size);
- }
-
- void global_alloc(device_memory &mem)
- {
- VLOG(1) << "Global memory allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
-
- mem.device_pointer = (device_ptr)mem.host_pointer;
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
- }
-
- void global_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-
- void tex_alloc(device_texture &mem)
- {
- VLOG(1) << "Texture allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- mem.device_pointer = (device_ptr)mem.host_pointer;
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
-
- const uint slot = mem.slot;
- if (slot >= texture_info.size()) {
-      /* Allocate some slots in advance, to reduce the number of re-allocations. */
- texture_info.resize(slot + 128);
- }
-
- texture_info[slot] = mem.info;
- texture_info[slot].data = (uint64_t)mem.host_pointer;
- need_texture_info = true;
- }
-
- void tex_free(device_texture &mem)
- {
- if (mem.device_pointer) {
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- need_texture_info = true;
- }
- }
-
- virtual void *osl_memory() override
- {
-#ifdef WITH_OSL
- return &osl_globals;
-#else
- return NULL;
-#endif
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
-#ifdef WITH_EMBREE
- if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE ||
- bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) {
- BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
- if (refit) {
- bvh_embree->refit(progress);
- }
- else {
- bvh_embree->build(progress, &stats, embree_device);
- }
-
- if (bvh->params.top_level) {
- embree_scene = bvh_embree->scene;
- }
- }
- else
-#endif
- Device::build_bvh(bvh, progress, refit);
- }
-
- void thread_run(DeviceTask &task)
- {
- if (task.type == DeviceTask::RENDER)
- thread_render(task);
- else if (task.type == DeviceTask::SHADER)
- thread_shader(task);
- else if (task.type == DeviceTask::FILM_CONVERT)
- thread_film_convert(task);
- else if (task.type == DeviceTask::DENOISE_BUFFER)
- thread_denoise(task);
- }
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS);
-
- int4 rect = task->rect;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int w = align_up(rect.z - rect.x, 4);
- int h = rect.w - rect.y;
- int stride = task->buffer.stride;
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
-
- float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
- float *blurDifference = temporary_mem;
- float *difference = temporary_mem + task->buffer.pass_stride;
- float *weightAccum = temporary_mem + 2 * task->buffer.pass_stride;
-
- memset(weightAccum, 0, sizeof(float) * w * h);
- memset((float *)out_ptr, 0, sizeof(float) * w * h);
-
- for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
- int dy = i / (2 * r + 1) - r;
- int dx = i % (2 * r + 1) - r;
-
- int local_rect[4] = {
- max(0, -dx), max(0, -dy), rect.z - rect.x - max(0, dx), rect.w - rect.y - max(0, dy)};
- filter_nlm_calc_difference_kernel()(dx,
- dy,
- (float *)guide_ptr,
- (float *)variance_ptr,
- NULL,
- difference,
- local_rect,
- w,
- channel_offset,
- 0,
- a,
- k_2);
-
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
- filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
-
- filter_nlm_update_output_kernel()(dx,
- dy,
- blurDifference,
- (float *)image_ptr,
- difference,
- (float *)out_ptr,
- weightAccum,
- local_rect,
- channel_offset,
- stride,
- f);
- }
-
- int local_rect[4] = {0, 0, rect.z - rect.x, rect.w - rect.y};
- filter_nlm_normalize_kernel()((float *)out_ptr, weightAccum, local_rect, w);
-
- return true;
- }
-
- bool denoising_construct_transform(DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM);
-
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_construct_transform_kernel()((float *)task->buffer.mem.device_pointer,
- task->tile_info,
- x + task->filter_area.x,
- y + task->filter_area.y,
- y * task->filter_area.z + x,
- (float *)task->storage.transform.device_pointer,
- (int *)task->storage.rank.device_pointer,
- &task->rect.x,
- task->buffer.pass_stride,
- task->buffer.frame_stride,
- task->buffer.use_time,
- task->radius,
- task->pca_threshold);
- }
- }
- return true;
- }
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
-
- float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
- float *difference = temporary_mem;
- float *blurDifference = temporary_mem + task->buffer.pass_stride;
-
- int r = task->radius;
- int frame_offset = frame * task->buffer.frame_stride;
- for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
- int dy = i / (2 * r + 1) - r;
- int dx = i % (2 * r + 1) - r;
-
- int local_rect[4] = {max(0, -dx),
- max(0, -dy),
- task->reconstruction_state.source_w - max(0, dx),
- task->reconstruction_state.source_h - max(0, dy)};
- filter_nlm_calc_difference_kernel()(dx,
- dy,
- (float *)color_ptr,
- (float *)color_variance_ptr,
- (float *)scale_ptr,
- difference,
- local_rect,
- task->buffer.stride,
- task->buffer.pass_stride,
- frame_offset,
- 1.0f,
- task->nlm_k_2);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
- filter_nlm_calc_weight_kernel()(
- blurDifference, difference, local_rect, task->buffer.stride, 4);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
- filter_nlm_construct_gramian_kernel()(dx,
- dy,
- task->tile_info->frames[frame],
- blurDifference,
- (float *)task->buffer.mem.device_pointer,
- (float *)task->storage.transform.device_pointer,
- (int *)task->storage.rank.device_pointer,
- (float *)task->storage.XtWX.device_pointer,
- (float3 *)task->storage.XtWY.device_pointer,
- local_rect,
- &task->reconstruction_state.filter_window.x,
- task->buffer.stride,
- 4,
- task->buffer.pass_stride,
- frame_offset,
- task->buffer.use_time);
- }
-
- return true;
- }
-
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task)
- {
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_finalize_kernel()(x,
- y,
- y * task->filter_area.z + x,
- (float *)output_ptr,
- (int *)task->storage.rank.device_pointer,
- (float *)task->storage.XtWX.device_pointer,
- (float3 *)task->storage.XtWY.device_pointer,
- &task->reconstruction_state.buffer_params.x,
- task->render_buffer.samples);
- }
- }
- return true;
- }
-
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES);
-
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = rect.x; x < rect.z; x++) {
- filter_combine_halves_kernel()(x,
- y,
- (float *)mean_ptr,
- (float *)variance_ptr,
- (float *)a_ptr,
- (float *)b_ptr,
- &rect.x,
- r);
- }
- }
- return true;
- }
-
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_divide_shadow_kernel()(task->render_buffer.samples,
- task->tile_info,
- x,
- y,
- (float *)a_ptr,
- (float *)b_ptr,
- (float *)sample_variance_ptr,
- (float *)sv_variance_ptr,
- (float *)buffer_variance_ptr,
- &task->rect.x,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- }
- }
- return true;
- }
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_get_feature_kernel()(task->render_buffer.samples,
- task->tile_info,
- mean_offset,
- variance_offset,
- x,
- y,
- (float *)mean_ptr,
- (float *)variance_ptr,
- scale,
- &task->rect.x,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- }
- }
- return true;
- }
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
- {
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_write_feature_kernel()(task->render_buffer.samples,
- x + task->filter_area.x,
- y + task->filter_area.y,
- &task->reconstruction_state.buffer_params.x,
- (float *)from_ptr,
- (float *)buffer_ptr,
- out_offset,
- &task->rect.x);
- }
- }
- return true;
- }
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_detect_outliers_kernel()(x,
- y,
- (float *)image_ptr,
- (float *)variance_ptr,
- (float *)depth_ptr,
- (float *)output_ptr,
- &task->rect.x,
- task->buffer.pass_stride);
- }
- }
- return true;
- }
-
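-  /* Run the adaptive-sampling filter over a tile. Returns true when the tile has
-   * fully converged, i.e. when neither the X nor the Y filter pass marked any pixel
-   * as remaining active. */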
- bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample)
- {
- WorkTile wtile;
- wtile.x = tile.x;
- wtile.y = tile.y;
- wtile.w = tile.w;
- wtile.h = tile.h;
- wtile.offset = tile.offset;
- wtile.stride = tile.stride;
- wtile.buffer = (float *)tile.buffer;
-
- /* For CPU we do adaptive stopping per sample so we can stop earlier, but
- * for combined CPU + GPU rendering we match the GPU and do it per tile
- * after a given number of sample steps. */
- if (!kernel_data.integrator.adaptive_stop_per_sample) {
- for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
- for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
- const int index = wtile.offset + x + y * wtile.stride;
- float *buffer = wtile.buffer + index * kernel_data.film.pass_stride;
- kernel_do_adaptive_stopping(kg, buffer, sample);
- }
- }
- }
-
- bool any = false;
- for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
- any |= kernel_do_adaptive_filter_x(kg, y, &wtile);
- }
- for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
- any |= kernel_do_adaptive_filter_y(kg, x, &wtile);
- }
- return (!any);
- }
-
- void adaptive_sampling_post(const RenderTile &tile, KernelGlobals *kg)
- {
- float *render_buffer = (float *)tile.buffer;
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- int index = tile.offset + x + y * tile.stride;
- ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
- if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
- buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
- float sample_multiplier = tile.sample / buffer[kernel_data.film.pass_sample_count];
- if (sample_multiplier != 1.0f) {
- kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
- }
- }
- else {
- kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f));
- }
- }
- }
- }
-
- void render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
- {
- const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
-
- scoped_timer timer(&tile.buffers->render_time);
-
- Coverage coverage(kg, tile);
- if (use_coverage) {
- coverage.init_path_trace();
- }
-
- float *render_buffer = (float *)tile.buffer;
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
-
- /* Needed for Embree. */
- SIMD_SET_FLUSH_TO_ZERO;
-
- for (int sample = start_sample; sample < end_sample; sample++) {
- if (task.get_cancel() || TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- break;
- }
-
- if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
- tile.stealing_state = RenderTile::WAS_STOLEN;
- break;
- }
-
- if (tile.task == RenderTile::PATH_TRACE) {
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- if (use_coverage) {
- coverage.init_pixel(x, y);
- }
- path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
- }
- }
- }
- else {
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- bake_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
- }
- }
- }
- tile.sample = sample + 1;
-
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
- const bool stop = adaptive_sampling_filter(kg, tile, sample);
- if (stop) {
- const int num_progress_samples = end_sample - sample;
- tile.sample = end_sample;
- task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
- break;
- }
- }
-
- task.update_progress(&tile, tile.w * tile.h);
- }
- if (use_coverage) {
- coverage.finalize();
- }
-
- if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
- adaptive_sampling_post(tile, kg);
- }
- }
-
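-  /* Denoise a rectangular region of an interleaved render buffer with
-   * OpenImageDenoise. The scale factor is used to normalize passes that are
-   * accumulated by sample count (albedo and normal). */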
- void denoise_openimagedenoise_buffer(DeviceTask &task,
- float *buffer,
- const size_t offset,
- const size_t stride,
- const size_t x,
- const size_t y,
- const size_t w,
- const size_t h,
- const float scale)
- {
-#ifdef WITH_OPENIMAGEDENOISE
- assert(openimagedenoise_supported());
-
-    /* Only run one instance at a time: OpenImageDenoise is itself multithreaded for
-     * full buffers, and for tiled rendering creating multiple devices and filters is
-     * slow and memory hungry as well.
- *
- * TODO: optimize tiled rendering case, by batching together denoising of many
- * tiles somehow? */
- static thread_mutex mutex;
- thread_scoped_lock lock(mutex);
-
- /* Create device and filter, cached for reuse. */
- if (!oidn_device) {
- oidn_device = oidn::newDevice();
- oidn_device.commit();
- }
- if (!oidn_filter) {
- oidn_filter = oidn_device.newFilter("RT");
- oidn_filter.set("hdr", true);
- oidn_filter.set("srgb", false);
- }
-
- /* Set images with appropriate stride for our interleaved pass storage. */
- struct {
- const char *name;
- const int offset;
- const bool scale;
- const bool use;
- array<float> scaled_buffer;
- } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true},
- {"albedo",
- task.pass_denoising_data + DENOISING_PASS_ALBEDO,
- true,
- task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO},
- {"normal",
- task.pass_denoising_data + DENOISING_PASS_NORMAL,
- true,
- task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL},
- {"output", 0, false, true},
-              {NULL, 0}};
-
- for (int i = 0; passes[i].name; i++) {
- if (!passes[i].use) {
- continue;
- }
-
- const int64_t pixel_offset = offset + x + y * stride;
- const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset);
- const int64_t pixel_stride = task.pass_stride;
- const int64_t row_stride = stride * pixel_stride;
-
- if (passes[i].scale && scale != 1.0f) {
-        /* Normalize the albedo and normal passes, as they are scaled by the number of
-         * samples. For the color pass OIDN performs auto-exposure, making this unnecessary. */
- array<float> &scaled_buffer = passes[i].scaled_buffer;
- scaled_buffer.resize(w * h * 3);
-
- for (int y = 0; y < h; y++) {
- const float *pass_row = buffer + buffer_offset + y * row_stride;
- float *scaled_row = scaled_buffer.data() + y * w * 3;
-
- for (int x = 0; x < w; x++) {
- scaled_row[x * 3 + 0] = pass_row[x * pixel_stride + 0] * scale;
- scaled_row[x * 3 + 1] = pass_row[x * pixel_stride + 1] * scale;
- scaled_row[x * 3 + 2] = pass_row[x * pixel_stride + 2] * scale;
- }
- }
-
- oidn_filter.setImage(
- passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0);
- }
- else {
- oidn_filter.setImage(passes[i].name,
- buffer + buffer_offset,
- oidn::Format::Float3,
- w,
- h,
- 0,
- pixel_stride * sizeof(float),
- row_stride * sizeof(float));
- }
- }
-
- /* Execute filter. */
- oidn_filter.commit();
- oidn_filter.execute();
-#else
- (void)task;
- (void)buffer;
- (void)offset;
- (void)stride;
- (void)x;
- (void)y;
- (void)w;
- (void)h;
- (void)scale;
-#endif
- }
-
- void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
- {
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- /* Copy pixels from compute device to CPU (no-op for CPU device). */
- rtile.buffers->buffer.copy_from_device();
-
- denoise_openimagedenoise_buffer(task,
- (float *)rtile.buffer,
- rtile.offset,
- rtile.stride,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- 1.0f / rtile.sample);
-
-      /* TODO: it may be possible to avoid this copy, but we have to ensure that
- * when other code copies data from the device it doesn't overwrite the
- * denoiser buffers. */
- rtile.buffers->buffer.copy_to_device();
- }
- else {
- /* Per-tile denoising. */
- rtile.sample = rtile.start_sample + rtile.num_samples;
- const float scale = 1.0f / rtile.sample;
- const float invscale = rtile.sample;
- const size_t pass_stride = task.pass_stride;
-
- /* Map neighboring tiles into one buffer for denoising. */
- RenderTileNeighbors neighbors(rtile);
- task.map_neighbor_tiles(neighbors, this);
- RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
- rtile = center_tile;
-
- /* Calculate size of the tile to denoise (including overlap). The overlap
- * size was chosen empirically. OpenImageDenoise specifies an overlap size
- * of 128 but this is significantly bigger than typical tile size. */
- const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds());
- const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
-
- /* Adjacent tiles are in separate memory regions, copy into single buffer. */
- array<float> merged(rect_size.x * rect_size.y * task.pass_stride);
-
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &ntile = neighbors.tiles[i];
- if (!ntile.buffer) {
- continue;
- }
-
- const int xmin = max(ntile.x, rect.x);
- const int ymin = max(ntile.y, rect.y);
- const int xmax = min(ntile.x + ntile.w, rect.z);
- const int ymax = min(ntile.y + ntile.h, rect.w);
-
- const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
- const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
-
- const size_t merged_stride = rect_size.x;
- const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
- float *merged_buffer = merged.data() + merged_offset * pass_stride;
-
- for (int y = ymin; y < ymax; y++) {
- for (int x = 0; x < pass_stride * (xmax - xmin); x++) {
- merged_buffer[x] = tile_buffer[x] * scale;
- }
- tile_buffer += ntile.stride * pass_stride;
- merged_buffer += merged_stride * pass_stride;
- }
- }
-
- /* Denoise */
- denoise_openimagedenoise_buffer(
- task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y, 1.0f);
-
- /* Copy back result from merged buffer. */
- RenderTile &ntile = neighbors.target;
- if (ntile.buffer) {
- const int xmin = max(ntile.x, rect.x);
- const int ymin = max(ntile.y, rect.y);
- const int xmax = min(ntile.x + ntile.w, rect.z);
- const int ymax = min(ntile.y + ntile.h, rect.w);
-
- const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
- float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
-
- const size_t merged_stride = rect_size.x;
- const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
- const float *merged_buffer = merged.data() + merged_offset * pass_stride;
-
- for (int y = ymin; y < ymax; y++) {
- for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) {
- tile_buffer[x + 0] = merged_buffer[x + 0] * invscale;
- tile_buffer[x + 1] = merged_buffer[x + 1] * invscale;
- tile_buffer[x + 2] = merged_buffer[x + 2] * invscale;
- }
- tile_buffer += ntile.stride * pass_stride;
- merged_buffer += merged_stride * pass_stride;
- }
- }
-
- task.unmap_neighbor_tiles(neighbors, this);
- }
- }
-
- void denoise_nlm(DenoisingTask &denoising, RenderTile &tile)
- {
- ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
-
- tile.sample = tile.start_sample + tile.num_samples;
-
- denoising.functions.construct_transform = function_bind(
- &CPUDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
- denoising.render_buffer.samples = tile.sample;
- denoising.buffer.gpu_temporary_mem = false;
-
- denoising.run_denoising(tile);
- }
-
- void thread_render(DeviceTask &task)
- {
- if (TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- return;
- }
-
-    /* Allocate buffer for kernel globals. */
- device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals");
- kgbuffer.alloc_to_device(1);
-
- KernelGlobals *kg = new ((void *)kgbuffer.device_pointer)
- KernelGlobals(thread_kernel_globals_init());
-
- profiler.add_state(&kg->profiler);
-
- CPUSplitKernel *split_kernel = NULL;
- if (use_split_kernel) {
- split_kernel = new CPUSplitKernel(this);
- if (!split_kernel->load_kernels(requested_features)) {
- thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
- kgbuffer.free();
- delete split_kernel;
- return;
- }
- }
-
- /* NLM denoiser. */
- DenoisingTask *denoising = NULL;
-
- /* OpenImageDenoise: we can only denoise with one thread at a time, so to
- * avoid waiting with mutex locks in the denoiser, we let only a single
- * thread acquire denoising tiles. */
- uint tile_types = task.tile_types;
- bool hold_denoise_lock = false;
- if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- if (!oidn_task_lock.try_lock()) {
- tile_types &= ~RenderTile::DENOISE;
- hold_denoise_lock = true;
- }
- }
-
- RenderTile tile;
- while (task.acquire_tile(this, tile, tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- if (use_split_kernel) {
- device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(task, tile, kgbuffer, void_buffer);
- }
- else {
- render(task, tile, kg);
- }
- }
- else if (tile.task == RenderTile::BAKE) {
- render(task, tile, kg);
- }
- else if (tile.task == RenderTile::DENOISE) {
- if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- denoise_openimagedenoise(task, tile);
- }
- else if (task.denoising.type == DENOISER_NLM) {
- if (denoising == NULL) {
- denoising = new DenoisingTask(this, task);
- denoising->profiler = &kg->profiler;
- }
- denoise_nlm(*denoising, tile);
- }
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
-
- if (TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- if (hold_denoise_lock) {
- oidn_task_lock.unlock();
- }
-
- profiler.remove_state(&kg->profiler);
-
- thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
- kg->~KernelGlobals();
- kgbuffer.free();
- delete split_kernel;
- delete denoising;
- }
-
- void thread_denoise(DeviceTask &task)
- {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- denoise_openimagedenoise(task, tile);
- }
- else {
- DenoisingTask denoising(this, task);
-
- ProfilingState denoising_profiler_state;
- profiler.add_state(&denoising_profiler_state);
- denoising.profiler = &denoising_profiler_state;
-
- denoise_nlm(denoising, tile);
-
- profiler.remove_state(&denoising_profiler_state);
- }
-
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- void thread_film_convert(DeviceTask &task)
- {
- float sample_scale = 1.0f / (task.sample + 1);
-
- if (task.rgba_half) {
- for (int y = task.y; y < task.y + task.h; y++)
- for (int x = task.x; x < task.x + task.w; x++)
- convert_to_half_float_kernel()(&kernel_globals,
- (uchar4 *)task.rgba_half,
- (float *)task.buffer,
- sample_scale,
- x,
- y,
- task.offset,
- task.stride);
- }
- else {
- for (int y = task.y; y < task.y + task.h; y++)
- for (int x = task.x; x < task.x + task.w; x++)
- convert_to_byte_kernel()(&kernel_globals,
- (uchar4 *)task.rgba_byte,
- (float *)task.buffer,
- sample_scale,
- x,
- y,
- task.offset,
- task.stride);
- }
- }
-
- void thread_shader(DeviceTask &task)
- {
- KernelGlobals *kg = new KernelGlobals(thread_kernel_globals_init());
-
- for (int sample = 0; sample < task.num_samples; sample++) {
- for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
- shader_kernel()(kg,
- (uint4 *)task.shader_input,
- (float4 *)task.shader_output,
- task.shader_eval_type,
- task.shader_filter,
- x,
- task.offset,
- sample);
-
- if (task.get_cancel() || TaskPool::canceled())
- break;
-
- task.update_progress(NULL);
- }
-
- thread_kernel_globals_free(kg);
- delete kg;
- }
-
- virtual int get_split_task_count(DeviceTask &task) override
- {
- if (task.type == DeviceTask::SHADER)
- return task.get_subtask_count(info.cpu_threads, 256);
- else
- return task.get_subtask_count(info.cpu_threads);
- }
-
- virtual void task_add(DeviceTask &task) override
- {
- /* Load texture info. */
- load_texture_info();
-
-    /* Split the task into smaller ones. */
- list<DeviceTask> tasks;
-
- if (task.type == DeviceTask::DENOISE_BUFFER &&
- task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
-      /* Denoise the entire buffer at once with OIDN; it has its own threading. */
- tasks.push_back(task);
- }
- else if (task.type == DeviceTask::SHADER) {
- task.split(tasks, info.cpu_threads, 256);
- }
- else {
- task.split(tasks, info.cpu_threads);
- }
-
- foreach (DeviceTask &task, tasks) {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
- }
-
- virtual void task_wait() override
- {
- task_pool.wait_work();
- }
-
- virtual void task_cancel() override
- {
- task_pool.cancel();
- }
-
- protected:
- inline KernelGlobals thread_kernel_globals_init()
- {
- KernelGlobals kg = kernel_globals;
- kg.transparent_shadow_intersections = NULL;
- const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
- sizeof(*kg.decoupled_volume_steps);
- for (int i = 0; i < decoupled_count; ++i) {
- kg.decoupled_volume_steps[i] = NULL;
- }
- kg.decoupled_volume_steps_index = 0;
- kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL;
-#ifdef WITH_OSL
- OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
-#endif
- return kg;
- }
-
- inline void thread_kernel_globals_free(KernelGlobals *kg)
- {
- if (kg == NULL) {
- return;
- }
-
- if (kg->transparent_shadow_intersections != NULL) {
- free(kg->transparent_shadow_intersections);
- }
- const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
- sizeof(*kg->decoupled_volume_steps);
- for (int i = 0; i < decoupled_count; ++i) {
- if (kg->decoupled_volume_steps[i] != NULL) {
- free(kg->decoupled_volume_steps[i]);
- }
- }
-#ifdef WITH_OSL
- OSLShader::thread_free(kg);
-#endif
- }
-
- virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_) override
- {
- requested_features = requested_features_;
-
- return true;
- }
-};
-
-/* Split kernel. */
-
-class CPUSplitKernelFunction : public SplitKernelFunction {
- public:
- CPUDevice *device;
- void (*func)(KernelGlobals *kg, KernelData *data);
-
- CPUSplitKernelFunction(CPUDevice *device) : device(device), func(NULL)
- {
- }
- ~CPUSplitKernelFunction()
- {
- }
-
- virtual bool enqueue(const KernelDimensions &dim,
- device_memory &kernel_globals,
- device_memory &data)
- {
- if (!func) {
- return false;
- }
-
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
- kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
-
- for (int y = 0; y < dim.global_size[1]; y++) {
- for (int x = 0; x < dim.global_size[0]; x++) {
- kg->global_id = make_int2(x, y);
-
- func(kg, (KernelData *)data.device_pointer);
- }
- }
-
- return true;
- }
-};
-
-CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &data,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flags,
- device_memory &work_pool_wgs)
-{
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
- kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
-
- for (int y = 0; y < dim.global_size[1]; y++) {
- for (int x = 0; x < dim.global_size[0]; x++) {
- kg->global_id = make_int2(x, y);
-
- device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer,
- (KernelData *)data.device_pointer,
- (void *)split_data.device_pointer,
- num_global_elements,
- (char *)ray_state.device_pointer,
- rtile.start_sample,
- rtile.start_sample + rtile.num_samples,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- (int *)queue_index.device_pointer,
- dim.global_size[0] * dim.global_size[1],
- (char *)use_queues_flags.device_pointer,
- (uint *)work_pool_wgs.device_pointer,
- rtile.num_samples,
- (float *)rtile.buffer);
- }
- }
-
- return true;
-}
-
-SplitKernelFunction *CPUSplitKernel::get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &)
-{
- CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
-
- kernel->func = device->split_kernels[kernel_name]();
- if (!kernel->func) {
- delete kernel;
- return NULL;
- }
-
- return kernel;
-}
-
-int2 CPUSplitKernel::split_kernel_local_size()
-{
- return make_int2(1, 1);
-}
-
-int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/,
- device_memory & /*data*/,
- DeviceTask & /*task*/)
-{
- return make_int2(1, 1);
-}
-
-uint64_t CPUSplitKernel::state_buffer_size(device_memory &kernel_globals,
- device_memory & /*data*/,
- size_t num_threads)
-{
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
-
- return split_data_buffer_size(kg, num_threads);
-}
-
-Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new CPUDevice(info, stats, profiler, background);
-}
-
-void device_cpu_info(vector<DeviceInfo> &devices)
-{
- DeviceInfo info;
-
- info.type = DEVICE_CPU;
- info.description = system_cpu_brand_string();
- info.id = "CPU";
- info.num = 0;
- info.has_volume_decoupled = true;
- info.has_adaptive_stop_per_sample = true;
- info.has_osl = true;
- info.has_half_images = true;
- info.has_nanovdb = true;
- info.has_profiling = true;
- info.denoisers = DENOISER_NLM;
- if (openimagedenoise_supported()) {
- info.denoisers |= DENOISER_OPENIMAGEDENOISE;
- }
-
- devices.insert(devices.begin(), info);
-}
-
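-/* Build a space-separated list of the supported instruction sets,
- * e.g. "SSE2 SSE3 SSE41 AVX AVX2". */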
-string device_cpu_capabilities()
-{
- string capabilities = "";
- capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
- capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
- capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
- capabilities += system_cpu_support_avx() ? "AVX " : "";
- capabilities += system_cpu_support_avx2() ? "AVX2" : "";
-  if (!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
- capabilities.resize(capabilities.size() - 1);
- return capabilities;
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoise.cpp b/intern/cycles/device/device_denoise.cpp
new file mode 100644
index 00000000000..aea7868f65d
--- /dev/null
+++ b/intern/cycles/device/device_denoise.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_denoise.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *denoiserTypeToHumanReadable(DenoiserType type)
+{
+ switch (type) {
+ case DENOISER_OPTIX:
+ return "OptiX";
+ case DENOISER_OPENIMAGEDENOISE:
+ return "OpenImageDenoise";
+
+ case DENOISER_NUM:
+ case DENOISER_NONE:
+ case DENOISER_ALL:
+ return "UNKNOWN";
+ }
+
+ return "UNKNOWN";
+}
+
+const NodeEnum *DenoiseParams::get_type_enum()
+{
+ static NodeEnum type_enum;
+
+ if (type_enum.empty()) {
+ type_enum.insert("optix", DENOISER_OPTIX);
+ type_enum.insert("openimageio", DENOISER_OPENIMAGEDENOISE);
+ }
+
+ return &type_enum;
+}
+
+const NodeEnum *DenoiseParams::get_prefilter_enum()
+{
+ static NodeEnum prefilter_enum;
+
+ if (prefilter_enum.empty()) {
+ prefilter_enum.insert("none", DENOISER_PREFILTER_NONE);
+ prefilter_enum.insert("fast", DENOISER_PREFILTER_FAST);
+ prefilter_enum.insert("accurate", DENOISER_PREFILTER_ACCURATE);
+ }
+
+ return &prefilter_enum;
+}
+
+NODE_DEFINE(DenoiseParams)
+{
+ NodeType *type = NodeType::add("denoise_params", create);
+
+ const NodeEnum *type_enum = get_type_enum();
+ const NodeEnum *prefilter_enum = get_prefilter_enum();
+
+ SOCKET_BOOLEAN(use, "Use", false);
+
+ SOCKET_ENUM(type, "Type", *type_enum, DENOISER_OPENIMAGEDENOISE);
+
+ SOCKET_INT(start_sample, "Start Sample", 0);
+
+ SOCKET_BOOLEAN(use_pass_albedo, "Use Pass Albedo", true);
+ SOCKET_BOOLEAN(use_pass_normal, "Use Pass Normal", false);
+
+ SOCKET_ENUM(prefilter, "Prefilter", *prefilter_enum, DENOISER_PREFILTER_FAST);
+
+ return type;
+}
+
+DenoiseParams::DenoiseParams() : Node(get_node_type())
+{
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoise.h b/intern/cycles/device/device_denoise.h
new file mode 100644
index 00000000000..dfdc7cc87b3
--- /dev/null
+++ b/intern/cycles/device/device_denoise.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_memory.h"
+#include "graph/node.h"
+#include "render/buffers.h"
+
+CCL_NAMESPACE_BEGIN
+
+enum DenoiserType {
+ DENOISER_OPTIX = 2,
+ DENOISER_OPENIMAGEDENOISE = 4,
+ DENOISER_NUM,
+
+ DENOISER_NONE = 0,
+ DENOISER_ALL = ~0,
+};
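+
+/* NOTE: The explicit enum values are powers of two, so that multiple denoiser types can be
+ * combined into a DenoiserTypeMask bit mask (see below). */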
+
+/* Construct a human-readable string which denotes the denoiser type. */
+const char *denoiserTypeToHumanReadable(DenoiserType type);
+
+typedef int DenoiserTypeMask;
+
+enum DenoiserPrefilter {
+  /* Best quality without extra processing time, but requires the guiding passes to be
+   * noise-free. */
+ DENOISER_PREFILTER_NONE = 1,
+
+  /* Denoise color and guiding passes together.
+   * Improves quality when guiding passes are noisy, using the least amount of extra processing
+   * time. */
+ DENOISER_PREFILTER_FAST = 2,
+
+  /* Prefilter noisy guiding passes before denoising color.
+   * Improves quality when guiding passes are noisy, at the cost of extra processing time. */
+ DENOISER_PREFILTER_ACCURATE = 3,
+
+ DENOISER_PREFILTER_NUM,
+};
+
+/* NOTE: This is not a real scene node; the Node API is used for ease of (de)serialization.
+ * The default values here do not really matter, as they are always initialized from the
+ * Integrator node. */
+class DenoiseParams : public Node {
+ public:
+ NODE_DECLARE
+
+ /* Apply denoiser to image. */
+ bool use = false;
+
+ /* Denoiser type. */
+ DenoiserType type = DENOISER_OPENIMAGEDENOISE;
+
+ /* Viewport start sample. */
+ int start_sample = 0;
+
+ /* Auxiliary passes. */
+ bool use_pass_albedo = true;
+ bool use_pass_normal = true;
+
+ DenoiserPrefilter prefilter = DENOISER_PREFILTER_FAST;
+
+ static const NodeEnum *get_type_enum();
+ static const NodeEnum *get_prefilter_enum();
+
+ DenoiseParams();
+
+ bool modified(const DenoiseParams &other) const
+ {
+ return !(use == other.use && type == other.type && start_sample == other.start_sample &&
+ use_pass_albedo == other.use_pass_albedo &&
+ use_pass_normal == other.use_pass_normal && prefilter == other.prefilter);
+ }
+};
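+
+/* Example (sketch): `modified()` allows callers to cheaply detect parameter changes, e.g.
+ *
+ *   if (denoise_params.modified(new_params)) {
+ *     denoise_params = new_params;
+ *     // Re-create denoiser state here.
+ *   }
+ */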
+
+/* All the parameters needed to perform buffer denoising on a device.
+ * This is not really a task in the canonical sense (it is not an asynchronously running task),
+ * but rather a wrapper for all the arguments and parameters needed to perform denoising. It is a
+ * single place where they are all listed, so that it is not required to modify all device methods
+ * when these parameters change. */
+class DeviceDenoiseTask {
+ public:
+ DenoiseParams params;
+
+ int num_samples;
+
+ RenderBuffers *render_buffers;
+ BufferParams buffer_params;
+
+  /* Allow in-place modification of the input passes (e.g. scaling them down). This lowers the
+   * memory footprint of the denoiser, but makes the input passes "invalid" from the path
+   * tracer's point of view. */
+ bool allow_inplace_modification;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
deleted file mode 100644
index 38c42d15cab..00000000000
--- a/intern/cycles/device/device_denoising.cpp
+++ /dev/null
@@ -1,353 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_denoising.h"
-
-#include "kernel/filter/filter_defines.h"
-
-CCL_NAMESPACE_BEGIN
-
-DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
- : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE),
- profiler(NULL),
- storage(device),
- buffer(device),
- device(device)
-{
- radius = task.denoising.radius;
- nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength));
- if (task.denoising.relative_pca) {
- pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength));
- }
- else {
- pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength));
- }
-
- render_buffer.frame_stride = task.frame_stride;
- render_buffer.pass_stride = task.pass_stride;
- render_buffer.offset = task.pass_denoising_data;
-
- target_buffer.pass_stride = task.target_pass_stride;
- target_buffer.denoising_clean_offset = task.pass_denoising_clean;
- target_buffer.offset = 0;
-
- functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device);
- functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device);
-
- tile_info = (TileInfo *)tile_info_mem.alloc(sizeof(TileInfo) / sizeof(int));
- tile_info->from_render = task.denoising_from_render ? 1 : 0;
-
- tile_info->frames[0] = 0;
- tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES);
- for (int i = 1; i < tile_info->num_frames; i++) {
- tile_info->frames[i] = task.denoising_frames[i - 1];
- }
-
- do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM;
- do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM;
-}
-
-DenoisingTask::~DenoisingTask()
-{
- storage.XtWX.free();
- storage.XtWY.free();
- storage.transform.free();
- storage.rank.free();
- buffer.mem.free();
- buffer.temporary_mem.free();
- tile_info_mem.free();
-}
-
-void DenoisingTask::set_render_buffer(RenderTileNeighbors &neighbors)
-{
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &rtile = neighbors.tiles[i];
- tile_info->offsets[i] = rtile.offset;
- tile_info->strides[i] = rtile.stride;
- tile_info->buffers[i] = rtile.buffer;
- }
- tile_info->x[0] = neighbors.tiles[3].x;
- tile_info->x[1] = neighbors.tiles[4].x;
- tile_info->x[2] = neighbors.tiles[5].x;
- tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
- tile_info->y[0] = neighbors.tiles[1].y;
- tile_info->y[1] = neighbors.tiles[4].y;
- tile_info->y[2] = neighbors.tiles[7].y;
- tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
-
- target_buffer.offset = neighbors.target.offset;
- target_buffer.stride = neighbors.target.stride;
- target_buffer.ptr = neighbors.target.buffer;
-
- if (do_prefilter && neighbors.target.buffers) {
- target_buffer.denoising_output_offset =
- neighbors.target.buffers->params.get_denoising_prefiltered_offset();
- }
- else {
- target_buffer.denoising_output_offset = 0;
- }
-
- tile_info_mem.copy_to_device();
-}
-
-void DenoisingTask::setup_denoising_buffer()
-{
-  /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring
-   * tiles. */
- rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w);
- rect = rect_expand(rect, radius);
- rect = rect_clip(rect,
- make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
-
- buffer.use_intensity = do_prefilter || (tile_info->num_frames > 1);
- buffer.passes = buffer.use_intensity ? 15 : 14;
- buffer.width = rect.z - rect.x;
- buffer.stride = align_up(buffer.width, 4);
- buffer.h = rect.w - rect.y;
- int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float));
- buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats);
- buffer.frame_stride = buffer.pass_stride * buffer.passes;
- /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */
- int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats);
- buffer.mem.alloc_to_device(mem_size, false);
- buffer.use_time = (tile_info->num_frames > 1);
-
- /* CPUs process shifts sequentially while GPUs process them in parallel. */
- int num_layers;
- if (buffer.gpu_temporary_mem) {
- /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
- int max_radius = max(radius, 6);
- int num_shifts = (2 * max_radius + 1) * (2 * max_radius + 1);
- num_layers = 2 * num_shifts + 1;
- }
- else {
- num_layers = 3;
- }
- /* Allocate two layers per shift as well as one for the weight accumulation. */
- buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride);
-}
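/* Worked sizing example for setup_denoising_buffer() above (the numbers are
 * illustrative, not from the source): a 250x151 rect, 14 passes, one frame,
 * and a 1024-byte sub-pointer alignment give
 *
 *   stride           = align_up(250, 4)          = 252
 *   alignment_floats = divide_up(1024, 4)        = 256
 *   pass_stride      = align_up(252 * 151, 256)  = 38144 floats
 *   frame_stride     = 38144 * 14                = 534016 floats
 *   mem_size         = align_up(534016 + 4, 256) = 534272 floats (~2 MiB)
 */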
-
-void DenoisingTask::prefilter_shadowing()
-{
- device_ptr null_ptr = (device_ptr)0;
-
- device_sub_ptr unfiltered_a(buffer.mem, 0, buffer.pass_stride);
- device_sub_ptr unfiltered_b(buffer.mem, 1 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr sample_var(buffer.mem, 2 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr sample_var_var(buffer.mem, 3 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr buffer_var(buffer.mem, 5 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr filtered_var(buffer.mem, 6 * buffer.pass_stride, buffer.pass_stride);
-
- /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the
- * sample variance and the buffer variance. */
- functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
-
- /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the
- * sample variance. */
- nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false);
- functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
-
- /* Reuse memory, the previous data isn't needed anymore. */
- device_ptr filtered_a = *buffer_var, filtered_b = *sample_var;
- /* Use the smoothed variance to filter the two shadow half images using each other for weight
- * calculation. */
- nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false);
- functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
- functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
-
- device_ptr residual_var = *sample_var_var;
- /* Estimate the residual variance between the two filtered halves. */
- functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
-
- device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b;
- /* Use the residual variance for a second filter pass. */
- nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false);
- functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
- functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
-
- /* Combine the two double-filtered halves to a final shadow feature. */
- device_sub_ptr shadow_pass(buffer.mem, 4 * buffer.pass_stride, buffer.pass_stride);
- functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
-}
-
-void DenoisingTask::prefilter_features()
-{
- device_sub_ptr unfiltered(buffer.mem, 8 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr variance(buffer.mem, 9 * buffer.pass_stride, buffer.pass_stride);
-
- int mean_from[] = {0, 1, 2, 12, 6, 7, 8};
- int variance_from[] = {3, 4, 5, 13, 9, 10, 11};
- int pass_to[] = {1, 2, 3, 0, 5, 6, 7};
- for (int pass = 0; pass < 7; pass++) {
- device_sub_ptr feature_pass(
- buffer.mem, pass_to[pass] * buffer.pass_stride, buffer.pass_stride);
- /* Get the unfiltered pass and its variance from the RenderBuffers. */
- functions.get_feature(mean_from[pass],
- variance_from[pass],
- *unfiltered,
- *variance,
- 1.0f / render_buffer.samples);
- /* Smooth the pass and store the result in the denoising buffers. */
- nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false);
- functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
- }
-}
-
-void DenoisingTask::prefilter_color()
-{
- int mean_from[] = {20, 21, 22};
- int variance_from[] = {23, 24, 25};
- int mean_to[] = {8, 9, 10};
- int variance_to[] = {11, 12, 13};
- int num_color_passes = 3;
-
- device_only_memory<float> temporary_color(device, "denoising temporary color");
- temporary_color.alloc_to_device(6 * buffer.pass_stride, false);
-
- for (int pass = 0; pass < num_color_passes; pass++) {
- device_sub_ptr color_pass(temporary_color, pass * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr color_var_pass(
- temporary_color, (pass + 3) * buffer.pass_stride, buffer.pass_stride);
- functions.get_feature(mean_from[pass],
- variance_from[pass],
- *color_pass,
- *color_var_pass,
- 1.0f / render_buffer.samples);
- }
-
- device_sub_ptr depth_pass(buffer.mem, 0, buffer.pass_stride);
- device_sub_ptr color_var_pass(
- buffer.mem, variance_to[0] * buffer.pass_stride, 3 * buffer.pass_stride);
- device_sub_ptr output_pass(buffer.mem, mean_to[0] * buffer.pass_stride, 3 * buffer.pass_stride);
- functions.detect_outliers(
- temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
-
- if (buffer.use_intensity) {
- device_sub_ptr intensity_pass(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride);
- nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2 * 4.0f, true);
- functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass);
- }
-}
-
-void DenoisingTask::load_buffer()
-{
- device_ptr null_ptr = (device_ptr)0;
-
- int original_offset = render_buffer.offset;
-
- int num_passes = buffer.use_intensity ? 15 : 14;
- for (int i = 0; i < tile_info->num_frames; i++) {
- for (int pass = 0; pass < num_passes; pass++) {
- device_sub_ptr to_pass(
- buffer.mem, i * buffer.frame_stride + pass * buffer.pass_stride, buffer.pass_stride);
- bool is_variance = (pass >= 11) && (pass <= 13);
- functions.get_feature(
- pass, -1, *to_pass, null_ptr, is_variance ? (1.0f / render_buffer.samples) : 1.0f);
- }
- render_buffer.offset += render_buffer.frame_stride;
- }
-
- render_buffer.offset = original_offset;
-}
-
-void DenoisingTask::write_buffer()
-{
- reconstruction_state.buffer_params = make_int4(target_buffer.offset,
- target_buffer.stride,
- target_buffer.pass_stride,
- target_buffer.denoising_clean_offset);
- int num_passes = buffer.use_intensity ? 15 : 14;
- for (int pass = 0; pass < num_passes; pass++) {
- device_sub_ptr from_pass(buffer.mem, pass * buffer.pass_stride, buffer.pass_stride);
- int out_offset = pass + target_buffer.denoising_output_offset;
- functions.write_feature(out_offset, *from_pass, target_buffer.ptr);
- }
-}
-
-void DenoisingTask::construct_transform()
-{
- storage.w = filter_area.z;
- storage.h = filter_area.w;
-
- storage.transform.alloc_to_device(storage.w * storage.h * TRANSFORM_SIZE, false);
- storage.rank.alloc_to_device(storage.w * storage.h, false);
-
- functions.construct_transform();
-}
-
-void DenoisingTask::reconstruct()
-{
- storage.XtWX.alloc_to_device(storage.w * storage.h * XTWX_SIZE, false);
- storage.XtWY.alloc_to_device(storage.w * storage.h * XTWY_SIZE, false);
- storage.XtWX.zero_to_device();
- storage.XtWY.zero_to_device();
-
- reconstruction_state.filter_window = rect_from_shape(
- filter_area.x - rect.x, filter_area.y - rect.y, storage.w, storage.h);
- int tile_coordinate_offset = filter_area.y * target_buffer.stride + filter_area.x;
- reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset,
- target_buffer.stride,
- target_buffer.pass_stride,
- target_buffer.denoising_clean_offset);
- reconstruction_state.source_w = rect.z - rect.x;
- reconstruction_state.source_h = rect.w - rect.y;
-
- device_sub_ptr color_ptr(buffer.mem, 8 * buffer.pass_stride, 3 * buffer.pass_stride);
- device_sub_ptr color_var_ptr(buffer.mem, 11 * buffer.pass_stride, 3 * buffer.pass_stride);
- for (int f = 0; f < tile_info->num_frames; f++) {
- device_ptr scale_ptr = 0;
- device_sub_ptr *scale_sub_ptr = NULL;
- if (tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) {
- scale_sub_ptr = new device_sub_ptr(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride);
- scale_ptr = **scale_sub_ptr;
- }
-
- functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f);
- delete scale_sub_ptr;
- }
- functions.solve(target_buffer.ptr);
-}
-
-void DenoisingTask::run_denoising(RenderTile &tile)
-{
- RenderTileNeighbors neighbors(tile);
- functions.map_neighbor_tiles(neighbors);
- set_render_buffer(neighbors);
-
- setup_denoising_buffer();
-
- if (tile_info->from_render) {
- prefilter_shadowing();
- prefilter_features();
- prefilter_color();
- }
- else {
- load_buffer();
- }
-
- if (do_filter) {
- construct_transform();
- reconstruct();
- }
-
- if (do_prefilter) {
- write_buffer();
- }
-
- functions.unmap_neighbor_tiles(neighbors);
-}
-
-CCL_NAMESPACE_END
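For reference, the per-frame pass layout of the denoising buffer assembled by the deleted code above, with the indices taken from prefilter_shadowing(), prefilter_features(), prefilter_color() and reconstruct():

    pass 0        depth
    passes 1-3    normal
    pass 4        shadow (combined double-filtered halves)
    passes 5-7    albedo
    passes 8-10   color (after outlier detection)
    passes 11-13  color variance
    pass 14       intensity (only present when use_intensity is set)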
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
deleted file mode 100644
index bb8bdfdd225..00000000000
--- a/intern/cycles/device/device_denoising.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_DENOISING_H__
-#define __DEVICE_DENOISING_H__
-
-#include "device/device.h"
-
-#include "render/buffers.h"
-
-#include "kernel/filter/filter_defines.h"
-
-#include "util/util_profiling.h"
-
-CCL_NAMESPACE_BEGIN
-
-class DenoisingTask {
- public:
- /* Parameters of the denoising algorithm. */
- int radius;
- float nlm_k_2;
- float pca_threshold;
-
- /* Parameters of the RenderBuffers. */
- struct RenderBuffers {
- int offset;
- int pass_stride;
- int frame_stride;
- int samples;
- } render_buffer;
-
- /* Pointer and parameters of the target buffer. */
- struct TargetBuffer {
- int offset;
- int stride;
- int pass_stride;
- int denoising_clean_offset;
- int denoising_output_offset;
- device_ptr ptr;
- } target_buffer;
-
- TileInfo *tile_info;
- device_vector<int> tile_info_mem;
-
- ProfilingState *profiler;
-
- int4 rect;
- int4 filter_area;
-
- bool do_prefilter;
- bool do_filter;
-
- struct DeviceFunctions {
- function<bool(
- device_ptr image_ptr, /* Contains the values that are smoothed. */
- device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
- device_ptr variance_ptr, /* Contains the variance of the guide image. */
- device_ptr out_ptr /* The filtered output is written into this image. */
- )>
- non_local_means;
- function<bool(
- device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)>
- accumulate;
- function<bool(device_ptr output_ptr)> solve;
- function<bool()> construct_transform;
-
- function<bool(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect)>
- combine_halves;
- function<bool(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr)>
- divide_shadow;
- function<bool(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale)>
- get_feature;
- function<bool(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr)>
- detect_outliers;
- function<bool(int out_offset, device_ptr from_ptr, device_ptr buffer_ptr)> write_feature;
- function<void(RenderTileNeighbors &neighbors)> map_neighbor_tiles;
- function<void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles;
- } functions;
-
- /* Stores state of the current Reconstruction operation,
- * which is accessed by the device in order to perform the operation. */
- struct ReconstructionState {
- int4 filter_window;
- int4 buffer_params;
-
- int source_w;
- int source_h;
- } reconstruction_state;
-
- /* Stores state of the current NLM operation,
- * which is accessed by the device in order to perform the operation. */
- struct NLMState {
- int r; /* Search radius of the filter. */
- int f; /* Patch size of the filter. */
- float a; /* Variance compensation factor in the MSE estimation. */
- float k_2; /* Squared value of the k parameter of the filter. */
- bool is_color;
-
- void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_)
- {
- r = r_;
- f = f_;
- a = a_;
- k_2 = k_2_;
- is_color = is_color_;
- }
- } nlm_state;
-
- struct Storage {
- device_only_memory<float> transform;
- device_only_memory<int> rank;
- device_only_memory<float> XtWX;
- device_only_memory<float3> XtWY;
- int w;
- int h;
-
- Storage(Device *device)
- : transform(device, "denoising transform"),
- rank(device, "denoising rank"),
- XtWX(device, "denoising XtWX"),
- XtWY(device, "denoising XtWY")
- {
- }
- } storage;
-
- DenoisingTask(Device *device, const DeviceTask &task);
- ~DenoisingTask();
-
- void run_denoising(RenderTile &tile);
-
- struct DenoiseBuffers {
- int pass_stride;
- int passes;
- int stride;
- int h;
- int width;
- int frame_stride;
- device_only_memory<float> mem;
- device_only_memory<float> temporary_mem;
- bool use_time;
- bool use_intensity;
-
- bool gpu_temporary_mem;
-
- DenoiseBuffers(Device *device)
- : mem(device, "denoising pixel buffer"),
- temporary_mem(device, "denoising temporary mem", true)
- {
- }
- } buffer;
-
- protected:
- Device *device;
-
- void set_render_buffer(RenderTileNeighbors &neighbors);
- void setup_denoising_buffer();
- void prefilter_shadowing();
- void prefilter_features();
- void prefilter_color();
- void construct_transform();
- void reconstruct();
-
- void load_buffer();
- void write_buffer();
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_DENOISING_H__ */
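The DeviceFunctions table above is what tied this header to a concrete backend: the device filled each entry with a function_bind closure over its own kernel wrapper and then called run_denoising(). A hypothetical sketch of that wiring (the MyDevice member names are illustrative, not from the source):

    DenoisingTask denoising(device, task);
    denoising.functions.construct_transform = function_bind(
        &MyDevice::denoising_construct_transform, this, &denoising);
    denoising.functions.get_feature = function_bind(
        &MyDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
    /* ... the remaining entries are bound the same way ... */
    denoising.run_denoising(tile);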
diff --git a/intern/cycles/device/device_graphics_interop.cpp b/intern/cycles/device/device_graphics_interop.cpp
new file mode 100644
index 00000000000..a80a236759f
--- /dev/null
+++ b/intern/cycles/device/device_graphics_interop.cpp
@@ -0,0 +1,21 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_graphics_interop.h"
+
+CCL_NAMESPACE_BEGIN
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_graphics_interop.h b/intern/cycles/device/device_graphics_interop.h
new file mode 100644
index 00000000000..671b1c189d7
--- /dev/null
+++ b/intern/cycles/device/device_graphics_interop.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Information about the interoperability destination.
+ * Provided by the GPUDisplay. */
+class DeviceGraphicsInteropDestination {
+ public:
+ /* Dimensions of the buffer, in pixels. */
+ int buffer_width = 0;
+ int buffer_height = 0;
+
+ /* OpenGL pixel buffer object. */
+ int opengl_pbo_id = 0;
+
+ /* Clear the entire destination before doing a partial write to it. */
+ bool need_clear = false;
+};
+
+/* Device-side graphics interoperability support.
+ *
+ * Takes care of holding all the handles needed by the device to implement interoperability with
+ * the graphics library. */
+class DeviceGraphicsInterop {
+ public:
+ DeviceGraphicsInterop() = default;
+ virtual ~DeviceGraphicsInterop() = default;
+
+ /* Update this device-side graphics interoperability object with the given destination resource
+ * information. */
+ virtual void set_destination(const DeviceGraphicsInteropDestination &destination) = 0;
+
+ virtual device_ptr map() = 0;
+ virtual void unmap() = 0;
+};
+
+CCL_NAMESPACE_END
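A minimal usage sketch of the new interface; `interop` stands for whatever DeviceGraphicsInterop implementation the backend supplies and `pbo_id` for a pixel buffer object owned by the display code, neither of which appears in this diff:

    DeviceGraphicsInteropDestination dest;
    dest.buffer_width = width;
    dest.buffer_height = height;
    dest.opengl_pbo_id = pbo_id;
    dest.need_clear = true; /* First write after a reset. */

    interop->set_destination(dest);
    device_ptr rgba = interop->map();
    /* ... launch a film_convert kernel that writes into rgba ... */
    interop->unmap();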
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
deleted file mode 100644
index ecc79c5d7ee..00000000000
--- a/intern/cycles/device/device_intern.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_INTERN_H__
-#define __DEVICE_INTERN_H__
-
-#include "util/util_string.h"
-#include "util/util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-class Device;
-class DeviceInfo;
-class Profiler;
-class Stats;
-
-Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_opencl_init();
-Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_opencl_compile_kernel(const vector<string> &parameters);
-bool device_cuda_init();
-Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_optix_init();
-Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-
-Device *device_network_create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- const char *address);
-Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-
-void device_cpu_info(vector<DeviceInfo> &devices);
-void device_opencl_info(vector<DeviceInfo> &devices);
-void device_cuda_info(vector<DeviceInfo> &devices);
-void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices);
-void device_network_info(vector<DeviceInfo> &devices);
-
-string device_cpu_capabilities();
-string device_opencl_capabilities();
-string device_cuda_capabilities();
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_INTERN_H__ */
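These declarations existed solely for the factory dispatch in device.cpp. A simplified sketch of that dispatch, with the compile-time guards and error handling omitted:

    Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
    {
      switch (info.type) {
        case DEVICE_CPU:
          return device_cpu_create(info, stats, profiler, background);
        case DEVICE_CUDA:
          if (device_cuda_init())
            return device_cuda_create(info, stats, profiler, background);
          break;
        case DEVICE_MULTI:
          return device_multi_create(info, stats, profiler, background);
        default:
          break;
      }
      return device_dummy_create(info, stats, profiler, background);
    }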
diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
new file mode 100644
index 00000000000..ceaddee4756
--- /dev/null
+++ b/intern/cycles/device/device_kernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_kernel.h"
+
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *device_kernel_as_string(DeviceKernel kernel)
+{
+ switch (kernel) {
+ /* Integrator. */
+ case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
+ return "integrator_init_from_camera";
+ case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
+ return "integrator_init_from_bake";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ return "integrator_intersect_closest";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ return "integrator_intersect_shadow";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ return "integrator_intersect_subsurface";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ return "integrator_intersect_volume_stack";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ return "integrator_shade_background";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ return "integrator_shade_light";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ return "integrator_shade_shadow";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ return "integrator_shade_surface";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ return "integrator_shade_surface_raytrace";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+ return "integrator_shade_volume";
+ case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
+ return "integrator_megakernel";
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+ return "integrator_queued_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+ return "integrator_queued_shadow_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+ return "integrator_active_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+ return "integrator_terminated_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+ return "integrator_sorted_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+ return "integrator_compact_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
+ return "integrator_compact_states";
+ case DEVICE_KERNEL_INTEGRATOR_RESET:
+ return "integrator_reset";
+ case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
+ return "integrator_shadow_catcher_count_possible_splits";
+
+ /* Shader evaluation. */
+ case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
+ return "shader_eval_displace";
+ case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
+ return "shader_eval_background";
+
+ /* Film. */
+
+#define FILM_CONVERT_KERNEL_AS_STRING(variant, variant_lowercase) \
+ case DEVICE_KERNEL_FILM_CONVERT_##variant: \
+ return "film_convert_" #variant_lowercase; \
+ case DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA: \
+ return "film_convert_" #variant_lowercase "_half_rgba";
+
+ FILM_CONVERT_KERNEL_AS_STRING(DEPTH, depth)
+ FILM_CONVERT_KERNEL_AS_STRING(MIST, mist)
+ FILM_CONVERT_KERNEL_AS_STRING(SAMPLE_COUNT, sample_count)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT, float)
+ FILM_CONVERT_KERNEL_AS_STRING(LIGHT_PATH, light_path)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT3, float3)
+ FILM_CONVERT_KERNEL_AS_STRING(MOTION, motion)
+ FILM_CONVERT_KERNEL_AS_STRING(CRYPTOMATTE, cryptomatte)
+ FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER, shadow_catcher)
+ FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER_MATTE_WITH_SHADOW,
+ shadow_catcher_matte_with_shadow)
+ FILM_CONVERT_KERNEL_AS_STRING(COMBINED, combined)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT4, float4)
+
+#undef FILM_CONVERT_KERNEL_AS_STRING
+
+ /* Adaptive sampling. */
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK:
+ return "adaptive_sampling_convergence_check";
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X:
+ return "adaptive_sampling_filter_x";
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y:
+ return "adaptive_sampling_filter_y";
+
+ /* Denoising. */
+ case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS:
+ return "filter_guiding_preprocess";
+ case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO:
+ return "filter_guiding_set_fake_albedo";
+ case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS:
+ return "filter_color_preprocess";
+ case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS:
+ return "filter_color_postprocess";
+
+ /* Cryptomatte. */
+ case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
+ return "cryptomatte_postprocess";
+
+ /* Generic */
+ case DEVICE_KERNEL_PREFIX_SUM:
+ return "prefix_sum";
+
+ case DEVICE_KERNEL_NUM:
+ break;
+ };
+ LOG(FATAL) << "Unhandled kernel " << static_cast<int>(kernel) << ", should never happen.";
+ return "UNKNOWN";
+}
+
+std::ostream &operator<<(std::ostream &os, DeviceKernel kernel)
+{
+ os << device_kernel_as_string(kernel);
+ return os;
+}
+
+string device_kernel_mask_as_string(DeviceKernelMask mask)
+{
+ string str;
+
+ for (uint64_t i = 0; i < sizeof(DeviceKernelMask) * 8; i++) {
+ if (mask & (uint64_t(1) << i)) {
+ if (!str.empty()) {
+ str += " ";
+ }
+ str += device_kernel_as_string((DeviceKernel)i);
+ }
+ }
+
+ return str;
+}
+
+CCL_NAMESPACE_END
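For example, a mask with two kernel bits set renders as the kernel names joined by spaces, in enum order:

    DeviceKernelMask mask = (uint64_t(1) << DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA) |
                            (uint64_t(1) << DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE);
    string names = device_kernel_mask_as_string(mask);
    /* names == "integrator_init_from_camera integrator_shade_surface" */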
diff --git a/intern/cycles/device/device_kernel.h b/intern/cycles/device/device_kernel.h
new file mode 100644
index 00000000000..83d959ca87b
--- /dev/null
+++ b/intern/cycles/device/device_kernel.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_types.h"
+
+#include "util/util_string.h"
+
+#include <ostream> // NOLINT
+
+CCL_NAMESPACE_BEGIN
+
+const char *device_kernel_as_string(DeviceKernel kernel);
+std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);
+
+typedef uint64_t DeviceKernelMask;
+string device_kernel_mask_as_string(DeviceKernelMask mask);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index 80a05fc32fe..c4d45829b83 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN
device_memory::device_memory(Device *device, const char *name, MemoryType type)
: data_type(device_type_traits<uchar>::data_type),
- data_elements(device_type_traits<uchar>::num_elements),
+ data_elements(device_type_traits<uchar>::num_elements_cpu),
data_size(0),
device_size(0),
data_width(0),
@@ -149,6 +149,11 @@ void device_memory::device_zero()
}
}
+bool device_memory::device_is_cpu()
+{
+ return (device->info.type == DEVICE_CPU);
+}
+
void device_memory::swap_device(Device *new_device,
size_t new_device_size,
device_ptr new_device_ptr)
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 80f4d7b0468..c51594b8580 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -38,7 +38,6 @@ enum MemoryType {
MEM_DEVICE_ONLY,
MEM_GLOBAL,
MEM_TEXTURE,
- MEM_PIXELS
};
/* Supported Data Types */
@@ -54,7 +53,7 @@ enum DataType {
TYPE_UINT64,
};
-static inline size_t datatype_size(DataType datatype)
+static constexpr size_t datatype_size(DataType datatype)
{
switch (datatype) {
case TYPE_UNKNOWN:
@@ -82,112 +81,155 @@ static inline size_t datatype_size(DataType datatype)
template<typename T> struct device_type_traits {
static const DataType data_type = TYPE_UNKNOWN;
- static const int num_elements = sizeof(T);
+ static const int num_elements_cpu = sizeof(T);
+ static const int num_elements_gpu = sizeof(T);
};
template<> struct device_type_traits<uchar> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar2> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar3> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 3;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar4> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint2> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint3> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 3;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint4> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int2> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int3> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int4> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(int4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float2> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float3> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float4> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<half> {
static const DataType data_type = TYPE_HALF;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<ushort4> {
static const DataType data_type = TYPE_UINT16;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint16_t> {
static const DataType data_type = TYPE_UINT16;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<half4> {
static const DataType data_type = TYPE_HALF;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint64_t> {
static const DataType data_type = TYPE_UINT64;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type));
};
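The cpu/gpu split above exists for the padded vector types: on the CPU, float3 and int3 carry a fourth padding element for SIMD alignment, while GPU builds may use a packed three-element layout. device_vector<> below rejects the ambiguous types at compile time:

    static_assert(device_type_traits<float3>::num_elements_cpu == 4);
    static_assert(device_type_traits<float3>::num_elements_gpu == 3);
    /* device_vector<float3> v(...); -- fails the class static_assert */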
/* Device Memory
@@ -257,6 +299,8 @@ class device_memory {
void device_copy_from(int y, int w, int h, int elem);
void device_zero();
+ bool device_is_cpu();
+
device_ptr original_device_ptr;
size_t original_device_size;
Device *original_device;
@@ -275,7 +319,9 @@ template<typename T> class device_only_memory : public device_memory {
: device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY)
{
data_type = device_type_traits<T>::data_type;
- data_elements = max(device_type_traits<T>::num_elements, 1);
+ data_elements = max(device_is_cpu() ? device_type_traits<T>::num_elements_cpu :
+ device_type_traits<T>::num_elements_gpu,
+ 1);
}
device_only_memory(device_only_memory &&other) noexcept : device_memory(std::move(other))
@@ -331,11 +377,15 @@ template<typename T> class device_only_memory : public device_memory {
template<typename T> class device_vector : public device_memory {
public:
+ /* Can only use this for types that have the same size on CPU and GPU. */
+ static_assert(device_type_traits<T>::num_elements_cpu ==
+ device_type_traits<T>::num_elements_gpu);
+
device_vector(Device *device, const char *name, MemoryType type)
: device_memory(device, name, type)
{
data_type = device_type_traits<T>::data_type;
- data_elements = device_type_traits<T>::num_elements;
+ data_elements = device_type_traits<T>::num_elements_cpu;
modified = true;
need_realloc_ = true;
@@ -477,6 +527,11 @@ template<typename T> class device_vector : public device_memory {
return (T *)host_pointer;
}
+ const T *data() const
+ {
+ return (T *)host_pointer;
+ }
+
T &operator[](size_t i)
{
assert(i < data_size);
@@ -507,7 +562,7 @@ template<typename T> class device_vector : public device_memory {
void copy_from_device()
{
- device_copy_from(0, data_width, data_height, sizeof(T));
+ device_copy_from(0, data_width, (data_height == 0) ? 1 : data_height, sizeof(T));
}
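/* The (data_height == 0) check above matters for 1D vectors: alloc(width)
 * leaves data_height at zero, so the old call copied zero rows and returned
 * nothing. Treating zero height as a single row (a hedged reading of the
 * change) makes copy_from_device() work for flat arrays too:
 *
 *   device_vector<float> v(device, "example", MEM_READ_WRITE);
 *   v.alloc(1024);         // data_width = 1024, data_height = 0
 *   v.copy_from_device();  // copies one row of 1024 floats
 */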
void copy_from_device(int y, int w, int h)
@@ -535,33 +590,6 @@ template<typename T> class device_vector : public device_memory {
}
};
-/* Pixel Memory
- *
- * Device memory to efficiently draw as pixels to the screen in interactive
- * rendering. Only copying pixels from the device is supported, not copying to. */
-
-template<typename T> class device_pixels : public device_vector<T> {
- public:
- device_pixels(Device *device, const char *name) : device_vector<T>(device, name, MEM_PIXELS)
- {
- }
-
- void alloc_to_device(size_t width, size_t height, size_t depth = 0)
- {
- device_vector<T>::alloc(width, height, depth);
-
- if (!device_memory::device_pointer) {
- device_memory::device_alloc();
- }
- }
-
- T *copy_from_device(int y, int w, int h)
- {
- device_memory::device_copy_from(y, w, h, sizeof(T));
- return device_vector<T>::data();
- }
-};
-
/* Device Sub Memory
*
* Pointer into existing memory. It is not allocated separately, but created
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
deleted file mode 100644
index 85ffa5fcd52..00000000000
--- a/intern/cycles/device/device_multi.cpp
+++ /dev/null
@@ -1,826 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <sstream>
-#include <stdlib.h>
-
-#include "bvh/bvh_multi.h"
-
-#include "device/device.h"
-#include "device/device_intern.h"
-#include "device/device_network.h"
-
-#include "render/buffers.h"
-#include "render/geometry.h"
-
-#include "util/util_foreach.h"
-#include "util/util_list.h"
-#include "util/util_logging.h"
-#include "util/util_map.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-class MultiDevice : public Device {
- public:
- struct SubDevice {
- Stats stats;
- Device *device;
- map<device_ptr, device_ptr> ptr_map;
- int peer_island_index = -1;
- };
-
- list<SubDevice> devices, denoising_devices;
- device_ptr unique_key;
- vector<vector<SubDevice *>> peer_islands;
- bool use_denoising;
- bool matching_rendering_and_denoising_devices;
-
- MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
- : Device(info, stats, profiler, background_),
- unique_key(1),
- use_denoising(!info.denoising_devices.empty())
- {
- foreach (DeviceInfo &subinfo, info.multi_devices) {
- /* Always add CPU devices at the back since GPU devices can change
- * host memory pointers, which CPU uses as device pointer. */
- SubDevice *sub;
- if (subinfo.type == DEVICE_CPU) {
- devices.emplace_back();
- sub = &devices.back();
- }
- else {
- devices.emplace_front();
- sub = &devices.front();
- }
-
- /* The pointer to 'sub->stats' will stay valid even after new devices
- * are added, since 'devices' is a linked list. */
- sub->device = Device::create(subinfo, sub->stats, profiler, background);
- }
-
- foreach (DeviceInfo &subinfo, info.denoising_devices) {
- denoising_devices.emplace_front();
- SubDevice *sub = &denoising_devices.front();
-
- sub->device = Device::create(subinfo, sub->stats, profiler, background);
- }
-
- /* Build a list of peer islands for the available render devices */
- foreach (SubDevice &sub, devices) {
- /* First ensure that every device is in at least one peer island */
- if (sub.peer_island_index < 0) {
- peer_islands.emplace_back();
- sub.peer_island_index = (int)peer_islands.size() - 1;
- peer_islands[sub.peer_island_index].push_back(&sub);
- }
-
- if (!info.has_peer_memory) {
- continue;
- }
-
- /* Second, check peer access between devices and fill up the islands accordingly */
- foreach (SubDevice &peer_sub, devices) {
- if (peer_sub.peer_island_index < 0 &&
- peer_sub.device->info.type == sub.device->info.type &&
- peer_sub.device->check_peer_access(sub.device)) {
- peer_sub.peer_island_index = sub.peer_island_index;
- peer_islands[sub.peer_island_index].push_back(&peer_sub);
- }
- }
- }
-
- /* Try to re-use memory when denoising and render devices use the same physical devices
- * (e.g. OptiX denoising and CUDA rendering device pointing to the same GPU).
- * Ordering has to match as well, so that 'DeviceTask::split' behaves consistently. */
- matching_rendering_and_denoising_devices = denoising_devices.empty() ||
- (devices.size() == denoising_devices.size());
- if (matching_rendering_and_denoising_devices) {
- for (list<SubDevice>::iterator device_it = devices.begin(),
- denoising_device_it = denoising_devices.begin();
- device_it != devices.end() && denoising_device_it != denoising_devices.end();
- ++device_it, ++denoising_device_it) {
- const DeviceInfo &info = device_it->device->info;
- const DeviceInfo &denoising_info = denoising_device_it->device->info;
- if ((info.type != DEVICE_CUDA && info.type != DEVICE_OPTIX) ||
- (denoising_info.type != DEVICE_CUDA && denoising_info.type != DEVICE_OPTIX) ||
- info.num != denoising_info.num) {
- matching_rendering_and_denoising_devices = false;
- break;
- }
- }
- }
-
-#ifdef WITH_NETWORK
- /* try to add network devices */
- ServerDiscovery discovery(true);
- time_sleep(1.0);
-
- vector<string> servers = discovery.get_server_list();
-
- foreach (string &server, servers) {
- Device *device = device_network_create(info, stats, profiler, server.c_str());
- if (device) {
- devices.emplace_back();
- devices.back().device = device;
- }
- }
-#endif
- }
-
- ~MultiDevice()
- {
- foreach (SubDevice &sub, devices)
- delete sub.device;
- foreach (SubDevice &sub, denoising_devices)
- delete sub.device;
- }
-
- const string &error_message() override
- {
- error_msg.clear();
-
- foreach (SubDevice &sub, devices)
- error_msg += sub.device->error_message();
- foreach (SubDevice &sub, denoising_devices)
- error_msg += sub.device->error_message();
-
- return error_msg;
- }
-
- virtual bool show_samples() const override
- {
- if (devices.size() > 1) {
- return false;
- }
- return devices.front().device->show_samples();
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override
- {
- BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
- BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
- foreach (const SubDevice &sub_device, devices) {
- BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
- bvh_layout_mask &= device_bvh_layout_mask;
- bvh_layout_mask_all |= device_bvh_layout_mask;
- }
-
- /* With multiple OptiX devices, every device needs its own acceleration structure */
- if (bvh_layout_mask == BVH_LAYOUT_OPTIX) {
- return BVH_LAYOUT_MULTI_OPTIX;
- }
-
- /* When devices do not share a common BVH layout, fall back to creating one for each */
- const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
- if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
- return BVH_LAYOUT_MULTI_OPTIX_EMBREE;
- }
-
- return bvh_layout_mask;
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features) override
- {
- foreach (SubDevice &sub, devices)
- if (!sub.device->load_kernels(requested_features))
- return false;
-
- use_denoising = requested_features.use_denoising;
- if (requested_features.use_denoising) {
- /* Only need denoising feature, everything else is unused. */
- DeviceRequestedFeatures denoising_features;
- denoising_features.use_denoising = true;
- foreach (SubDevice &sub, denoising_devices)
- if (!sub.device->load_kernels(denoising_features))
- return false;
- }
-
- return true;
- }
-
- bool wait_for_availability(const DeviceRequestedFeatures &requested_features) override
- {
- foreach (SubDevice &sub, devices)
- if (!sub.device->wait_for_availability(requested_features))
- return false;
-
- if (requested_features.use_denoising) {
- foreach (SubDevice &sub, denoising_devices)
- if (!sub.device->wait_for_availability(requested_features))
- return false;
- }
-
- return true;
- }
-
- DeviceKernelStatus get_active_kernel_switch_state() override
- {
- DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL;
-
- foreach (SubDevice &sub, devices) {
- DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state();
- switch (subresult) {
- case DEVICE_KERNEL_FEATURE_KERNEL_INVALID:
- case DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE:
- return subresult;
-
- case DEVICE_KERNEL_USING_FEATURE_KERNEL:
- case DEVICE_KERNEL_UNKNOWN:
- break;
- }
- }
-
- return result;
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
- /* Try to build and share a single acceleration structure, if possible */
- if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) {
- devices.back().device->build_bvh(bvh, progress, refit);
- return;
- }
-
- assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX ||
- bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE);
-
- BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh);
- bvh_multi->sub_bvhs.resize(devices.size());
-
- vector<BVHMulti *> geom_bvhs;
- geom_bvhs.reserve(bvh->geometry.size());
- foreach (Geometry *geom, bvh->geometry) {
- geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh));
- }
-
- /* Broadcast acceleration structure build to all render devices */
- size_t i = 0;
- foreach (SubDevice &sub, devices) {
- /* Change geometry BVH pointers to the sub BVH */
- for (size_t k = 0; k < bvh->geometry.size(); ++k) {
- bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i];
- }
-
- if (!bvh_multi->sub_bvhs[i]) {
- BVHParams params = bvh->params;
- if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
- params.bvh_layout = BVH_LAYOUT_OPTIX;
- else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
- params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
- BVH_LAYOUT_EMBREE;
-
- /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
- * (since it is put into the top level directly, see bvh_embree.cpp) */
- if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
- !bvh->geometry[0]->is_instanced()) {
- i++;
- continue;
- }
-
- bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device);
- }
-
- sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit);
- i++;
- }
-
- /* Change geometry BVH pointers back to the multi BVH. */
- for (size_t k = 0; k < bvh->geometry.size(); ++k) {
- bvh->geometry[k]->bvh = geom_bvhs[k];
- }
- }
-
- virtual void *osl_memory() override
- {
- if (devices.size() > 1) {
- return NULL;
- }
- return devices.front().device->osl_memory();
- }
-
- bool is_resident(device_ptr key, Device *sub_device) override
- {
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device) {
- return find_matching_mem_device(key, sub)->device == sub_device;
- }
- }
- return false;
- }
-
- SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
- {
- assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end()));
-
- /* Get the memory owner of this key (first try current device, then peer devices) */
- SubDevice *owner_sub = &sub;
- if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) {
- foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) {
- if (island_sub != owner_sub &&
- island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) {
- owner_sub = island_sub;
- }
- }
- }
- return owner_sub;
- }
-
- SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island)
- {
- assert(!island.empty());
-
- /* Get the memory owner of this key or the device with the lowest memory usage when new */
- SubDevice *owner_sub = island.front();
- foreach (SubDevice *island_sub, island) {
- if (key ? (island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) :
- (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) {
- owner_sub = island_sub;
- }
- }
- return owner_sub;
- }
-
- inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub)
- {
- return find_matching_mem_device(key, sub)->ptr_map[key];
- }
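/* An illustrative walk-through of the key scheme served by the three lookups
 * above (not code from the source): mem_alloc() hands each device_memory an
 * opaque key from unique_key++, and every sub-device's ptr_map translates
 * that key into a real pointer. Within a peer island one allocation can
 * serve all members, so a typical lookup is:
 *
 *   device_ptr key = mem.device_pointer;            // opaque multi-device key
 *   device_ptr real = find_matching_mem(key, sub);  // owner's pointer, usable by sub
 */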
-
- void mem_alloc(device_memory &mem) override
- {
- device_ptr key = unique_key++;
-
- if (mem.type == MEM_PIXELS) {
- /* Always allocate pixel memory on all devices.
- * This is necessary to ensure PBOs are registered everywhere, which FILM_CONVERT uses. */
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- sub.device->mem_alloc(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- else {
- assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE ||
- mem.type == MEM_DEVICE_ONLY);
- /* The remaining memory types can be distributed across devices */
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- owner_sub->device->mem_alloc(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size);
- }
-
- void mem_copy_to(device_memory &mem) override
- {
- device_ptr existing_key = mem.device_pointer;
- device_ptr key = (existing_key) ? existing_key : unique_key++;
- size_t existing_size = mem.device_size;
-
- /* The tile buffers are allocated on each device (see below), so copy to all of them */
- if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) {
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_copy_to(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- owner_sub->device->mem_copy_to(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
-
- if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
- /* Need to create texture objects and update pointer in kernel globals on all devices */
- foreach (SubDevice *island_sub, island) {
- if (island_sub != owner_sub) {
- island_sub->device->mem_copy_to(mem);
- }
- }
- }
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size - existing_size);
- }
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
- {
- device_ptr key = mem.device_pointer;
- int i = 0, sub_h = h / devices.size();
-
- foreach (SubDevice &sub, devices) {
- int sy = y + i * sub_h;
- int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
-
- SubDevice *owner_sub = find_matching_mem_device(key, sub);
- mem.device = owner_sub->device;
- mem.device_pointer = owner_sub->ptr_map[key];
-
- owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
- i++;
- }
-
- mem.device = this;
- mem.device_pointer = key;
- }
-
- void mem_zero(device_memory &mem) override
- {
- device_ptr existing_key = mem.device_pointer;
- device_ptr key = (existing_key) ? existing_key : unique_key++;
- size_t existing_size = mem.device_size;
-
- /* This is a hack to allocate the tile buffers on the denoising devices only.
- * The tile buffers also need to be allocated separately on each device, so that any
- * overlap rendered for denoising does not interfere with the other devices. */
- if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) {
- vector<device_ptr> device_pointers;
- device_pointers.reserve(devices.size());
-
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_zero(mem);
- sub.ptr_map[key] = mem.device_pointer;
-
- device_pointers.push_back(mem.device_pointer);
- }
- foreach (SubDevice &sub, denoising_devices) {
- if (matching_rendering_and_denoising_devices) {
- sub.ptr_map[key] = device_pointers.front();
- device_pointers.erase(device_pointers.begin());
- }
- else {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_zero(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- owner_sub->device->mem_zero(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size - existing_size);
- }
-
- void mem_free(device_memory &mem) override
- {
- device_ptr key = mem.device_pointer;
- size_t existing_size = mem.device_size;
-
- /* Free memory that was allocated for all devices (see above) on each device */
- if (mem.type == MEM_PIXELS || (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising)) {
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
- mem.device_size = existing_size;
-
- sub.device->mem_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(key));
- }
- foreach (SubDevice &sub, denoising_devices) {
- if (matching_rendering_and_denoising_devices) {
- sub.ptr_map.erase(key);
- }
- else {
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
- mem.device_size = existing_size;
-
- sub.device->mem_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(key));
- }
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_matching_mem_device(key, *island.front());
- mem.device = owner_sub->device;
- mem.device_pointer = owner_sub->ptr_map[key];
- mem.device_size = existing_size;
-
- owner_sub->device->mem_free(mem);
- owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key));
-
- if (mem.type == MEM_TEXTURE) {
- /* Free texture objects on all devices */
- foreach (SubDevice *island_sub, island) {
- if (island_sub != owner_sub) {
- island_sub->device->mem_free(mem);
- }
- }
- }
- }
- }
-
- mem.device = this;
- mem.device_pointer = 0;
- mem.device_size = 0;
- stats.mem_free(existing_size);
- }
-
- void const_copy_to(const char *name, void *host, size_t size) override
- {
- foreach (SubDevice &sub, devices)
- sub.device->const_copy_to(name, host, size);
- }
-
- void draw_pixels(device_memory &rgba,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params) override
- {
- assert(rgba.type == MEM_PIXELS);
-
- device_ptr key = rgba.device_pointer;
- int i = 0, sub_h = h / devices.size();
- int sub_height = height / devices.size();
-
- foreach (SubDevice &sub, devices) {
- int sy = y + i * sub_h;
- int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
- int sheight = (i == (int)devices.size() - 1) ? height - sub_height * i : sub_height;
- int sdy = dy + i * sub_height;
- /* adjust math for w/width */
-
- rgba.device_pointer = sub.ptr_map[key];
- sub.device->draw_pixels(
- rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params);
- i++;
- }
-
- rgba.device_pointer = key;
- }
-
- void map_tile(Device *sub_device, RenderTile &tile) override
- {
- if (!tile.buffer) {
- return;
- }
-
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device) {
- tile.buffer = find_matching_mem(tile.buffer, sub);
- return;
- }
- }
-
- foreach (SubDevice &sub, denoising_devices) {
- if (sub.device == sub_device) {
- tile.buffer = sub.ptr_map[tile.buffer];
- return;
- }
- }
- }
-
- int device_number(Device *sub_device) override
- {
- int i = 0;
-
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device)
- return i;
- i++;
- }
-
- foreach (SubDevice &sub, denoising_devices) {
- if (sub.device == sub_device)
- return i;
- i++;
- }
-
- return -1;
- }
-
- void map_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override
- {
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &tile = neighbors.tiles[i];
-
- if (!tile.buffers) {
- continue;
- }
-
- device_vector<float> &mem = tile.buffers->buffer;
- tile.buffer = mem.device_pointer;
-
- if (mem.device == this && matching_rendering_and_denoising_devices) {
- /* Skip unnecessary copies in viewport mode (buffer covers the
- * whole image), but still need to fix up the tile device pointer. */
- map_tile(sub_device, tile);
- continue;
- }
-
- /* If the tile was rendered on another device, copy its memory to
- * the current device now, for the duration of the denoising task.
- * Note that this temporarily modifies the RenderBuffers and calls
- * the device, so this function is not thread safe. */
- if (mem.device != sub_device) {
- /* Only copy from device to host once. This is faster, but
- * also required for the case where a CPU thread is denoising
- * a tile rendered on the GPU. In that case we have to avoid
- * overwriting the buffer being denoised by the CPU thread. */
- if (!tile.buffers->map_neighbor_copied) {
- tile.buffers->map_neighbor_copied = true;
- mem.copy_from_device();
- }
-
- if (mem.device == this) {
- /* Can re-use memory if tile is already allocated on the sub device. */
- map_tile(sub_device, tile);
- mem.swap_device(sub_device, mem.device_size, tile.buffer);
- }
- else {
- mem.swap_device(sub_device, 0, 0);
- }
-
- mem.copy_to_device();
-
- tile.buffer = mem.device_pointer;
- tile.device_size = mem.device_size;
-
- mem.restore_device();
- }
- }
- }
-
- void unmap_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override
- {
- RenderTile &target_tile = neighbors.target;
- device_vector<float> &mem = target_tile.buffers->buffer;
-
- if (mem.device == this && matching_rendering_and_denoising_devices) {
- return;
- }
-
- /* Copy denoised result back to the host. */
- mem.swap_device(sub_device, target_tile.device_size, target_tile.buffer);
- mem.copy_from_device();
- mem.restore_device();
-
- /* Copy denoised result to the original device. */
- mem.copy_to_device();
-
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &tile = neighbors.tiles[i];
- if (!tile.buffers) {
- continue;
- }
-
- device_vector<float> &mem = tile.buffers->buffer;
-
- if (mem.device != sub_device && mem.device != this) {
- /* Free up memory again if it was allocated for the copy above. */
- mem.swap_device(sub_device, tile.device_size, tile.buffer);
- sub_device->mem_free(mem);
- mem.restore_device();
- }
- }
- }
-
- int get_split_task_count(DeviceTask &task) override
- {
- int total_tasks = 0;
- list<DeviceTask> tasks;
- task.split(tasks, devices.size());
- foreach (SubDevice &sub, devices) {
- if (!tasks.empty()) {
- DeviceTask subtask = tasks.front();
- tasks.pop_front();
-
- total_tasks += sub.device->get_split_task_count(subtask);
- }
- }
- return total_tasks;
- }
-
- void task_add(DeviceTask &task) override
- {
- list<SubDevice> task_devices = devices;
- if (!denoising_devices.empty()) {
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- /* Denoising tasks should be redirected to the denoising devices entirely. */
- task_devices = denoising_devices;
- }
- else if (task.type == DeviceTask::RENDER && (task.tile_types & RenderTile::DENOISE)) {
- const uint tile_types = task.tile_types;
- /* For normal rendering tasks, only redirect the denoising part to the denoising devices.
- * There is no need to split the task here, since all devices run through 'acquire_tile'. */
- task.tile_types = RenderTile::DENOISE;
- foreach (SubDevice &sub, denoising_devices) {
- sub.device->task_add(task);
- }
- /* Rendering itself should still be executed on the rendering devices. */
- task.tile_types = tile_types ^ RenderTile::DENOISE;
- }
- }
-
- list<DeviceTask> tasks;
- task.split(tasks, task_devices.size());
-
- foreach (SubDevice &sub, task_devices) {
- if (!tasks.empty()) {
- DeviceTask subtask = tasks.front();
- tasks.pop_front();
-
- if (task.buffer)
- subtask.buffer = find_matching_mem(task.buffer, sub);
- if (task.rgba_byte)
- subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
- if (task.rgba_half)
- subtask.rgba_half = sub.ptr_map[task.rgba_half];
- if (task.shader_input)
- subtask.shader_input = find_matching_mem(task.shader_input, sub);
- if (task.shader_output)
- subtask.shader_output = find_matching_mem(task.shader_output, sub);
-
- sub.device->task_add(subtask);
-
- if (task.buffers && task.buffers->buffer.device == this) {
- /* Synchronize access to RenderBuffers, since 'map_neighbor_tiles' is not thread-safe. */
- sub.device->task_wait();
- }
- }
- }
- }
-
- void task_wait() override
- {
- foreach (SubDevice &sub, devices)
- sub.device->task_wait();
- foreach (SubDevice &sub, denoising_devices)
- sub.device->task_wait();
- }
-
- void task_cancel() override
- {
- foreach (SubDevice &sub, devices)
- sub.device->task_cancel();
- foreach (SubDevice &sub, denoising_devices)
- sub.device->task_cancel();
- }
-};
-
-Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new MultiDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
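
For context on the file deleted above: MultiDevice presented several physical devices as a single logical one by handing out its own opaque device_ptr keys and keeping a per-sub-device ptr_map from each key to that sub-device's real pointer. A simplified sketch of the translation scheme, with hypothetical types standing in for the Cycles ones:

#include <cstdint>
#include <map>
#include <vector>

using device_ptr = std::uint64_t;

struct SubDevice {
  /* Maps the multi-device's opaque key to this sub-device's real pointer. */
  std::map<device_ptr, device_ptr> ptr_map;
};

struct MultiDeviceSketch {
  std::vector<SubDevice> devices;
  device_ptr unique_key = 1;

  /* Allocation: hand out one key and record every sub-device's real pointer.
   * Assumes real_pointers has one entry per sub-device. */
  device_ptr alloc(const std::vector<device_ptr> &real_pointers)
  {
    const device_ptr key = unique_key++;
    for (size_t i = 0; i < devices.size(); i++)
      devices[i].ptr_map[key] = real_pointers[i];
    return key;
  }

  /* Dispatch: translate the shared key back into a sub-device pointer,
   * as the ptr_map[...] lookups in task_add() and draw_pixels() do above. */
  device_ptr translate(size_t device_index, device_ptr key) const
  {
    return devices[device_index].ptr_map.at(key);
  }
};

int main()
{
  MultiDeviceSketch multi;
  multi.devices.resize(2);
  const device_ptr key = multi.alloc({0xA000, 0xB000});
  return multi.translate(1, key) == 0xB000 ? 0 : 1;
}
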
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
deleted file mode 100644
index 8904b517e92..00000000000
--- a/intern/cycles/device/device_network.cpp
+++ /dev/null
@@ -1,812 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_network.h"
-#include "device/device.h"
-#include "device/device_intern.h"
-
-#include "util/util_foreach.h"
-#include "util/util_logging.h"
-
-#if defined(WITH_NETWORK)
-
-CCL_NAMESPACE_BEGIN
-
-typedef map<device_ptr, device_ptr> PtrMap;
-typedef vector<uint8_t> DataVector;
-typedef map<device_ptr, DataVector> DataMap;
-
-/* tile list */
-typedef vector<RenderTile> TileList;
-
-/* search a list of tiles and find the one that matches the passed render tile */
-static TileList::iterator tile_list_find(TileList &tile_list, RenderTile &tile)
-{
- for (TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it)
- if (tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample)
- return it;
- return tile_list.end();
-}
-
-class NetworkDevice : public Device {
- public:
- boost::asio::io_service io_service;
- tcp::socket socket;
- device_ptr mem_counter;
- DeviceTask the_task; /* todo: handle multiple tasks */
-
- thread_mutex rpc_lock;
-
- virtual bool show_samples() const
- {
- return false;
- }
-
- NetworkDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, const char *address)
- : Device(info, stats, profiler, true), socket(io_service)
- {
- error_func = NetworkError();
- stringstream portstr;
- portstr << SERVER_PORT;
-
- tcp::resolver resolver(io_service);
- tcp::resolver::query query(address, portstr.str());
- tcp::resolver::iterator endpoint_iterator = resolver.resolve(query);
- tcp::resolver::iterator end;
-
- boost::system::error_code error = boost::asio::error::host_not_found;
- while (error && endpoint_iterator != end) {
- socket.close();
- socket.connect(*endpoint_iterator++, error);
- }
-
- if (error)
- error_func.network_error(error.message());
-
- mem_counter = 0;
- }
-
- ~NetworkDevice()
- {
- RPCSend snd(socket, &error_func, "stop");
- snd.write();
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const
- {
- return BVH_LAYOUT_BVH2;
- }
-
- void mem_alloc(device_memory &mem)
- {
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- thread_scoped_lock lock(rpc_lock);
-
- mem.device_pointer = ++mem_counter;
-
- RPCSend snd(socket, &error_func, "mem_alloc");
- snd.add(mem);
- snd.write();
- }
-
- void mem_copy_to(device_memory &mem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_copy_to");
-
- snd.add(mem);
- snd.write();
- snd.write_buffer(mem.host_pointer, mem.memory_size());
- }
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- size_t data_size = mem.memory_size();
-
- RPCSend snd(socket, &error_func, "mem_copy_from");
-
- snd.add(mem);
- snd.add(y);
- snd.add(w);
- snd.add(h);
- snd.add(elem);
- snd.write();
-
- RPCReceive rcv(socket, &error_func);
- rcv.read_buffer(mem.host_pointer, data_size);
- }
-
- void mem_zero(device_memory &mem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_zero");
-
- snd.add(mem);
- snd.write();
- }
-
- void mem_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_free");
-
- snd.add(mem);
- snd.write();
-
- mem.device_pointer = 0;
- }
- }
-
- void const_copy_to(const char *name, void *host, size_t size)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "const_copy_to");
-
- string name_string(name);
-
- snd.add(name_string);
- snd.add(size);
- snd.write();
- snd.write_buffer(host, size);
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features)
- {
- if (error_func.have_error())
- return false;
-
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "load_kernels");
- snd.add(requested_features.experimental);
- snd.add(requested_features.max_closure);
- snd.add(requested_features.max_nodes_group);
- snd.add(requested_features.nodes_features);
- snd.write();
-
- bool result;
- RPCReceive rcv(socket, &error_func);
- rcv.read(result);
-
- return result;
- }
-
- void task_add(DeviceTask &task)
- {
- thread_scoped_lock lock(rpc_lock);
-
- the_task = task;
-
- RPCSend snd(socket, &error_func, "task_add");
- snd.add(task);
- snd.write();
- }
-
- void task_wait()
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "task_wait");
- snd.write();
-
- lock.unlock();
-
- TileList the_tiles;
-
- /* todo: run this threaded for connecting to multiple clients */
- for (;;) {
- if (error_func.have_error())
- break;
-
- RenderTile tile;
-
- lock.lock();
- RPCReceive rcv(socket, &error_func);
-
- if (rcv.name == "acquire_tile") {
- lock.unlock();
-
- /* todo: watch out for recursive calls! */
- if (the_task.acquire_tile(this, tile)) { /* write return as bool */
- the_tiles.push_back(tile);
-
- lock.lock();
- RPCSend snd(socket, &error_func, "acquire_tile");
- snd.add(tile);
- snd.write();
- lock.unlock();
- }
- else {
- lock.lock();
- RPCSend snd(socket, &error_func, "acquire_tile_none");
- snd.write();
- lock.unlock();
- }
- }
- else if (rcv.name == "release_tile") {
- rcv.read(tile);
- lock.unlock();
-
- TileList::iterator it = tile_list_find(the_tiles, tile);
- if (it != the_tiles.end()) {
- tile.buffers = it->buffers;
- the_tiles.erase(it);
- }
-
- assert(tile.buffers != NULL);
-
- the_task.release_tile(tile);
-
- lock.lock();
- RPCSend snd(socket, &error_func, "release_tile");
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_wait_done") {
- lock.unlock();
- break;
- }
- else
- lock.unlock();
- }
- }
-
- void task_cancel()
- {
- thread_scoped_lock lock(rpc_lock);
- RPCSend snd(socket, &error_func, "task_cancel");
- snd.write();
- }
-
- int get_split_task_count(DeviceTask &)
- {
- return 1;
- }
-
- private:
- NetworkError error_func;
-};
-
-Device *device_network_create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- const char *address)
-{
- return new NetworkDevice(info, stats, profiler, address);
-}
-
-void device_network_info(vector<DeviceInfo> &devices)
-{
- DeviceInfo info;
-
- info.type = DEVICE_NETWORK;
- info.description = "Network Device";
- info.id = "NETWORK";
- info.num = 0;
-
- /* todo: get this info from device */
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.has_osl = false;
- info.denoisers = DENOISER_NONE;
-
- devices.push_back(info);
-}
-
-class DeviceServer {
- public:
- thread_mutex rpc_lock;
-
- void network_error(const string &message)
- {
- error_func.network_error(message);
- }
-
- bool have_error()
- {
- return error_func.have_error();
- }
-
- DeviceServer(Device *device_, tcp::socket &socket_)
- : device(device_), socket(socket_), stop(false), blocked_waiting(false)
- {
- error_func = NetworkError();
- }
-
- void listen()
- {
- /* receive remote function calls */
- for (;;) {
- listen_step();
-
- if (stop)
- break;
- }
- }
-
- protected:
- void listen_step()
- {
- thread_scoped_lock lock(rpc_lock);
- RPCReceive rcv(socket, &error_func);
-
- if (rcv.name == "stop")
- stop = true;
- else
- process(rcv, lock);
- }
-
- /* create a memory buffer for a device buffer and insert it into mem_data */
- DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size)
- {
- /* create a new DataVector and insert it into mem_data */
- pair<DataMap::iterator, bool> data_ins = mem_data.insert(
- DataMap::value_type(client_pointer, DataVector()));
-
- /* make sure it was a unique insertion */
- assert(data_ins.second);
-
- /* get a reference to the inserted vector */
- DataVector &data_v = data_ins.first->second;
-
- /* size the vector */
- data_v.resize(data_size);
-
- return data_v;
- }
-
- DataVector &data_vector_find(device_ptr client_pointer)
- {
- DataMap::iterator i = mem_data.find(client_pointer);
- assert(i != mem_data.end());
- return i->second;
- }
-
- /* setup mapping and reverse mapping of client_pointer<->real_pointer */
- void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer)
- {
- pair<PtrMap::iterator, bool> mapins;
-
- /* insert mapping from client pointer to our real device pointer */
- mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer));
- assert(mapins.second);
-
- /* insert reverse mapping from real our device pointer to client pointer */
- mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer));
- assert(mapins.second);
- }
-
- device_ptr device_ptr_from_client_pointer(device_ptr client_pointer)
- {
- PtrMap::iterator i = ptr_map.find(client_pointer);
- assert(i != ptr_map.end());
- return i->second;
- }
-
- device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer)
- {
- PtrMap::iterator i = ptr_map.find(client_pointer);
- assert(i != ptr_map.end());
-
- device_ptr result = i->second;
-
- /* erase the mapping */
- ptr_map.erase(i);
-
- /* erase the reverse mapping */
- PtrMap::iterator irev = ptr_imap.find(result);
- assert(irev != ptr_imap.end());
- ptr_imap.erase(irev);
-
- /* erase the data vector */
- DataMap::iterator idata = mem_data.find(client_pointer);
- assert(idata != mem_data.end());
- mem_data.erase(idata);
-
- return result;
- }
-
- /* Note that the lock must already be held upon entry.
- * This is necessary because the caller often peeks at
- * the header and delegates control here when it doesn't
- * specifically handle the current RPC.
- * The lock must be unlocked before returning. */
- void process(RPCReceive &rcv, thread_scoped_lock &lock)
- {
- if (rcv.name == "mem_alloc") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- /* Allocate host side data buffer. */
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
-
- /* Perform the allocation on the actual device. */
- device->mem_alloc(mem);
-
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- else if (rcv.name == "mem_copy_to") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- if (client_pointer) {
- /* Lookup existing host side data buffer. */
- DataVector &data_v = data_vector_find(client_pointer);
- mem.host_pointer = (void *)&data_v[0];
-
- /* Translate the client pointer to a real device pointer. */
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
- }
- else {
- /* Allocate host side data buffer. */
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
- }
-
- /* Copy data from network into memory buffer. */
- rcv.read_buffer((uint8_t *)mem.host_pointer, data_size);
-
- /* Copy the data from the memory buffer to the device buffer. */
- device->mem_copy_to(mem);
-
- if (!client_pointer) {
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- }
- else if (rcv.name == "mem_copy_from") {
- string name;
- network_device_memory mem(device);
- int y, w, h, elem;
-
- rcv.read(mem, name);
- rcv.read(y);
- rcv.read(w);
- rcv.read(h);
- rcv.read(elem);
-
- device_ptr client_pointer = mem.device_pointer;
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
-
- DataVector &data_v = data_vector_find(client_pointer);
-
- mem.host_pointer = (void *)&(data_v[0]);
-
- device->mem_copy_from(mem, y, w, h, elem);
-
- size_t data_size = mem.memory_size();
-
- RPCSend snd(socket, &error_func, "mem_copy_from");
- snd.write();
- snd.write_buffer((uint8_t *)mem.host_pointer, data_size);
- lock.unlock();
- }
- else if (rcv.name == "mem_zero") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- if (client_pointer) {
- /* Lookup existing host side data buffer. */
- DataVector &data_v = data_vector_find(client_pointer);
- mem.host_pointer = (void *)&data_v[0];
-
- /* Translate the client pointer to a real device pointer. */
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
- }
- else {
- /* Allocate host side data buffer. */
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
- }
-
- /* Zero memory. */
- device->mem_zero(mem);
-
- if (!client_pointer) {
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- }
- else if (rcv.name == "mem_free") {
- string name;
- network_device_memory mem(device);
-
- rcv.read(mem, name);
- lock.unlock();
-
- device_ptr client_pointer = mem.device_pointer;
-
- mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer);
-
- device->mem_free(mem);
- }
- else if (rcv.name == "const_copy_to") {
- string name_string;
- size_t size;
-
- rcv.read(name_string);
- rcv.read(size);
-
- vector<char> host_vector(size);
- rcv.read_buffer(&host_vector[0], size);
- lock.unlock();
-
- device->const_copy_to(name_string.c_str(), &host_vector[0], size);
- }
- else if (rcv.name == "load_kernels") {
- DeviceRequestedFeatures requested_features;
- rcv.read(requested_features.experimental);
- rcv.read(requested_features.max_closure);
- rcv.read(requested_features.max_nodes_group);
- rcv.read(requested_features.nodes_features);
-
- bool result;
- result = device->load_kernels(requested_features);
- RPCSend snd(socket, &error_func, "load_kernels");
- snd.add(result);
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_add") {
- DeviceTask task;
-
- rcv.read(task);
- lock.unlock();
-
- if (task.buffer)
- task.buffer = device_ptr_from_client_pointer(task.buffer);
-
- if (task.rgba_half)
- task.rgba_half = device_ptr_from_client_pointer(task.rgba_half);
-
- if (task.rgba_byte)
- task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte);
-
- if (task.shader_input)
- task.shader_input = device_ptr_from_client_pointer(task.shader_input);
-
- if (task.shader_output)
- task.shader_output = device_ptr_from_client_pointer(task.shader_output);
-
- task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
- task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
- task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample,
- this);
- task.update_tile_sample = function_bind(&DeviceServer::task_update_tile_sample, this, _1);
- task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this);
-
- device->task_add(task);
- }
- else if (rcv.name == "task_wait") {
- lock.unlock();
-
- blocked_waiting = true;
- device->task_wait();
- blocked_waiting = false;
-
- lock.lock();
- RPCSend snd(socket, &error_func, "task_wait_done");
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_cancel") {
- lock.unlock();
- device->task_cancel();
- }
- else if (rcv.name == "acquire_tile") {
- AcquireEntry entry;
- entry.name = rcv.name;
- rcv.read(entry.tile);
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else if (rcv.name == "acquire_tile_none") {
- AcquireEntry entry;
- entry.name = rcv.name;
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else if (rcv.name == "release_tile") {
- AcquireEntry entry;
- entry.name = rcv.name;
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else {
- cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n";
- lock.unlock();
- }
- }
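
The comment above process() spells out its lock discipline: the caller enters with rpc_lock held, and every branch must release it before doing blocking device work or returning. A small standalone illustration of that contract, using std::mutex in place of Cycles' thread_mutex and hypothetical handler names:

#include <functional>
#include <mutex>
#include <string>

/* Sketch of the "enter locked, leave unlocked" dispatch used by
 * DeviceServer::process() above: the header is read under the lock,
 * then the lock is dropped before any long-running device work so
 * other RPCs are not held up. */
static void process_sketch(const std::string &rpc_name,
                           std::unique_lock<std::mutex> &lock,
                           const std::function<void()> &device_work)
{
  if (rpc_name == "mem_alloc") {
    lock.unlock(); /* release before touching the device */
    device_work();
  }
  else {
    lock.unlock(); /* unknown RPC: still must leave unlocked */
  }
}

int main()
{
  std::mutex rpc_mutex;
  std::unique_lock<std::mutex> lock(rpc_mutex); /* caller acquires, as listen_step() does */
  process_sketch("mem_alloc", lock, [] { /* device->mem_alloc(...) would go here */ });
  return lock.owns_lock() ? 1 : 0; /* must end unlocked */
}
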
-
- bool task_acquire_tile(Device *, RenderTile &tile)
- {
- thread_scoped_lock acquire_lock(acquire_mutex);
-
- bool result = false;
-
- RPCSend snd(socket, &error_func, "acquire_tile");
- snd.write();
-
- do {
- if (blocked_waiting)
- listen_step();
-
- /* todo: avoid busy wait loop */
- thread_scoped_lock lock(rpc_lock);
-
- if (!acquire_queue.empty()) {
- AcquireEntry entry = acquire_queue.front();
- acquire_queue.pop_front();
-
- if (entry.name == "acquire_tile") {
- tile = entry.tile;
-
- if (tile.buffer)
- tile.buffer = ptr_map[tile.buffer];
-
- result = true;
- break;
- }
- else if (entry.name == "acquire_tile_none") {
- break;
- }
- else {
- cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n";
- }
- }
- } while (acquire_queue.empty() && !stop && !have_error());
-
- return result;
- }
-
- void task_update_progress_sample()
- {
- ; /* skip */
- }
-
- void task_update_tile_sample(RenderTile &)
- {
- ; /* skip */
- }
-
- void task_release_tile(RenderTile &tile)
- {
- thread_scoped_lock acquire_lock(acquire_mutex);
-
- if (tile.buffer)
- tile.buffer = ptr_imap[tile.buffer];
-
- {
- thread_scoped_lock lock(rpc_lock);
- RPCSend snd(socket, &error_func, "release_tile");
- snd.add(tile);
- snd.write();
- lock.unlock();
- }
-
- do {
- if (blocked_waiting)
- listen_step();
-
- /* todo: avoid busy wait loop */
- thread_scoped_lock lock(rpc_lock);
-
- if (!acquire_queue.empty()) {
- AcquireEntry entry = acquire_queue.front();
- acquire_queue.pop_front();
-
- if (entry.name == "release_tile") {
- lock.unlock();
- break;
- }
- else {
- cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n";
- }
- }
- } while (acquire_queue.empty() && !stop);
- }
-
- bool task_get_cancel()
- {
- return false;
- }
-
- /* properties */
- Device *device;
- tcp::socket &socket;
-
- /* mapping of remote to local pointer */
- PtrMap ptr_map;
- PtrMap ptr_imap;
- DataMap mem_data;
-
- struct AcquireEntry {
- string name;
- RenderTile tile;
- };
-
- thread_mutex acquire_mutex;
- list<AcquireEntry> acquire_queue;
-
- bool stop;
- bool blocked_waiting;
-
- private:
- NetworkError error_func;
-
- /* todo: free memory and device (osl) on network error */
-};
-
-void Device::server_run()
-{
- try {
- /* starts thread that responds to discovery requests */
- ServerDiscovery discovery;
-
- for (;;) {
- /* accept connection */
- boost::asio::io_service io_service;
- tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT));
-
- tcp::socket socket(io_service);
- acceptor.accept(socket);
-
- string remote_address = socket.remote_endpoint().address().to_string();
- printf("Connected to remote client at: %s\n", remote_address.c_str());
-
- DeviceServer server(this, socket);
- server.listen();
-
- printf("Disconnected.\n");
- }
- }
- catch (exception &e) {
- fprintf(stderr, "Network server exception: %s\n", e.what());
- }
-}
-
-CCL_NAMESPACE_END
-
-#endif
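
Every client-side method in the deleted NetworkDevice above follows the same blocking request/response shape: take rpc_lock, serialize a named request, and, for calls that return data, read the reply off the same socket. A schematic, self-contained version with an in-memory string standing in for the TCP socket and hypothetical helper names:

#include <cstdio>
#include <mutex>
#include <string>

/* Trivial stand-ins for RPCSend/RPCReceive so the sketch runs on its own;
 * the real code serializes boost archives onto a TCP socket instead. */
static std::string wire;

static void socket_send(const std::string &name, const std::string &payload)
{
  wire = name + ":" + payload;
}

static std::string socket_receive()
{
  return wire;
}

static std::mutex rpc_lock;

/* Schematic blocking RPC: one named request, one reply, serialized by a lock
 * so concurrent callers cannot interleave their frames on the shared socket. */
static std::string rpc_call(const std::string &name, const std::string &request)
{
  std::lock_guard<std::mutex> lock(rpc_lock);
  socket_send(name, request);
  return socket_receive();
}

int main()
{
  printf("%s\n", rpc_call("load_kernels", "requested_features").c_str());
  return 0;
}
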
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
deleted file mode 100644
index b3a0f6daa57..00000000000
--- a/intern/cycles/device/device_network.h
+++ /dev/null
@@ -1,490 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_NETWORK_H__
-#define __DEVICE_NETWORK_H__
-
-#ifdef WITH_NETWORK
-
-# include <boost/archive/binary_iarchive.hpp>
-# include <boost/archive/binary_oarchive.hpp>
-# include <boost/archive/text_iarchive.hpp>
-# include <boost/archive/text_oarchive.hpp>
-# include <boost/array.hpp>
-# include <boost/asio.hpp>
-# include <boost/bind.hpp>
-# include <boost/serialization/vector.hpp>
-# include <boost/thread.hpp>
-
-# include <deque>
-# include <iostream>
-# include <sstream>
-
-# include "render/buffers.h"
-
-# include "util/util_foreach.h"
-# include "util/util_list.h"
-# include "util/util_map.h"
-# include "util/util_param.h"
-# include "util/util_string.h"
-
-CCL_NAMESPACE_BEGIN
-
-using std::cerr;
-using std::cout;
-using std::exception;
-using std::hex;
-using std::setw;
-
-using boost::asio::ip::tcp;
-
-static const int SERVER_PORT = 5120;
-static const int DISCOVER_PORT = 5121;
-static const string DISCOVER_REQUEST_MSG = "REQUEST_RENDER_SERVER_IP";
-static const string DISCOVER_REPLY_MSG = "REPLY_RENDER_SERVER_IP";
-
-# if 0
-typedef boost::archive::text_oarchive o_archive;
-typedef boost::archive::text_iarchive i_archive;
-# else
-typedef boost::archive::binary_oarchive o_archive;
-typedef boost::archive::binary_iarchive i_archive;
-# endif
-
-/* Serialization of device memory */
-
-class network_device_memory : public device_memory {
- public:
- network_device_memory(Device *device) : device_memory(device, "", MEM_READ_ONLY)
- {
- }
-
- ~network_device_memory()
- {
- device_pointer = 0;
- };
-
- vector<char> local_data;
-};
-
-/* Common network error function / object for both DeviceNetwork and DeviceServer. */
-class NetworkError {
- public:
- NetworkError()
- {
- error = "";
- error_count = 0;
- }
-
- ~NetworkError()
- {
- }
-
- void network_error(const string &message)
- {
- error = message;
- error_count += 1;
- }
-
- bool have_error()
- {
- return error_count > 0;
- }
-
- private:
- string error;
- int error_count;
-};
-
-/* Remote procedure call Send */
-
-class RPCSend {
- public:
- RPCSend(tcp::socket &socket_, NetworkError *e, const string &name_ = "")
- : name(name_), socket(socket_), archive(archive_stream), sent(false)
- {
- archive &name_;
- error_func = e;
- fprintf(stderr, "rpc send %s\n", name.c_str());
- }
-
- ~RPCSend()
- {
- }
-
- void add(const device_memory &mem)
- {
- archive &mem.data_type &mem.data_elements &mem.data_size;
- archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer;
- archive &mem.type &string(mem.name);
- archive &mem.interpolation &mem.extension;
- archive &mem.device_pointer;
- }
-
- template<typename T> void add(const T &data)
- {
- archive &data;
- }
-
- void add(const DeviceTask &task)
- {
- int type = (int)task.type;
- archive &type &task.x &task.y &task.w &task.h;
- archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples;
- archive &task.offset &task.stride;
- archive &task.shader_input &task.shader_output &task.shader_eval_type;
- archive &task.shader_x &task.shader_w;
- archive &task.need_finish_queue;
- }
-
- void add(const RenderTile &tile)
- {
- archive &tile.x &tile.y &tile.w &tile.h;
- archive &tile.start_sample &tile.num_samples &tile.sample;
- archive &tile.resolution &tile.offset &tile.stride;
- archive &tile.buffer;
- }
-
- void write()
- {
- boost::system::error_code error;
-
- /* get string from stream */
- string archive_str = archive_stream.str();
-
- /* first send fixed size header with size of following data */
- ostringstream header_stream;
- header_stream << setw(8) << hex << archive_str.size();
- string header_str = header_stream.str();
-
- boost::asio::write(
- socket, boost::asio::buffer(header_str), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- /* then send actual data */
- boost::asio::write(
- socket, boost::asio::buffer(archive_str), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- sent = true;
- }
-
- void write_buffer(void *buffer, size_t size)
- {
- boost::system::error_code error;
-
- boost::asio::write(
- socket, boost::asio::buffer(buffer, size), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
- }
-
- protected:
- string name;
- tcp::socket &socket;
- ostringstream archive_stream;
- o_archive archive;
- bool sent;
- NetworkError *error_func;
-};
-
-/* Remote procedure call Receive */
-
-class RPCReceive {
- public:
- RPCReceive(tcp::socket &socket_, NetworkError *e)
- : socket(socket_), archive_stream(NULL), archive(NULL)
- {
- error_func = e;
- /* read head with fixed size */
- vector<char> header(8);
- boost::system::error_code error;
- size_t len = boost::asio::read(socket, boost::asio::buffer(header), error);
-
- if (error.value()) {
- error_func->network_error(error.message());
- }
-
- /* verify if we got something */
- if (len == header.size()) {
- /* decode header */
- string header_str(&header[0], header.size());
- istringstream header_stream(header_str);
-
- size_t data_size;
-
- if ((header_stream >> hex >> data_size)) {
-
- vector<char> data(data_size);
- size_t len = boost::asio::read(socket, boost::asio::buffer(data), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- if (len == data_size) {
- archive_str = (data.size()) ? string(&data[0], data.size()) : string("");
-
- archive_stream = new istringstream(archive_str);
- archive = new i_archive(*archive_stream);
-
- *archive &name;
- fprintf(stderr, "rpc receive %s\n", name.c_str());
- }
- else {
- error_func->network_error("Network receive error: data size doesn't match header");
- }
- }
- else {
- error_func->network_error("Network receive error: can't decode data size from header");
- }
- }
- else {
- error_func->network_error("Network receive error: invalid header size");
- }
- }
-
- ~RPCReceive()
- {
- delete archive;
- delete archive_stream;
- }
-
- void read(network_device_memory &mem, string &name)
- {
- *archive &mem.data_type &mem.data_elements &mem.data_size;
- *archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer;
- *archive &mem.type &name;
- *archive &mem.interpolation &mem.extension;
- *archive &mem.device_pointer;
-
- mem.name = name.c_str();
- mem.host_pointer = 0;
-
- /* Can't transfer OpenGL texture over network. */
- if (mem.type == MEM_PIXELS) {
- mem.type = MEM_READ_WRITE;
- }
- }
-
- template<typename T> void read(T &data)
- {
- *archive &data;
- }
-
- void read_buffer(void *buffer, size_t size)
- {
- boost::system::error_code error;
- size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error);
-
- if (error.value()) {
- error_func->network_error(error.message());
- }
-
- if (len != size)
- cout << "Network receive error: buffer size doesn't match expected size\n";
- }
-
- void read(DeviceTask &task)
- {
- int type;
-
- *archive &type &task.x &task.y &task.w &task.h;
- *archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples;
- *archive &task.offset &task.stride;
- *archive &task.shader_input &task.shader_output &task.shader_eval_type;
- *archive &task.shader_x &task.shader_w;
- *archive &task.need_finish_queue;
-
- task.type = (DeviceTask::Type)type;
- }
-
- void read(RenderTile &tile)
- {
- *archive &tile.x &tile.y &tile.w &tile.h;
- *archive &tile.start_sample &tile.num_samples &tile.sample;
- *archive &tile.resolution &tile.offset &tile.stride;
- *archive &tile.buffer;
-
- tile.buffers = NULL;
- }
-
- string name;
-
- protected:
- tcp::socket &socket;
- string archive_str;
- istringstream *archive_stream;
- i_archive *archive;
- NetworkError *error_func;
-};
-
-/* Server auto discovery */
-
-class ServerDiscovery {
- public:
- explicit ServerDiscovery(bool discover = false)
- : listen_socket(io_service), collect_servers(false)
- {
- /* setup listen socket */
- listen_endpoint.address(boost::asio::ip::address_v4::any());
- listen_endpoint.port(DISCOVER_PORT);
-
- listen_socket.open(listen_endpoint.protocol());
-
- boost::asio::socket_base::reuse_address option(true);
- listen_socket.set_option(option);
-
- listen_socket.bind(listen_endpoint);
-
- /* setup receive callback */
- async_receive();
-
- /* start server discovery */
- if (discover) {
- collect_servers = true;
- servers.clear();
-
- broadcast_message(DISCOVER_REQUEST_MSG);
- }
-
- /* start thread */
- work = new boost::asio::io_service::work(io_service);
- thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service));
- }
-
- ~ServerDiscovery()
- {
- io_service.stop();
- thread->join();
- delete thread;
- delete work;
- }
-
- vector<string> get_server_list()
- {
- vector<string> result;
-
- mutex.lock();
- result = vector<string>(servers.begin(), servers.end());
- mutex.unlock();
-
- return result;
- }
-
- private:
- void handle_receive_from(const boost::system::error_code &error, size_t size)
- {
- if (error) {
- cout << "Server discovery receive error: " << error.message() << "\n";
- return;
- }
-
- if (size > 0) {
- string msg = string(receive_buffer, size);
-
- /* handle incoming message */
- if (collect_servers) {
- if (msg == DISCOVER_REPLY_MSG) {
- string address = receive_endpoint.address().to_string();
-
- mutex.lock();
-
- /* add address if it's not already in the list */
- bool found = std::find(servers.begin(), servers.end(), address) != servers.end();
-
- if (!found)
- servers.push_back(address);
-
- mutex.unlock();
- }
- }
- else {
- /* reply to request */
- if (msg == DISCOVER_REQUEST_MSG)
- broadcast_message(DISCOVER_REPLY_MSG);
- }
- }
-
- async_receive();
- }
-
- void async_receive()
- {
- listen_socket.async_receive_from(boost::asio::buffer(receive_buffer),
- receive_endpoint,
- boost::bind(&ServerDiscovery::handle_receive_from,
- this,
- boost::asio::placeholders::error,
- boost::asio::placeholders::bytes_transferred));
- }
-
- void broadcast_message(const string &msg)
- {
- /* setup broadcast socket */
- boost::asio::ip::udp::socket socket(io_service);
-
- socket.open(boost::asio::ip::udp::v4());
-
- boost::asio::socket_base::broadcast option(true);
- socket.set_option(option);
-
- boost::asio::ip::udp::endpoint broadcast_endpoint(
- boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT);
-
- /* broadcast message */
- socket.send_to(boost::asio::buffer(msg), broadcast_endpoint);
- }
-
- /* network service and socket */
- boost::asio::io_service io_service;
- boost::asio::ip::udp::endpoint listen_endpoint;
- boost::asio::ip::udp::socket listen_socket;
-
- /* threading */
- boost::thread *thread;
- boost::asio::io_service::work *work;
- boost::mutex mutex;
-
- /* buffer and endpoint for receiving messages */
- char receive_buffer[256];
- boost::asio::ip::udp::endpoint receive_endpoint;
-
- // Fields: OS, version, device count, status, host name, group name, IP address.
- struct ServerInfo {
- string cycles_version;
- string os;
- int device_count;
- string status;
- string host_name;
- string group_name;
- string host_addr;
- };
-
- /* collected server addresses */
- bool collect_servers;
- vector<string> servers;
-};
-
-CCL_NAMESPACE_END
-
-#endif
-
-#endif /* __DEVICE_NETWORK_H__ */
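
The wire format implemented by RPCSend::write() and the RPCReceive constructor above is a length-prefixed frame: an 8-character hexadecimal size header followed by the serialized archive bytes. A standalone sketch of just that framing, with std::string standing in for the socket:

#include <cassert>
#include <iomanip>
#include <sstream>
#include <string>

/* Encode: 8 hex characters giving the payload size, then the payload. */
static std::string frame_encode(const std::string &payload)
{
  std::ostringstream header;
  header << std::setw(8) << std::hex << payload.size();
  return header.str() + payload;
}

/* Decode: parse the fixed-size hex header, then slice out the payload. */
static bool frame_decode(const std::string &frame, std::string &payload)
{
  if (frame.size() < 8)
    return false;
  std::istringstream header(frame.substr(0, 8));
  size_t size = 0;
  if (!(header >> std::hex >> size) || frame.size() < 8 + size)
    return false;
  payload = frame.substr(8, size);
  return true;
}

int main()
{
  std::string out;
  assert(frame_decode(frame_encode("mem_alloc"), out) && out == "mem_alloc");
  return 0;
}

The fixed-width header is why the receiver can always start with a constant 8-byte read and validate the frame size before allocating the payload buffer.
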
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
deleted file mode 100644
index 9abb7cfb7fe..00000000000
--- a/intern/cycles/device/device_opencl.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/opencl/device_opencl.h"
-# include "device/device.h"
-# include "device/device_intern.h"
-
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_set.h"
-# include "util/util_string.h"
-
-CCL_NAMESPACE_BEGIN
-
-Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return opencl_create_split_device(info, stats, profiler, background);
-}
-
-bool device_opencl_init()
-{
- static bool initialized = false;
- static bool result = false;
-
- if (initialized)
- return result;
-
- initialized = true;
-
- if (OpenCLInfo::device_type() != 0) {
- int clew_result = clewInit();
- if (clew_result == CLEW_SUCCESS) {
- VLOG(1) << "CLEW initialization succeeded.";
- result = true;
- }
- else {
- VLOG(1) << "CLEW initialization failed: "
- << ((clew_result == CLEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
- "Error opening the library");
- }
- }
- else {
- VLOG(1) << "Skip initializing CLEW, platform is force disabled.";
- result = false;
- }
-
- return result;
-}
-
-static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms)
-{
-# ifdef _WIN32
- __try {
- return clGetPlatformIDs(0, NULL, num_platforms);
- }
- __except (EXCEPTION_EXECUTE_HANDLER) {
- /* Ignore crashes inside the OpenCL driver and hope we can
- * survive even with corrupted OpenCL installs. */
- fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n");
- }
-
- *num_platforms = 0;
- return CL_DEVICE_NOT_FOUND;
-# else
- return clGetPlatformIDs(0, NULL, num_platforms);
-# endif
-}
-
-void device_opencl_info(vector<DeviceInfo> &devices)
-{
- cl_uint num_platforms = 0;
- device_opencl_get_num_platforms_safe(&num_platforms);
- if (num_platforms == 0) {
- return;
- }
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- /* Devices are numbered consecutively across platforms. */
- int num_devices = 0;
- set<string> unique_ids;
- foreach (OpenCLPlatformDevice &platform_device, usable_devices) {
- /* Compute unique ID for persistent user preferences. */
- const string &platform_name = platform_device.platform_name;
- const string &device_name = platform_device.device_name;
- string hardware_id = platform_device.hardware_id;
- if (hardware_id == "") {
- hardware_id = string_printf("ID_%d", num_devices);
- }
- string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;
-
- /* Hardware ID might not be unique, add device number in that case. */
- if (unique_ids.find(id) != unique_ids.end()) {
- id += string_printf("_ID_%d", num_devices);
- }
- unique_ids.insert(id);
-
- /* Create DeviceInfo. */
- DeviceInfo info;
- info.type = DEVICE_OPENCL;
- info.description = string_remove_trademark(string(device_name));
- info.num = num_devices;
- /* We don't know if it's used for display, but assume it is. */
- info.display_device = true;
- info.use_split_kernel = true;
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.denoisers = DENOISER_NLM;
- info.id = id;
-
- /* Check OpenCL extensions */
- info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos;
-
- /* Disabled for now due to apparent AMD driver bug. */
- info.has_nanovdb = platform_name != "AMD Accelerated Parallel Processing";
-
- devices.push_back(info);
- num_devices++;
- }
-}
-
-string device_opencl_capabilities()
-{
- if (OpenCLInfo::device_type() == 0) {
- return "All OpenCL devices are forced to be OFF";
- }
- string result = "";
- string error_msg = ""; /* Only used by opencl_assert(), but in the future
- * it could also be nicely reported to the console.
- */
- cl_uint num_platforms = 0;
- opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms));
- if (num_platforms == 0) {
- return "No OpenCL platforms found\n";
- }
- result += string_printf("Number of platforms: %u\n", num_platforms);
-
- vector<cl_platform_id> platform_ids;
- platform_ids.resize(num_platforms);
- opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL));
-
-# define APPEND_INFO(func, id, name, what, type) \
- do { \
- type data; \
- memset(&data, 0, sizeof(data)); \
- opencl_assert(func(id, what, sizeof(data), &data, NULL)); \
- result += string_printf("%s: %s\n", name, to_string(data).c_str()); \
- } while (false)
-# define APPEND_STRING_INFO_IMPL(func, id, name, what, is_optional) \
- do { \
- string value; \
- size_t length = 0; \
- if (func(id, what, 0, NULL, &length) == CL_SUCCESS) { \
- vector<char> buffer(length + 1); \
- if (func(id, what, buffer.size(), buffer.data(), NULL) == CL_SUCCESS) { \
- value = string(buffer.data()); \
- } \
- } \
- if (is_optional && !(length != 0 && value[0] != '\0')) { \
- break; \
- } \
- result += string_printf("%s: %s\n", name, value.c_str()); \
- } while (false)
-# define APPEND_PLATFORM_STRING_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, false)
-# define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, true)
-# define APPEND_PLATFORM_INFO(id, name, what, type) \
- APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type)
-# define APPEND_DEVICE_INFO(id, name, what, type) \
- APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type)
-# define APPEND_DEVICE_STRING_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, false)
-# define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, true)
-
- vector<cl_device_id> device_ids;
- for (cl_uint platform = 0; platform < num_platforms; ++platform) {
- cl_platform_id platform_id = platform_ids[platform];
-
- result += string_printf("Platform #%u\n", platform);
-
- APPEND_PLATFORM_STRING_INFO(platform_id, "Name", CL_PLATFORM_NAME);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Version", CL_PLATFORM_VERSION);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS);
-
- cl_uint num_devices = 0;
- opencl_assert(
- clGetDeviceIDs(platform_ids[platform], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices));
- result += string_printf("\tNumber of devices: %u\n", num_devices);
-
- device_ids.resize(num_devices);
- opencl_assert(clGetDeviceIDs(
- platform_ids[platform], CL_DEVICE_TYPE_ALL, num_devices, &device_ids[0], NULL));
- for (cl_uint device = 0; device < num_devices; ++device) {
- cl_device_id device_id = device_ids[device];
-
- result += string_printf("\t\tDevice: #%u\n", device);
-
- APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME);
- APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD);
- APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR);
- APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION);
- APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE);
- APPEND_DEVICE_STRING_INFO(device_id, "Version", CL_DEVICE_VERSION);
- APPEND_DEVICE_STRING_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS);
- APPEND_DEVICE_INFO(
- device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint);
- APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint);
- APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t);
- }
- }
-
-# undef APPEND_INFO
-# undef APPEND_STRING_INFO_IMPL
-# undef APPEND_PLATFORM_STRING_INFO
-# undef APPEND_STRING_EXTENSION_INFO
-# undef APPEND_PLATFORM_INFO
-# undef APPEND_DEVICE_INFO
-# undef APPEND_DEVICE_STRING_INFO
-# undef APPEND_DEVICE_STRING_EXTENSION_INFO
-
- return result;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* WITH_OPENCL */
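
The ID scheme in device_opencl_info() above builds a stable identifier for user preferences from the platform name, device name, and hardware ID, falling back to the enumeration index when the hardware ID is empty and appending the index again on collisions. A condensed, self-contained sketch of that logic (hypothetical function name):

#include <cstdio>
#include <set>
#include <string>

/* Build a stable, unique device ID as device_opencl_info() does above:
 * fall back to the device index when no hardware ID is available, and
 * disambiguate with the index when two IDs collide. */
static std::string make_device_id(std::set<std::string> &unique_ids,
                                  const std::string &platform,
                                  const std::string &device,
                                  std::string hardware_id,
                                  int num)
{
  if (hardware_id.empty())
    hardware_id = "ID_" + std::to_string(num);
  std::string id = "OPENCL_" + platform + "_" + device + "_" + hardware_id;
  if (unique_ids.count(id))
    id += "_ID_" + std::to_string(num);
  unique_ids.insert(id);
  return id;
}

int main()
{
  std::set<std::string> ids;
  /* Two identical boards reporting the same hardware ID get distinct IDs. */
  printf("%s\n", make_device_id(ids, "AMD", "gfx900", "HW1", 0).c_str());
  printf("%s\n", make_device_id(ids, "AMD", "gfx900", "HW1", 1).c_str());
  return 0;
}
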
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
deleted file mode 100644
index 6f9a7943722..00000000000
--- a/intern/cycles/device/device_optix.cpp
+++ /dev/null
@@ -1,1936 +0,0 @@
-/*
- * Copyright 2019, NVIDIA Corporation.
- * Copyright 2019, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPTIX
-
-# include "bvh/bvh.h"
-# include "bvh/bvh_optix.h"
-# include "device/cuda/device_cuda.h"
-# include "device/device_denoising.h"
-# include "device/device_intern.h"
-# include "render/buffers.h"
-# include "render/hair.h"
-# include "render/mesh.h"
-# include "render/object.h"
-# include "render/scene.h"
-# include "util/util_debug.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_progress.h"
-# include "util/util_time.h"
-
-# ifdef WITH_CUDA_DYNLOAD
-# include <cuew.h>
-// Do not use CUDA SDK headers when using CUEW
-# define OPTIX_DONT_INCLUDE_CUDA
-# endif
-# include <optix_function_table_definition.h>
-# include <optix_stubs.h>
-
-// TODO(pmours): Disable this once drivers have native support
-# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1
-
-CCL_NAMESPACE_BEGIN
-
-/* Make sure this stays in sync with kernel_globals.h */
-struct ShaderParams {
- uint4 *input;
- float4 *output;
- int type;
- int filter;
- int sx;
- int offset;
- int sample;
-};
-struct KernelParams {
- WorkTile tile;
- KernelData data;
- ShaderParams shader;
-# define KERNEL_TEX(type, name) const type *name;
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
-};
-
-# define check_result_cuda(stmt) \
- { \
- CUresult res = stmt; \
- if (res != CUDA_SUCCESS) { \
- const char *name; \
- cuGetErrorName(res, &name); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return; \
- } \
- } \
- (void)0
-# define check_result_cuda_ret(stmt) \
- { \
- CUresult res = stmt; \
- if (res != CUDA_SUCCESS) { \
- const char *name; \
- cuGetErrorName(res, &name); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return false; \
- } \
- } \
- (void)0
-
-# define check_result_optix(stmt) \
- { \
- enum OptixResult res = stmt; \
- if (res != OPTIX_SUCCESS) { \
- const char *name = optixGetErrorName(res); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return; \
- } \
- } \
- (void)0
-# define check_result_optix_ret(stmt) \
- { \
- enum OptixResult res = stmt; \
- if (res != OPTIX_SUCCESS) { \
- const char *name = optixGetErrorName(res); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return false; \
- } \
- } \
- (void)0
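
The check_result_* macros above wrap each CUDA/OptiX call in an early return on failure; the trailing (void)0 forces the call site to end with a semicolon so the macro invocation reads like an ordinary statement. A generic, self-contained sketch of the same pattern (hypothetical macro and names, plain int status codes instead of CUresult/OptixResult):

#include <cstdio>

/* Early-return error-check macro in the style of check_result_cuda_ret()
 * above: report where the call failed, then bail out of the caller. */
#define CHECK_RESULT_RET(stmt, retval) \
  { \
    const int res = (stmt); \
    if (res != 0) { \
      fprintf(stderr, "%s failed with %d (%s:%d)\n", #stmt, res, __FILE__, __LINE__); \
      return retval; \
    } \
  } \
  (void)0

static int always_fails()
{
  return 42; /* stand-in for a failing driver call */
}

static bool run()
{
  CHECK_RESULT_RET(always_fails(), false); /* prints the error, returns false */
  return true;
}

int main()
{
  return run() ? 0 : 1;
}
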
-
-# define launch_filter_kernel(func_name, w, h, args) \
- { \
- CUfunction func; \
- check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \
- check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \
- int threads; \
- check_result_cuda_ret( \
- cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- threads = (int)sqrt((float)threads); \
- int xblocks = ((w) + threads - 1) / threads; \
- int yblocks = ((h) + threads - 1) / threads; \
- check_result_cuda_ret( \
- cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \
- } \
- (void)0
-
-class OptiXDevice : public CUDADevice {
-
- // List of OptiX program groups
- enum {
- PG_RGEN,
- PG_MISS,
- PG_HITD, // Default hit group
- PG_HITS, // __SHADOW_RECORD_ALL__ hit group
- PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles)
-# if OPTIX_ABI_VERSION >= 36
- PG_HITD_MOTION,
- PG_HITS_MOTION,
-# endif
- PG_BAKE, // kernel_bake_evaluate
- PG_DISP, // kernel_displace_evaluate
- PG_BACK, // kernel_background_evaluate
- PG_CALL,
- NUM_PROGRAM_GROUPS = PG_CALL + 3
- };
-
- // List of OptiX pipelines
- enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES };
-
- // A single shader binding table entry
- struct SbtRecord {
- char header[OPTIX_SBT_RECORD_HEADER_SIZE];
- };
-
- // Information stored about CUDA memory allocations
- struct CUDAMem {
- bool free_map_host = false;
- CUarray array = NULL;
- CUtexObject texobject = 0;
- bool use_mapped_host = false;
- };
-
- // Helper class to manage current CUDA context
- struct CUDAContextScope {
- CUDAContextScope(CUcontext ctx)
- {
- cuCtxPushCurrent(ctx);
- }
- ~CUDAContextScope()
- {
- cuCtxPopCurrent(NULL);
- }
- };
-
- // Use a pool with multiple threads to support launches with multiple CUDA streams
- TaskPool task_pool;
-
- vector<CUstream> cuda_stream;
- OptixDeviceContext context = NULL;
-
- OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module
- OptixModule builtin_modules[2] = {};
- OptixPipeline pipelines[NUM_PIPELINES] = {};
-
- bool motion_blur = false;
- device_vector<SbtRecord> sbt_data;
- device_only_memory<KernelParams> launch_params;
- OptixTraversableHandle tlas_handle = 0;
-
- OptixDenoiser denoiser = NULL;
- device_only_memory<unsigned char> denoiser_state;
- int denoiser_input_passes = 0;
-
- vector<device_only_memory<char>> delayed_free_bvh_memory;
- thread_mutex delayed_free_bvh_mutex;
-
- public:
- OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : CUDADevice(info_, stats_, profiler_, background_),
- sbt_data(this, "__sbt", MEM_READ_ONLY),
- launch_params(this, "__params", false),
- denoiser_state(this, "__denoiser_state", true)
- {
- // Store number of CUDA streams in device info
- info.cpu_threads = DebugFlags().optix.cuda_streams;
-
- // Make the CUDA context current
- if (!cuContext) {
- return; // Do not initialize if CUDA context creation failed already
- }
- const CUDAContextScope scope(cuContext);
-
- // Create OptiX context for this device
- OptixDeviceContextOptions options = {};
-# ifdef WITH_CYCLES_LOGGING
- options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4
- options.logCallbackFunction =
- [](unsigned int level, const char *, const char *message, void *) {
- switch (level) {
- case 1:
- LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
- break;
- case 2:
- LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
- break;
- case 3:
- LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
- break;
- case 4:
- LOG_IF(INFO, VLOG_IS_ON(1)) << message;
- break;
- }
- };
-# endif
- check_result_optix(optixDeviceContextCreate(cuContext, &options, &context));
-# ifdef WITH_CYCLES_LOGGING
- check_result_optix(optixDeviceContextSetLogCallback(
- context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
-# endif
-
- // Create launch streams
- cuda_stream.resize(info.cpu_threads);
- for (int i = 0; i < info.cpu_threads; ++i)
- check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING));
-
- // Fix weird compiler bug that assigns wrong size
- launch_params.data_elements = sizeof(KernelParams);
- // Allocate launch parameter buffer memory on device
- launch_params.alloc_to_device(info.cpu_threads);
- }
- ~OptiXDevice()
- {
- // Stop processing any more tasks
- task_pool.cancel();
-
- // Make CUDA context current
- const CUDAContextScope scope(cuContext);
-
- free_bvh_memory_delayed();
-
- sbt_data.free();
- texture_info.free();
- launch_params.free();
- denoiser_state.free();
-
- // Unload modules
- if (optix_module != NULL)
- optixModuleDestroy(optix_module);
- for (unsigned int i = 0; i < 2; ++i)
- if (builtin_modules[i] != NULL)
- optixModuleDestroy(builtin_modules[i]);
- for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
- if (pipelines[i] != NULL)
- optixPipelineDestroy(pipelines[i]);
-
- // Destroy launch streams
- for (CUstream stream : cuda_stream)
- cuStreamDestroy(stream);
-
- if (denoiser != NULL)
- optixDenoiserDestroy(denoiser);
-
- optixDeviceContextDestroy(context);
- }
-
- private:
- bool show_samples() const override
- {
- // Only show samples if not rendering multiple tiles in parallel
- return info.cpu_threads == 1;
- }
-
- BVHLayoutMask get_bvh_layout_mask() const override
- {
- // CUDA kernels are used when doing baking, so we need to build a BVH they can understand too!
- if (optix_module == NULL)
- return CUDADevice::get_bvh_layout_mask();
-
- // OptiX has its own internal acceleration structure format
- return BVH_LAYOUT_OPTIX;
- }
-
- string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
- bool filter,
- bool /*split*/) override
- {
- // Split kernel is not supported in OptiX
- string common_cflags = CUDADevice::compile_kernel_get_common_cflags(
- requested_features, filter, false);
-
- // Add OptiX SDK include directory to include paths
- const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
- if (optix_sdk_path) {
- common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
- }
-
- // Specialization for shader raytracing
- if (requested_features.use_shader_raytrace) {
- common_cflags += " --keep-device-functions";
- }
- else {
- common_cflags += " -D __NO_SHADER_RAYTRACE__";
- }
-
- return common_cflags;
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features) override
- {
- if (have_error()) {
- // Abort early if context creation failed already
- return false;
- }
-
- // Load CUDA modules because we need some of the utility kernels
- if (!CUDADevice::load_kernels(requested_features)) {
- return false;
- }
-
- // Baking is currently performed using CUDA, so no need to load OptiX kernels
- if (requested_features.use_baking) {
- return true;
- }
-
- const CUDAContextScope scope(cuContext);
-
- // Unload existing OptiX module and pipelines first
- if (optix_module != NULL) {
- optixModuleDestroy(optix_module);
- optix_module = NULL;
- }
- for (unsigned int i = 0; i < 2; ++i) {
- if (builtin_modules[i] != NULL) {
- optixModuleDestroy(builtin_modules[i]);
- builtin_modules[i] = NULL;
- }
- }
- for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
- if (pipelines[i] != NULL) {
- optixPipelineDestroy(pipelines[i]);
- pipelines[i] = NULL;
- }
- }
-
- OptixModuleCompileOptions module_options = {};
- module_options.maxRegisterCount = 0; // Do not set an explicit register limit
- module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
- module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
-
-# if OPTIX_ABI_VERSION >= 41
- module_options.boundValues = nullptr;
- module_options.numBoundValues = 0;
-# endif
-
- OptixPipelineCompileOptions pipeline_options = {};
- // Default to no motion blur and two-level graph, since it is the fastest option
- pipeline_options.usesMotionBlur = false;
- pipeline_options.traversableGraphFlags =
- OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
- pipeline_options.numPayloadValues = 6;
- pipeline_options.numAttributeValues = 2; // u, v
- pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
- pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h
-
-# if OPTIX_ABI_VERSION >= 36
- pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
- if (requested_features.use_hair) {
- if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
- pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
- }
- else {
- pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
- }
- }
-# endif
-
- // Keep track of whether motion blur is enabled, so as to enable/disable motion in BVH builds.
- // This is necessary since objects may be reported to have motion if the Vector pass is
- // active, but may still need to be rendered without motion blur if that isn't active as well
- motion_blur = requested_features.use_object_motion;
-
- if (motion_blur) {
- pipeline_options.usesMotionBlur = true;
- // Motion blur can insert motion transforms into the traversal graph
- // The graph is then no longer two-level, so the flags have to allow any configuration
- pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
- }
-
- { // Load and compile PTX module with OptiX kernels
- string ptx_data, ptx_filename = path_get(requested_features.use_shader_raytrace ?
- "lib/kernel_optix_shader_raytrace.ptx" :
- "lib/kernel_optix.ptx");
- if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
- if (!getenv("OPTIX_ROOT_DIR")) {
- set_error(
- "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
- "the Optix SDK to be able to compile Optix kernels on demand).");
- return false;
- }
- ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true);
- }
- if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
- set_error("Failed to load OptiX kernel from '" + ptx_filename + "'");
- return false;
- }
-
- check_result_optix_ret(optixModuleCreateFromPTX(context,
- &module_options,
- &pipeline_options,
- ptx_data.data(),
- ptx_data.size(),
- nullptr,
- 0,
- &optix_module));
- }
-
- // Create program groups
- OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
- OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
- OptixProgramGroupOptions group_options = {}; // There are no options currently
- group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_RGEN].raygen.module = optix_module;
- // Ignore branched integrator for now (see "requested_features.use_integrator_branched")
- group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace";
- group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
- group_descs[PG_MISS].miss.module = optix_module;
- group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
- group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
- group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
- group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
- group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
-
- if (requested_features.use_hair) {
- group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
- group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
-
- // Add curve intersection programs
- if (requested_features.use_hair_thick) {
- // Use the slower programs for thick hair, even though that also slows down ribbons.
- // Ideally this should not be needed.
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
- }
- else {
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
- }
-
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
- OptixBuiltinISOptions builtin_options = {};
- builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
- builtin_options.usesMotionBlur = false;
-
- check_result_optix_ret(optixBuiltinISModuleGet(
- context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
-
- group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
- group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
-
- if (motion_blur) {
- builtin_options.usesMotionBlur = true;
-
- check_result_optix_ret(optixBuiltinISModuleGet(
- context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
-
- group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
- group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
- group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
- group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
- }
- }
-# endif
- }
-
- if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
- // Add hit group for local intersections
- group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
- }
-
- if (requested_features.use_baking) {
- group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_BAKE].raygen.module = optix_module;
- group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake";
- }
-
- if (requested_features.use_true_displacement) {
- group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_DISP].raygen.module = optix_module;
- group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace";
- }
-
- if (requested_features.use_background_light) {
- group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_BACK].raygen.module = optix_module;
- group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
- }
-
- // Shader raytracing replaces some functions with direct callables
- if (requested_features.use_shader_raytrace) {
- group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 0].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes";
- group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 1].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 1].callables.entryFunctionNameDC =
- "__direct_callable__kernel_volume_shadow";
- group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 2].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 2].callables.entryFunctionNameDC =
- "__direct_callable__subsurface_scatter_multi_setup";
- }
-
- check_result_optix_ret(optixProgramGroupCreate(
- context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
-
- // Get program stack sizes
- OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
- // Set up SBT, which in this case is used only to select between different programs
- sbt_data.alloc(NUM_PROGRAM_GROUPS);
- memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
- for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
- check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
- check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
- }
- sbt_data.copy_to_device(); // Upload SBT to device
-
- // Calculate maximum trace continuation stack size
- unsigned int trace_css = stack_size[PG_HITD].cssCH;
- // This is based on the maximum of closest-hit and any-hit/intersection programs
- trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
- trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
- trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
-# if OPTIX_ABI_VERSION >= 36
- trace_css = std::max(trace_css,
- stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
- trace_css = std::max(trace_css,
- stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
-# endif
-
- OptixPipelineLinkOptions link_options = {};
- link_options.maxTraceDepth = 1;
- link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
-# if OPTIX_ABI_VERSION < 24
- link_options.overrideUsesMotionBlur = motion_blur;
-# endif
-
- { // Create path tracing pipeline
- vector<OptixProgramGroup> pipeline_groups;
- pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
- pipeline_groups.push_back(groups[PG_RGEN]);
- pipeline_groups.push_back(groups[PG_MISS]);
- pipeline_groups.push_back(groups[PG_HITD]);
- pipeline_groups.push_back(groups[PG_HITS]);
- pipeline_groups.push_back(groups[PG_HITL]);
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur) {
- pipeline_groups.push_back(groups[PG_HITD_MOTION]);
- pipeline_groups.push_back(groups[PG_HITS_MOTION]);
- }
-# endif
- if (requested_features.use_shader_raytrace) {
- pipeline_groups.push_back(groups[PG_CALL + 0]);
- pipeline_groups.push_back(groups[PG_CALL + 1]);
- pipeline_groups.push_back(groups[PG_CALL + 2]);
- }
-
- check_result_optix_ret(optixPipelineCreate(context,
- &pipeline_options,
- &link_options,
- pipeline_groups.data(),
- pipeline_groups.size(),
- nullptr,
- 0,
- &pipelines[PIP_PATH_TRACE]));
-
- // Combine ray generation and trace continuation stack size
- const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
- // Max direct callable depth is one of the following, so combine accordingly
- // - __raygen__ -> svm_eval_nodes
- // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes
- // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes
- const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
- std::max(stack_size[PG_CALL + 1].dssDC,
- stack_size[PG_CALL + 2].dssDC);
-
- // Set stack size depending on pipeline options
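- // The last argument below is the maximum traversable graph depth: with motion blur
- // traversal can go IAS -> motion transform -> GAS (3 levels), otherwise just
- // IAS -> GAS (2 levels).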
- check_result_optix_ret(
- optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE],
- 0,
- requested_features.use_shader_raytrace ? dss : 0,
- css,
- motion_blur ? 3 : 2));
- }
-
- // Only need to create shader evaluation pipeline if one of these features is used:
- const bool use_shader_eval_pipeline = requested_features.use_baking ||
- requested_features.use_background_light ||
- requested_features.use_true_displacement;
-
- if (use_shader_eval_pipeline) { // Create shader evaluation pipeline
- vector<OptixProgramGroup> pipeline_groups;
- pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
- pipeline_groups.push_back(groups[PG_BAKE]);
- pipeline_groups.push_back(groups[PG_DISP]);
- pipeline_groups.push_back(groups[PG_BACK]);
- pipeline_groups.push_back(groups[PG_MISS]);
- pipeline_groups.push_back(groups[PG_HITD]);
- pipeline_groups.push_back(groups[PG_HITS]);
- pipeline_groups.push_back(groups[PG_HITL]);
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur) {
- pipeline_groups.push_back(groups[PG_HITD_MOTION]);
- pipeline_groups.push_back(groups[PG_HITS_MOTION]);
- }
-# endif
- if (requested_features.use_shader_raytrace) {
- pipeline_groups.push_back(groups[PG_CALL + 0]);
- pipeline_groups.push_back(groups[PG_CALL + 1]);
- pipeline_groups.push_back(groups[PG_CALL + 2]);
- }
-
- check_result_optix_ret(optixPipelineCreate(context,
- &pipeline_options,
- &link_options,
- pipeline_groups.data(),
- pipeline_groups.size(),
- nullptr,
- 0,
- &pipelines[PIP_SHADER_EVAL]));
-
- // Calculate continuation stack size based on the maximum of all ray generation stack sizes
- const unsigned int css = std::max(stack_size[PG_BAKE].cssRG,
- std::max(stack_size[PG_DISP].cssRG,
- stack_size[PG_BACK].cssRG)) +
- link_options.maxTraceDepth * trace_css;
- const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
- std::max(stack_size[PG_CALL + 1].dssDC,
- stack_size[PG_CALL + 2].dssDC);
-
- check_result_optix_ret(
- optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL],
- 0,
- requested_features.use_shader_raytrace ? dss : 0,
- css,
- motion_blur ? 3 : 2));
- }
-
- // Clean up program group objects
- for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
- optixProgramGroupDestroy(groups[i]);
- }
-
- return true;
- }
-
- void thread_run(DeviceTask &task, int thread_index) // Main task entry point
- {
- if (have_error())
- return; // Abort early if there was an error previously
-
- if (task.type == DeviceTask::RENDER) {
- if (thread_index != 0) {
- // Only execute denoising in a single thread (see also 'task_add')
- task.tile_types &= ~RenderTile::DENOISE;
- }
-
- RenderTile tile;
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE)
- launch_render(task, tile, thread_index);
- else if (tile.task == RenderTile::BAKE) {
- // Perform baking using CUDA, since it is not currently implemented in OptiX
- device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
- CUDADevice::render(task, tile, work_tiles);
- }
- else if (tile.task == RenderTile::DENOISE)
- launch_denoise(task, tile);
- task.release_tile(tile);
- if (task.get_cancel() && !task.need_finish_queue)
- break; // User requested cancellation
- else if (have_error())
- break; // Abort rendering when encountering an error
- }
- }
- else if (task.type == DeviceTask::SHADER) {
- // CUDA kernels are used when doing baking
- if (optix_module == NULL)
- CUDADevice::shader(task);
- else
- launch_shader_eval(task, thread_index);
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- // Set up a single tile that covers the whole task and denoise it
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- launch_denoise(task, tile);
- }
- }
-
- void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index)
- {
- assert(thread_index < launch_params.data_size);
-
- // Keep track of total render time of this tile
- const scoped_timer timer(&rtile.buffers->render_time);
-
- WorkTile wtile;
- wtile.x = rtile.x;
- wtile.y = rtile.y;
- wtile.w = rtile.w;
- wtile.h = rtile.h;
- wtile.offset = rtile.offset;
- wtile.stride = rtile.stride;
- wtile.buffer = (float *)rtile.buffer;
-
- const int end_sample = rtile.start_sample + rtile.num_samples;
- // Keep this number reasonable to avoid running into TDRs (GPU driver timeout resets)
- int step_samples = (info.display_device ? 8 : 32);
-
- // Offset into launch params buffer so that streams use separate data
- device_ptr launch_params_ptr = launch_params.device_pointer +
- thread_index * launch_params.data_elements;
-
- const CUDAContextScope scope(cuContext);
-
- for (int sample = rtile.start_sample; sample < end_sample;) {
- // Copy work tile information to device
- wtile.start_sample = sample;
- wtile.num_samples = step_samples;
- if (task.adaptive_sampling.use) {
- wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
- }
- wtile.num_samples = min(wtile.num_samples, end_sample - sample);
- device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
- check_result_cuda(
- cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
-
- OptixShaderBindingTable sbt_params = {};
- sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
- sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
- sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
- sbt_params.missRecordCount = 1;
- sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
- sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-# if OPTIX_ABI_VERSION >= 36
- sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
-# else
- sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
-# endif
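- // Records in 'sbt_data' follow the program group enum order, so the hit group
- // region is a contiguous run of records starting at PG_HITD (with the motion
- // variants and PG_HITL following when present), which the counts above select.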
- sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
- sbt_params.callablesRecordCount = 3;
- sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
-
- // Launch the ray generation program
- check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
- cuda_stream[thread_index],
- launch_params_ptr,
- launch_params.data_elements,
- &sbt_params,
- // Launch with samples close to each other for better locality
- wtile.w * wtile.num_samples,
- wtile.h,
- 1));
-
- // Run the adaptive sampling kernels at selected samples aligned to step samples.
- uint filter_sample = wtile.start_sample + wtile.num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
- }
-
- // Wait for launch to finish
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
-
- // Update current sample, so it is displayed correctly
- sample += wtile.num_samples;
- rtile.sample = sample;
- // Update task progress after the kernel completed rendering
- task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);
-
- if (task.get_cancel() && !task.need_finish_queue)
- return; // Cancel rendering
- }
-
- // Finalize adaptive sampling
- if (task.adaptive_sampling.use) {
- device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
- adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
- task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples);
- }
- }
-
- bool launch_denoise(DeviceTask &task, RenderTile &rtile)
- {
- // Update current sample (for display and NLM denoising task)
- rtile.sample = rtile.start_sample + rtile.num_samples;
-
- // Make CUDA context current now, since it is used for both denoising tasks
- const CUDAContextScope scope(cuContext);
-
- // Choose between OptiX and NLM denoising
- if (task.denoising.type == DENOISER_OPTIX) {
- // Map neighboring tiles onto this device. Indices are laid out as follows,
- // where index 4 is the center tile and index 9 is the target for the result:
- // 0 1 2
- // 3 4 5
- // 6 7 8 9
- RenderTileNeighbors neighbors(rtile);
- task.map_neighbor_tiles(neighbors, this);
- RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
- RenderTile &target_tile = neighbors.target;
- rtile = center_tile; // Tile may have been modified by mapping code
-
- // Calculate size of the tile to denoise (including overlap)
- int4 rect = center_tile.bounds();
- // Overlap between tiles has to be at least 64 pixels
- // TODO(pmours): Query this value from OptiX
- rect = rect_expand(rect, 64);
- int4 clip_rect = neighbors.bounds();
- rect = rect_clip(rect, clip_rect);
- int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
- int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y);
-
- // Calculate byte offsets and strides
- int pixel_stride = task.pass_stride * (int)sizeof(float);
- int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride;
- const int pass_offset[3] = {
- (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float),
- (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float),
- (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)};
-
- // Start with the current tile pointer offset
- int input_stride = pixel_stride;
- device_ptr input_ptr = rtile.buffer + pixel_offset;
-
- // Copy tile data into a common buffer if necessary
- device_only_memory<float> input(this, "denoiser input", true);
- device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY);
-
- bool contiguous_memory = true;
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) {
- contiguous_memory = false;
- }
- }
-
- if (contiguous_memory) {
- // Tiles are in contiguous memory, so the overlap offset can simply be subtracted
- input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride;
- // Stride covers the whole width of the image and not just a single tile
- input_stride *= rtile.stride;
- }
- else {
- // Adjacent tiles are in separate memory regions, so they need to be copied into a single one
- input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride);
- // Start with the new input buffer
- input_ptr = input.device_pointer;
- // Stride covers the width of the new input buffer, which includes tile width and overlap
- input_stride *= rect_size.x;
-
- TileInfo *tile_info = tile_info_mem.alloc(1);
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- tile_info->offsets[i] = neighbors.tiles[i].offset;
- tile_info->strides[i] = neighbors.tiles[i].stride;
- tile_info->buffers[i] = neighbors.tiles[i].buffer;
- }
- tile_info->x[0] = neighbors.tiles[3].x;
- tile_info->x[1] = neighbors.tiles[4].x;
- tile_info->x[2] = neighbors.tiles[5].x;
- tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
- tile_info->y[0] = neighbors.tiles[1].y;
- tile_info->y[1] = neighbors.tiles[4].y;
- tile_info->y[2] = neighbors.tiles[7].y;
- tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
- tile_info_mem.copy_to_device();
-
- void *args[] = {
- &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
- launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args);
- }
-
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- device_only_memory<float> input_rgb(this, "denoiser input rgb", true);
- input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);
-
- void *input_args[] = {&input_rgb.device_pointer,
- &input_ptr,
- &rect_size.x,
- &rect_size.y,
- &input_stride,
- &task.pass_stride,
- const_cast<int *>(pass_offset),
- &task.denoising.input_passes,
- &rtile.sample};
- launch_filter_kernel(
- "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args);
-
- input_ptr = input_rgb.device_pointer;
- pixel_stride = 3 * sizeof(float);
- input_stride = rect_size.x * pixel_stride;
-# endif
-
- const bool recreate_denoiser = (denoiser == NULL) ||
- (task.denoising.input_passes != denoiser_input_passes);
- if (recreate_denoiser) {
- // Destroy existing handle before creating new one
- if (denoiser != NULL) {
- optixDenoiserDestroy(denoiser);
- }
-
- // Create OptiX denoiser handle on demand when it is first used
- OptixDenoiserOptions denoiser_options = {};
- assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3);
-# if OPTIX_ABI_VERSION >= 47
- denoiser_options.guideAlbedo = task.denoising.input_passes >= 2;
- denoiser_options.guideNormal = task.denoising.input_passes >= 3;
- check_result_optix_ret(optixDenoiserCreate(
- context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser));
-# else
- denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
- OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1));
-# if OPTIX_ABI_VERSION < 28
- denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
-# endif
- check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
- check_result_optix_ret(
- optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
-# endif
-
- // OptiX denoiser handle was created with the requested number of input passes
- denoiser_input_passes = task.denoising.input_passes;
- }
-
- OptixDenoiserSizes sizes = {};
- check_result_optix_ret(
- optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));
-
-# if OPTIX_ABI_VERSION < 28
- const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
-# else
- const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
-# endif
- const size_t scratch_offset = sizes.stateSizeInBytes;
-
- // Allocate denoiser state if tile size has changed since last setup
- if (recreate_denoiser || (denoiser_state.data_width != rect_size.x ||
- denoiser_state.data_height != rect_size.y)) {
- denoiser_state.alloc_to_device(scratch_offset + scratch_size);
-
- // Initialize denoiser state for the current tile size
- check_result_optix_ret(optixDenoiserSetup(denoiser,
- 0,
- rect_size.x,
- rect_size.y,
- denoiser_state.device_pointer,
- scratch_offset,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-
- denoiser_state.data_width = rect_size.x;
- denoiser_state.data_height = rect_size.y;
- }
-
- // Set up input and output layer information
- OptixImage2D input_layers[3] = {};
- OptixImage2D output_layers[1] = {};
-
- for (int i = 0; i < 3; ++i) {
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i);
-# else
- input_layers[i].data = input_ptr + pass_offset[i];
-# endif
- input_layers[i].width = rect_size.x;
- input_layers[i].height = rect_size.y;
- input_layers[i].rowStrideInBytes = input_stride;
- input_layers[i].pixelStrideInBytes = pixel_stride;
- input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3;
- }
-
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- output_layers[0].data = input_ptr;
- output_layers[0].width = rect_size.x;
- output_layers[0].height = rect_size.y;
- output_layers[0].rowStrideInBytes = input_stride;
- output_layers[0].pixelStrideInBytes = pixel_stride;
- int2 output_offset = overlap_offset;
- overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually
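- // The whole rect is denoised in place instead, and the saved 'output_offset' is
- // applied below by 'kernel_cuda_filter_convert_from_rgb' when copying the result
- // into the target tile.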
-# else
- output_layers[0].data = target_tile.buffer + pixel_offset;
- output_layers[0].width = target_tile.w;
- output_layers[0].height = target_tile.h;
- output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride;
- output_layers[0].pixelStrideInBytes = pixel_stride;
-# endif
- output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3;
-
-# if OPTIX_ABI_VERSION >= 47
- OptixDenoiserLayer image_layers = {};
- image_layers.input = input_layers[0];
- image_layers.output = output_layers[0];
-
- OptixDenoiserGuideLayer guide_layers = {};
- guide_layers.albedo = input_layers[1];
- guide_layers.normal = input_layers[2];
-# endif
-
- // Finally run denoising
- OptixDenoiserParams params = {}; // All parameters are disabled/zero
-# if OPTIX_ABI_VERSION >= 47
- check_result_optix_ret(optixDenoiserInvoke(denoiser,
- NULL,
- &params,
- denoiser_state.device_pointer,
- scratch_offset,
- &guide_layers,
- &image_layers,
- 1,
- overlap_offset.x,
- overlap_offset.y,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-# else
- check_result_optix_ret(optixDenoiserInvoke(denoiser,
- NULL,
- &params,
- denoiser_state.device_pointer,
- scratch_offset,
- input_layers,
- task.denoising.input_passes,
- overlap_offset.x,
- overlap_offset.y,
- output_layers,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-# endif
-
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- void *output_args[] = {&input_ptr,
- &target_tile.buffer,
- &output_offset.x,
- &output_offset.y,
- &rect_size.x,
- &rect_size.y,
- &target_tile.x,
- &target_tile.y,
- &target_tile.w,
- &target_tile.h,
- &target_tile.offset,
- &target_tile.stride,
- &task.pass_stride,
- &rtile.sample};
- launch_filter_kernel(
- "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args);
-# endif
-
- check_result_cuda_ret(cuStreamSynchronize(0));
-
- task.unmap_neighbor_tiles(neighbors, this);
- }
- else {
- // Run CUDA denoising kernels
- DenoisingTask denoising(this, task);
- CUDADevice::denoise(rtile, denoising);
- }
-
- // Update task progress after the denoiser completed processing
- task.update_progress(&rtile, rtile.w * rtile.h);
-
- return true;
- }
-
- void launch_shader_eval(DeviceTask &task, int thread_index)
- {
- unsigned int rgen_index = PG_BACK;
- if (task.shader_eval_type >= SHADER_EVAL_BAKE)
- rgen_index = PG_BAKE;
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
- rgen_index = PG_DISP;
-
- const CUDAContextScope scope(cuContext);
-
- device_ptr launch_params_ptr = launch_params.device_pointer +
- thread_index * launch_params.data_elements;
-
- for (int sample = 0; sample < task.num_samples; ++sample) {
- ShaderParams params;
- params.input = (uint4 *)task.shader_input;
- params.output = (float4 *)task.shader_output;
- params.type = task.shader_eval_type;
- params.filter = task.shader_filter;
- params.sx = task.shader_x;
- params.offset = task.offset;
- params.sample = sample;
-
- check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader),
- &params,
- sizeof(params),
- cuda_stream[thread_index]));
-
- OptixShaderBindingTable sbt_params = {};
- sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord);
- sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
- sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
- sbt_params.missRecordCount = 1;
- sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
- sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-# if OPTIX_ABI_VERSION >= 36
- sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
-# else
- sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
-# endif
- sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
- sbt_params.callablesRecordCount = 3;
- sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
-
- check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
- cuda_stream[thread_index],
- launch_params_ptr,
- launch_params.data_elements,
- &sbt_params,
- task.shader_w,
- 1,
- 1));
-
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
-
- task.update_progress(NULL);
- }
- }
-
- bool build_optix_bvh(BVHOptiX *bvh,
- OptixBuildOperation operation,
- const OptixBuildInput &build_input,
- uint16_t num_motion_steps)
- {
- /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
- * from running out of memory (since both original and compacted acceleration structure memory
- * may be allocated at the same time for the duration of this function). The builds would
- * otherwise happen on the same CUDA stream anyway. */
- static thread_mutex mutex;
- thread_scoped_lock lock(mutex);
-
- const CUDAContextScope scope(cuContext);
-
- const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC);
-
- // Compute memory usage
- OptixAccelBufferSizes sizes = {};
- OptixAccelBuildOptions options = {};
- options.operation = operation;
- if (use_fast_trace_bvh) {
- VLOG(2) << "Using fast to trace OptiX BVH";
- options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
- }
- else {
- VLOG(2) << "Using fast to update OptiX BVH";
- options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
- }
-
- options.motionOptions.numKeys = num_motion_steps;
- options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
- options.motionOptions.timeBegin = 0.0f;
- options.motionOptions.timeEnd = 1.0f;
-
- check_result_optix_ret(
- optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
-
- // Allocate required output buffers
- device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
- temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
- if (!temp_mem.device_pointer)
- return false; // Make sure temporary memory allocation succeeded
-
- // Acceleration structure memory has to be allocated on the device (not allowed to be on host)
- device_only_memory<char> &out_data = bvh->as_data;
- if (operation == OPTIX_BUILD_OPERATION_BUILD) {
- assert(out_data.device == this);
- out_data.alloc_to_device(sizes.outputSizeInBytes);
- if (!out_data.device_pointer)
- return false;
- }
- else {
- assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
- }
-
- // Finally build the acceleration structure
- OptixAccelEmitDesc compacted_size_prop = {};
- compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
- // A tiny space was allocated for this property at the end of the temporary buffer above
- // Make sure this pointer is 8-byte aligned
- compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
-
- OptixTraversableHandle out_handle = 0;
- check_result_optix_ret(optixAccelBuild(context,
- NULL,
- &options,
- &build_input,
- 1,
- temp_mem.device_pointer,
- sizes.tempSizeInBytes,
- out_data.device_pointer,
- sizes.outputSizeInBytes,
- &out_handle,
- use_fast_trace_bvh ? &compacted_size_prop : NULL,
- use_fast_trace_bvh ? 1 : 0));
- bvh->traversable_handle = static_cast<uint64_t>(out_handle);
-
- // Wait for all operations to finish
- check_result_cuda_ret(cuStreamSynchronize(NULL));
-
- // Compact acceleration structure to save memory (only if using fast trace as the
- // OPTIX_BUILD_FLAG_ALLOW_COMPACTION flag is only set in this case).
- if (use_fast_trace_bvh) {
- uint64_t compacted_size = sizes.outputSizeInBytes;
- check_result_cuda_ret(
- cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
-
- // Temporary memory is no longer needed, so free it now to make space
- temp_mem.free();
-
- // There is no point compacting if the size does not decrease
- if (compacted_size < sizes.outputSizeInBytes) {
- device_only_memory<char> compacted_data(this, "optix compacted as", false);
- compacted_data.alloc_to_device(compacted_size);
- if (!compacted_data.device_pointer)
- // Do not compact if memory allocation for compacted acceleration structure fails
- // Can just use the uncompacted one then, so succeed here regardless
- return true;
-
- check_result_optix_ret(optixAccelCompact(context,
- NULL,
- out_handle,
- compacted_data.device_pointer,
- compacted_size,
- &out_handle));
- bvh->traversable_handle = static_cast<uint64_t>(out_handle);
-
- // Wait for compaction to finish
- check_result_cuda_ret(cuStreamSynchronize(NULL));
-
- std::swap(out_data.device_size, compacted_data.device_size);
- std::swap(out_data.device_pointer, compacted_data.device_pointer);
- // Original acceleration structure memory is freed when 'compacted_data' goes out of scope
- }
- }
-
- return true;
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
- if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
- /* For baking CUDA is used, build appropriate BVH for that. */
- Device::build_bvh(bvh, progress, refit);
- return;
- }
-
- const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC);
-
- free_bvh_memory_delayed();
-
- BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
-
- progress.set_substatus("Building OptiX acceleration structure");
-
- if (!bvh->params.top_level) {
- assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
-
- OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
- /* Refit is only possible when not using a fast-to-trace BVH, because the AS is
- * built with OPTIX_BUILD_FLAG_ALLOW_UPDATE only in that case (see above). */
- if (refit && !use_fast_trace_bvh) {
- assert(bvh_optix->traversable_handle != 0);
- operation = OPTIX_BUILD_OPERATION_UPDATE;
- }
- else {
- bvh_optix->as_data.free();
- bvh_optix->traversable_handle = 0;
- }
-
- // Build bottom level acceleration structures (BLAS)
- Geometry *const geom = bvh->geometry[0];
- if (geom->geometry_type == Geometry::HAIR) {
- // Build BLAS for curve primitives
- Hair *const hair = static_cast<Hair *const>(geom);
- if (hair->num_curves() == 0) {
- return;
- }
-
- const size_t num_segments = hair->num_segments();
-
- size_t num_motion_steps = 1;
- Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
- num_motion_steps = hair->get_motion_steps();
- }
-
- device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
-# if OPTIX_ABI_VERSION >= 36
- device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
- device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
- // Four control points for each curve segment
- const size_t num_vertices = num_segments * 4;
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- index_data.alloc(num_segments);
- vertex_data.alloc(num_vertices * num_motion_steps);
- }
- else
-# endif
- aabb_data.alloc(num_segments * num_motion_steps);
-
- // Get AABBs for each motion step
- for (size_t step = 0; step < num_motion_steps; ++step) {
- // The center step for motion vertices is not stored in the attribute
- const float3 *keys = hair->get_curve_keys().data();
- size_t center_step = (num_motion_steps - 1) / 2;
- if (step != center_step) {
- size_t attr_offset = (step > center_step) ? step - 1 : step;
- // Technically this is a float4 array, but sizeof(float3) == sizeof(float4)
- keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
- }
-
- for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
- const Hair::Curve curve = hair->get_curve(j);
-# if OPTIX_ABI_VERSION >= 36
- const array<float> &curve_radius = hair->get_curve_radius();
-# endif
-
- for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- int k0 = curve.first_key + segment;
- int k1 = k0 + 1;
- int ka = max(k0 - 1, curve.first_key);
- int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
-
- const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
- const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
- const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
- const float4 pw = make_float4(
- curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
-
- // Convert Catmull-Rom data to a cubic B-spline
- static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
- static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
- static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
- static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
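- // Each conversion row sums to one (e.g. (+7 - 4 + 5 - 2) / 6 = 1), so the basis
- // change is affine: it maps the Catmull-Rom control points onto B-spline control
- // points describing the exact same curve.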
-
- index_data[i] = i * 4;
- float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
- v[0] = make_float4(
- dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
- v[1] = make_float4(
- dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
- v[2] = make_float4(
- dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
- v[3] = make_float4(
- dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
- }
- else
-# endif
- {
- BoundBox bounds = BoundBox::empty;
- curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
-
- const size_t index = step * num_segments + i;
- aabb_data[index].minX = bounds.min.x;
- aabb_data[index].minY = bounds.min.y;
- aabb_data[index].minZ = bounds.min.z;
- aabb_data[index].maxX = bounds.max.x;
- aabb_data[index].maxY = bounds.max.y;
- aabb_data[index].maxZ = bounds.max.z;
- }
- }
- }
- }
-
- // Upload AABB data to GPU
- aabb_data.copy_to_device();
-# if OPTIX_ABI_VERSION >= 36
- index_data.copy_to_device();
- vertex_data.copy_to_device();
-# endif
-
- vector<device_ptr> aabb_ptrs;
- aabb_ptrs.reserve(num_motion_steps);
-# if OPTIX_ABI_VERSION >= 36
- vector<device_ptr> width_ptrs;
- vector<device_ptr> vertex_ptrs;
- width_ptrs.reserve(num_motion_steps);
- vertex_ptrs.reserve(num_motion_steps);
-# endif
- for (size_t step = 0; step < num_motion_steps; ++step) {
- aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
-# if OPTIX_ABI_VERSION >= 36
- const device_ptr base_ptr = vertex_data.device_pointer +
- step * num_vertices * sizeof(float4);
- width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size
- vertex_ptrs.push_back(base_ptr);
-# endif
- }
-
- // Force a single any-hit call, so shadow record-all behavior works correctly
- unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
- OptixBuildInput build_input = {};
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
- build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
- build_input.curveArray.numPrimitives = num_segments;
- build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
- build_input.curveArray.numVertices = num_vertices;
- build_input.curveArray.vertexStrideInBytes = sizeof(float4);
- build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
- build_input.curveArray.widthStrideInBytes = sizeof(float4);
- build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
- build_input.curveArray.indexStrideInBytes = sizeof(int);
- build_input.curveArray.flag = build_flags;
- build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
- }
- else
-# endif
- {
- // Disable visibility test any-hit program, since it is already checked during
- // intersection. Those trace calls that require anyhit can force it with a ray flag.
- build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
-
- build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
-# if OPTIX_ABI_VERSION < 23
- build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
- build_input.aabbArray.numPrimitives = num_segments;
- build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
- build_input.aabbArray.flags = &build_flags;
- build_input.aabbArray.numSbtRecords = 1;
- build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
-# else
- build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
- build_input.customPrimitiveArray.numPrimitives = num_segments;
- build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
- build_input.customPrimitiveArray.flags = &build_flags;
- build_input.customPrimitiveArray.numSbtRecords = 1;
- build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
-# endif
- }
-
- if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- }
- else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
- // Build BLAS for triangle primitives
- Mesh *const mesh = static_cast<Mesh *const>(geom);
- if (mesh->num_triangles() == 0) {
- return;
- }
-
- const size_t num_verts = mesh->get_verts().size();
-
- size_t num_motion_steps = 1;
- Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
- num_motion_steps = mesh->get_motion_steps();
- }
-
- device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
- index_data.alloc(mesh->get_triangles().size());
- memcpy(index_data.data(),
- mesh->get_triangles().data(),
- mesh->get_triangles().size() * sizeof(int));
- device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
- vertex_data.alloc(num_verts * num_motion_steps);
-
- for (size_t step = 0; step < num_motion_steps; ++step) {
- const float3 *verts = mesh->get_verts().data();
-
- size_t center_step = (num_motion_steps - 1) / 2;
- // The center step for motion vertices is not stored in the attribute
- if (step != center_step) {
- verts = motion_keys->data_float3() +
- (step > center_step ? step - 1 : step) * num_verts;
- }
-
- memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
- }
-
- // Upload triangle data to GPU
- index_data.copy_to_device();
- vertex_data.copy_to_device();
-
- vector<device_ptr> vertex_ptrs;
- vertex_ptrs.reserve(num_motion_steps);
- for (size_t step = 0; step < num_motion_steps; ++step) {
- vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
- }
-
- // Force a single any-hit call, so shadow record-all behavior works correctly
- unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
- OptixBuildInput build_input = {};
- build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
- build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
- build_input.triangleArray.numVertices = num_verts;
- build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
- build_input.triangleArray.vertexStrideInBytes = sizeof(float3);
- build_input.triangleArray.indexBuffer = index_data.device_pointer;
- build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
- build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
- build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
- build_input.triangleArray.flags = &build_flags;
- // The SBT does not store per primitive data since Cycles already allocates separate
- // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
- // one and rely on that having the same meaning in this case.
- build_input.triangleArray.numSbtRecords = 1;
- build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
-
- if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- }
- }
- else {
- unsigned int num_instances = 0;
- unsigned int max_num_instances = 0xFFFFFFFF;
-
- bvh_optix->as_data.free();
- bvh_optix->traversable_handle = 0;
- bvh_optix->motion_transform_data.free();
-
- optixDeviceContextGetProperty(context,
- OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
- &max_num_instances,
- sizeof(max_num_instances));
- // Do not count first bit, which is used to distinguish instanced and non-instanced objects
- max_num_instances >>= 1;
- if (bvh->objects.size() > max_num_instances) {
- progress.set_error(
- "Failed to build OptiX acceleration structure because there are too many instances");
- return;
- }
-
- // Fill instance descriptions
-# if OPTIX_ABI_VERSION < 41
- device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY);
- aabbs.alloc(bvh->objects.size());
-# endif
- device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
- instances.alloc(bvh->objects.size());
-
- // Calculate total motion transform size and allocate memory for them
- size_t motion_transform_offset = 0;
- if (motion_blur) {
- size_t total_motion_transform_size = 0;
- for (Object *const ob : bvh->objects) {
- if (ob->is_traceable() && ob->use_motion()) {
- total_motion_transform_size = align_up(total_motion_transform_size,
- OPTIX_TRANSFORM_BYTE_ALIGNMENT);
- const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
- total_motion_transform_size = total_motion_transform_size +
- sizeof(OptixSRTMotionTransform) +
- motion_keys * sizeof(OptixSRTData);
- }
- }
-
- assert(bvh_optix->motion_transform_data.device == this);
- bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
- }
-
- for (Object *ob : bvh->objects) {
- // Skip non-traceable objects
- if (!ob->is_traceable())
- continue;
-
- BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
- OptixTraversableHandle handle = blas->traversable_handle;
-
-# if OPTIX_ABI_VERSION < 41
- OptixAabb &aabb = aabbs[num_instances];
- aabb.minX = ob->bounds.min.x;
- aabb.minY = ob->bounds.min.y;
- aabb.minZ = ob->bounds.min.z;
- aabb.maxX = ob->bounds.max.x;
- aabb.maxY = ob->bounds.max.y;
- aabb.maxZ = ob->bounds.max.z;
-# endif
-
- OptixInstance &instance = instances[num_instances++];
- memset(&instance, 0, sizeof(instance));
-
- // Clear transform to identity matrix
- instance.transform[0] = 1.0f;
- instance.transform[5] = 1.0f;
- instance.transform[10] = 1.0f;
-
- // Set user instance ID to object index (but leave low bit blank)
- instance.instanceId = ob->get_device_index() << 1;
-
- // At least one bit has to be set in the mask, or else the instance would always be culled
- instance.visibilityMask = 1;
-
- if (ob->get_geometry()->has_volume) {
- // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
- instance.visibilityMask |= 2;
- }
-
- if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
- // Same applies to curves (so they can be skipped in local trace calls)
- instance.visibilityMask |= 4;
-
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur && ob->get_geometry()->has_motion_blur() &&
- DebugFlags().optix.curves_api &&
- static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
- // Select between motion blur and non-motion blur built-in intersection module
- instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
- }
-# endif
- }
-
- // Insert motion traversable if object has motion
- if (motion_blur && ob->use_motion()) {
- size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
- size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
- motion_keys * sizeof(OptixSRTData);
-
- const CUDAContextScope scope(cuContext);
-
- motion_transform_offset = align_up(motion_transform_offset,
- OPTIX_TRANSFORM_BYTE_ALIGNMENT);
- CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
- motion_transform_offset;
- motion_transform_offset += motion_transform_size;
-
- // Allocate host side memory for motion transform and fill it with transform data
- OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
- new uint8_t[motion_transform_size]);
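- // OptixSRTMotionTransform already contains two OptixSRTData keys, which is why
- // 'motion_transform_size' only adds 'get_motion().size() - 2' extra entries; the
- // raw byte allocation above accounts for that trailing array.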
- motion_transform.child = handle;
- motion_transform.motionOptions.numKeys = ob->get_motion().size();
- motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
- motion_transform.motionOptions.timeBegin = 0.0f;
- motion_transform.motionOptions.timeEnd = 1.0f;
-
- OptixSRTData *const srt_data = motion_transform.srtData;
- array<DecomposedTransform> decomp(ob->get_motion().size());
- transform_motion_decompose(
- decomp.data(), ob->get_motion().data(), ob->get_motion().size());
-
- for (size_t i = 0; i < ob->get_motion().size(); ++i) {
- // Scale
- srt_data[i].sx = decomp[i].y.w; // scale.x.x
- srt_data[i].sy = decomp[i].z.w; // scale.y.y
- srt_data[i].sz = decomp[i].w.w; // scale.z.z
-
- // Shear
- srt_data[i].a = decomp[i].z.x; // scale.x.y
- srt_data[i].b = decomp[i].z.y; // scale.x.z
- srt_data[i].c = decomp[i].w.x; // scale.y.z
- assert(decomp[i].z.z == 0.0f); // scale.y.x
- assert(decomp[i].w.y == 0.0f); // scale.z.x
- assert(decomp[i].w.z == 0.0f); // scale.z.y
-
- // Pivot point
- srt_data[i].pvx = 0.0f;
- srt_data[i].pvy = 0.0f;
- srt_data[i].pvz = 0.0f;
-
- // Rotation
- srt_data[i].qx = decomp[i].x.x;
- srt_data[i].qy = decomp[i].x.y;
- srt_data[i].qz = decomp[i].x.z;
- srt_data[i].qw = decomp[i].x.w;
-
- // Translation
- srt_data[i].tx = decomp[i].y.x;
- srt_data[i].ty = decomp[i].y.y;
- srt_data[i].tz = decomp[i].y.z;
- }
-
- // Upload motion transform to GPU
- cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
- delete[] reinterpret_cast<uint8_t *>(&motion_transform);
-
- // Disable instance transform if object uses motion transform already
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
-
- // Get traversable handle to motion transform
- optixConvertPointerToTraversableHandle(context,
- motion_transform_gpu,
- OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
- &instance.traversableHandle);
- }
- else {
- instance.traversableHandle = handle;
-
- if (ob->get_geometry()->is_instanced()) {
- // Set transform matrix
- memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
- }
- else {
- // Disable instance transform if geometry already has it applied to vertex data
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
- // Non-instanced objects read ID from 'prim_object', so distinguish
- // them from instanced objects with the low bit set
- instance.instanceId |= 1;
- }
- }
- }
-
- // Upload instance descriptions
-# if OPTIX_ABI_VERSION < 41
- aabbs.resize(num_instances);
- aabbs.copy_to_device();
-# endif
- instances.resize(num_instances);
- instances.copy_to_device();
-
- // Build top-level acceleration structure (TLAS)
- OptixBuildInput build_input = {};
- build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
-# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2
- build_input.instanceArray.aabbs = aabbs.device_pointer;
- build_input.instanceArray.numAabbs = num_instances;
-# endif
- build_input.instanceArray.instances = instances.device_pointer;
- build_input.instanceArray.numInstances = num_instances;
-
- if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- tlas_handle = bvh_optix->traversable_handle;
- }
- }
-
- void release_optix_bvh(BVH *bvh) override
- {
- thread_scoped_lock lock(delayed_free_bvh_mutex);
- /* Do a delayed free of the BVH memory, since the geometry holding the BVH might be
- * deleted while the GPU is still rendering. */
- BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
-
- delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
- delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
- bvh_optix->traversable_handle = 0;
- }
-
- void free_bvh_memory_delayed()
- {
- thread_scoped_lock lock(delayed_free_bvh_mutex);
- delayed_free_bvh_memory.free_memory();
- }
-
- void const_copy_to(const char *name, void *host, size_t size) override
- {
- // Set constant memory for CUDA module
- // TODO(pmours): This is only used for tonemapping (see 'film_convert').
- // Could be removed by moving those functions to the filter CUDA module.
- CUDADevice::const_copy_to(name, host, size);
-
- if (strcmp(name, "__data") == 0) {
- assert(size <= sizeof(KernelData));
-
- // Update traversable handle (since it is different for each device on multi devices)
- KernelData *const data = (KernelData *)host;
- *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
-
- update_launch_params(offsetof(KernelParams, data), host, size);
- return;
- }
-
- // Update data storage pointers in launch parameters
-# define KERNEL_TEX(data_type, tex_name) \
- if (strcmp(name, #tex_name) == 0) { \
- update_launch_params(offsetof(KernelParams, tex_name), host, size); \
- return; \
- }
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
- }
-
- void update_launch_params(size_t offset, void *data, size_t data_size)
- {
- const CUDAContextScope scope(cuContext);
-
- for (int i = 0; i < info.cpu_threads; ++i)
- check_result_cuda(
- cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
- data,
- data_size));
- }
-
- void task_add(DeviceTask &task) override
- {
- // Upload texture information to device if it has changed since last launch
- load_texture_info();
-
- if (task.type == DeviceTask::FILM_CONVERT) {
- // Execute in main thread because of OpenGL access
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- return;
- }
-
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- // Execute denoising in a single thread (e.g. to avoid race conditions during creation)
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy, 0);
- });
- return;
- }
-
- // Split task into smaller ones
- list<DeviceTask> tasks;
- task.split(tasks, info.cpu_threads);
-
- // Queue tasks in internal task pool
- int task_index = 0;
- for (DeviceTask &task : tasks) {
- task_pool.push([=] {
- // Use the task index instead of the thread index, since the number of CUDA
- // streams may differ from the number of threads
- DeviceTask task_copy = task;
- thread_run(task_copy, task_index);
- });
- task_index++;
- }
- }
-
- void task_wait() override
- {
- // Wait for all queued tasks to finish
- task_pool.wait_work();
- }
-
- void task_cancel() override
- {
- // Cancel any remaining tasks in the internal pool
- task_pool.cancel();
- }
-};
-
-bool device_optix_init()
-{
- if (g_optixFunctionTable.optixDeviceContextCreate != NULL)
- return true; // Already initialized function table
-
- // Need to initialize CUDA as well
- if (!device_cuda_init())
- return false;
-
- const OptixResult result = optixInit();
-
- if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
- VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
- "Please update to the latest driver first!";
- return false;
- }
- else if (result != OPTIX_SUCCESS) {
- VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
- return false;
- }
-
- // Loaded OptiX successfully!
- return true;
-}
-
-void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
-{
- devices.reserve(cuda_devices.size());
-
- // Simply add all supported CUDA devices as OptiX devices again
- for (DeviceInfo info : cuda_devices) {
- assert(info.type == DEVICE_CUDA);
-
- int major;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
- if (major < 5) {
- continue; // Only Maxwell and up are supported by OptiX
- }
-
- info.type = DEVICE_OPTIX;
- info.id += "_OptiX";
- info.denoisers |= DENOISER_OPTIX;
- info.has_branched_path = false;
-
- devices.push_back(info);
- }
-}
-
-Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new OptiXDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
-
-#endif
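The const_copy_to() override in the deleted OptiX device above relies on an X-macro: kernel/kernel_textures.h declares each texture as KERNEL_TEX(type, name), and re-defining the macro before including that header expands into one strcmp/offsetof branch per texture. Below is a minimal, self-contained sketch of the same pattern; the struct and texture names are hypothetical stand-ins, and the KERNEL_TEX lines are written inline instead of being included from a header.

#include <cstddef>
#include <cstdio>
#include <cstring>

/* Stand-in for the launch parameter struct; the real KernelParams lists
 * every texture declared in kernel/kernel_textures.h. */
struct KernelParams {
  void *tex_noise;
  void *tex_lookup;
};

/* Resolve a texture name to its byte offset inside KernelParams. In Cycles
 * the KERNEL_TEX(...) lines come from including the texture list header;
 * here they are written out inline to keep the sketch self-contained. */
static bool param_offset(const char *name, size_t *r_offset)
{
#define KERNEL_TEX(data_type, tex_name) \
  if (strcmp(name, #tex_name) == 0) { \
    *r_offset = offsetof(KernelParams, tex_name); \
    return true; \
  }
  KERNEL_TEX(void *, tex_noise)
  KERNEL_TEX(void *, tex_lookup)
#undef KERNEL_TEX
  return false;
}

int main()
{
  size_t offset = 0;
  if (param_offset("tex_lookup", &offset)) {
    printf("tex_lookup is at offset %zu\n", offset);
  }
  return 0;
}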
diff --git a/intern/cycles/device/device_queue.cpp b/intern/cycles/device/device_queue.cpp
new file mode 100644
index 00000000000..a89ba68d62c
--- /dev/null
+++ b/intern/cycles/device/device_queue.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_queue.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_logging.h"
+#include "util/util_time.h"
+
+#include <iomanip>
+
+CCL_NAMESPACE_BEGIN
+
+DeviceQueue::DeviceQueue(Device *device)
+ : device(device), last_kernels_enqueued_(0), last_sync_time_(0.0)
+{
+ DCHECK_NE(device, nullptr);
+}
+
+DeviceQueue::~DeviceQueue()
+{
+ if (VLOG_IS_ON(3)) {
+ /* Print kernel execution times sorted by time. */
+ vector<pair<DeviceKernelMask, double>> stats_sorted;
+ for (const auto &stat : stats_kernel_time_) {
+ stats_sorted.push_back(stat);
+ }
+
+ sort(stats_sorted.begin(),
+ stats_sorted.end(),
+ [](const pair<DeviceKernelMask, double> &a, const pair<DeviceKernelMask, double> &b) {
+ return a.second > b.second;
+ });
+
+ VLOG(3) << "GPU queue stats:";
+ for (const auto &[mask, time] : stats_sorted) {
+ VLOG(3) << " " << std::setfill(' ') << std::setw(10) << std::fixed << std::setprecision(5)
+ << std::right << time << "s: " << device_kernel_mask_as_string(mask);
+ }
+ }
+}
+
+void DeviceQueue::debug_init_execution()
+{
+ if (VLOG_IS_ON(3)) {
+ last_sync_time_ = time_dt();
+ last_kernels_enqueued_ = 0;
+ }
+}
+
+void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
+{
+ if (VLOG_IS_ON(3)) {
+ VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
+ << work_size;
+ last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
+ }
+}
+
+void DeviceQueue::debug_synchronize()
+{
+ if (VLOG_IS_ON(3)) {
+ const double new_time = time_dt();
+ const double elapsed_time = new_time - last_sync_time_;
+ VLOG(4) << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s";
+
+ stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
+
+ last_sync_time_ = new_time;
+ last_kernels_enqueued_ = 0;
+ }
+}
+
+CCL_NAMESPACE_END
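The destructor and debug hooks above form a lightweight profiler: each kernel enqueued between two synchronize calls sets one bit in a 64-bit mask, and elapsed wall-clock time is accumulated per distinct mask, so the sorted dump in the destructor shows which kernel combinations dominated. A standalone sketch of that accumulation, with a placeholder kernel enum and std::chrono standing in for time_dt():

#include <chrono>
#include <cstdint>
#include <map>

/* Placeholder kernel enum; the real DeviceKernel has one entry per kernel. */
enum DeviceKernel { KERNEL_INIT = 0, KERNEL_INTERSECT = 1, KERNEL_SHADE = 2 };
typedef uint64_t DeviceKernelMask;

static std::map<DeviceKernelMask, double> stats_kernel_time;
static DeviceKernelMask last_kernels_enqueued = 0;
static auto last_sync_time = std::chrono::steady_clock::now();

void debug_enqueue(DeviceKernel kernel)
{
  /* One bit per kernel type: the mask identifies which kernels ran together. */
  last_kernels_enqueued |= uint64_t(1) << uint64_t(kernel);
}

void debug_synchronize()
{
  const auto now = std::chrono::steady_clock::now();
  /* Attribute the elapsed wall-clock time to this exact kernel combination. */
  stats_kernel_time[last_kernels_enqueued] +=
      std::chrono::duration<double>(now - last_sync_time).count();
  last_sync_time = now;
  last_kernels_enqueued = 0;
}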
diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h
new file mode 100644
index 00000000000..edda3e61d51
--- /dev/null
+++ b/intern/cycles/device/device_queue.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_kernel.h"
+
+#include "device/device_graphics_interop.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class device_memory;
+
+struct KernelWorkTile;
+
+/* Abstraction of a command queue for a device.
+ * Provides an API to schedule kernel execution in a specific queue with the minimal possible
+ * overhead from the driver side.
+ *
+ * This class encapsulates all properties needed for command execution. */
+class DeviceQueue {
+ public:
+ virtual ~DeviceQueue();
+
+  /* Number of concurrent states to process for the integrator,
+   * based on the number of cores and/or available memory. */
+ virtual int num_concurrent_states(const size_t state_size) const = 0;
+
+  /* Number of states that keep the device occupied with work without losing performance.
+   * The renderer will add more work (when available) when the number of active paths falls
+   * below this value. */
+ virtual int num_concurrent_busy_states() const = 0;
+
+ /* Initialize execution of kernels on this queue.
+ *
+ * Will, for example, load all data required by the kernels from Device to global or path state.
+ *
+ * Use this method after device synchronization has finished before enqueueing any kernels. */
+ virtual void init_execution() = 0;
+
+ /* Test if an optional device kernel is available. */
+ virtual bool kernel_available(DeviceKernel kernel) const = 0;
+
+ /* Enqueue kernel execution.
+ *
+ * Execute the kernel work_size times on the device.
+   * Supported argument types:
+ * - int: pass pointer to the int
+ * - device memory: pass pointer to device_memory.device_pointer
+ * Return false if there was an error executing this or a previous kernel. */
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) = 0;
+
+  /* Wait until all enqueued kernels have finished execution.
+ * Return false if there was an error executing any of the enqueued kernels. */
+ virtual bool synchronize() = 0;
+
+ /* Copy memory to/from device as part of the command queue, to ensure
+ * operations are done in order without having to synchronize. */
+ virtual void zero_to_device(device_memory &mem) = 0;
+ virtual void copy_to_device(device_memory &mem) = 0;
+ virtual void copy_from_device(device_memory &mem) = 0;
+
+ /* Graphics resources interoperability.
+ *
+   * Interoperability here means that the device is capable of computing the result directly
+   * into an OpenGL (or other graphics library) buffer. */
+
+  /* Create a graphics interoperability context which takes care of mapping a graphics
+   * resource as a buffer writable by kernels of this device. */
+ virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create()
+ {
+ LOG(FATAL) << "Request of GPU interop of a device which does not support it.";
+ return nullptr;
+ }
+
+ /* Device this queue has been created for. */
+ Device *device;
+
+ protected:
+ /* Hide construction so that allocation via `Device` API is enforced. */
+ explicit DeviceQueue(Device *device);
+
+ /* Implementations call these from the corresponding methods to generate debugging logs. */
+ void debug_init_execution();
+ void debug_enqueue(DeviceKernel kernel, const int work_size);
+ void debug_synchronize();
+
+  /* Combination of kernels enqueued together since the last synchronize. */
+ DeviceKernelMask last_kernels_enqueued_;
+ /* Time of synchronize call. */
+ double last_sync_time_;
+ /* Accumulated execution time for combinations of kernels launched together. */
+ map<DeviceKernelMask, double> stats_kernel_time_;
+};
+
+CCL_NAMESPACE_END
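As the enqueue() comment notes, kernel arguments are passed as an array of void pointers: an int by pointer to the int, device memory by pointer to its device_pointer member. A hedged sketch of a typical caller follows; DEVICE_KERNEL_EXAMPLE stands in for a real DeviceKernel value, and the queue and buffer are assumed to have been created through the Device API.

/* Sketch only: `queue` is a DeviceQueue obtained from a Device, and
 * `buffer` is an already-allocated device_memory. */
static bool run_example_kernel(DeviceQueue *queue, device_memory &buffer)
{
  int work_size = 1024;
  /* Ints by pointer to the int; device memory by pointer to device_pointer. */
  void *args[] = {&work_size, &buffer.device_pointer};

  queue->init_execution();
  if (!queue->enqueue(DEVICE_KERNEL_EXAMPLE, work_size, args)) {
    return false; /* This or a previous kernel failed. */
  }
  return queue->synchronize(); /* False if any enqueued kernel failed. */
}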
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
deleted file mode 100644
index 9889f688aaa..00000000000
--- a/intern/cycles/device/device_split_kernel.cpp
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_split_kernel.h"
-
-#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data_types.h"
-
-#include "util/util_logging.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-static const double alpha = 0.1; /* alpha for rolling average */
-
-DeviceSplitKernel::DeviceSplitKernel(Device *device)
- : device(device),
- split_data(device, "split_data"),
- ray_state(device, "ray_state", MEM_READ_WRITE),
- queue_index(device, "queue_index"),
- use_queues_flag(device, "use_queues_flag"),
- work_pool_wgs(device, "work_pool_wgs"),
- kernel_data_initialized(false)
-{
- avg_time_per_sample = 0.0;
-
- kernel_path_init = NULL;
- kernel_scene_intersect = NULL;
- kernel_lamp_emission = NULL;
- kernel_do_volume = NULL;
- kernel_queue_enqueue = NULL;
- kernel_indirect_background = NULL;
- kernel_shader_setup = NULL;
- kernel_shader_sort = NULL;
- kernel_shader_eval = NULL;
- kernel_holdout_emission_blurring_pathtermination_ao = NULL;
- kernel_subsurface_scatter = NULL;
- kernel_direct_lighting = NULL;
- kernel_shadow_blocked_ao = NULL;
- kernel_shadow_blocked_dl = NULL;
- kernel_enqueue_inactive = NULL;
- kernel_next_iteration_setup = NULL;
- kernel_indirect_subsurface = NULL;
- kernel_buffer_update = NULL;
- kernel_adaptive_stopping = NULL;
- kernel_adaptive_filter_x = NULL;
- kernel_adaptive_filter_y = NULL;
- kernel_adaptive_adjust_samples = NULL;
-}
-
-DeviceSplitKernel::~DeviceSplitKernel()
-{
- split_data.free();
- ray_state.free();
- use_queues_flag.free();
- queue_index.free();
- work_pool_wgs.free();
-
- delete kernel_path_init;
- delete kernel_scene_intersect;
- delete kernel_lamp_emission;
- delete kernel_do_volume;
- delete kernel_queue_enqueue;
- delete kernel_indirect_background;
- delete kernel_shader_setup;
- delete kernel_shader_sort;
- delete kernel_shader_eval;
- delete kernel_holdout_emission_blurring_pathtermination_ao;
- delete kernel_subsurface_scatter;
- delete kernel_direct_lighting;
- delete kernel_shadow_blocked_ao;
- delete kernel_shadow_blocked_dl;
- delete kernel_enqueue_inactive;
- delete kernel_next_iteration_setup;
- delete kernel_indirect_subsurface;
- delete kernel_buffer_update;
- delete kernel_adaptive_stopping;
- delete kernel_adaptive_filter_x;
- delete kernel_adaptive_filter_y;
- delete kernel_adaptive_adjust_samples;
-}
-
-bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
-#define LOAD_KERNEL(name) \
- kernel_##name = get_split_kernel_function(#name, requested_features); \
- if (!kernel_##name) { \
- device->set_error(string("Split kernel error: failed to load kernel_") + #name); \
- return false; \
- }
-
- LOAD_KERNEL(path_init);
- LOAD_KERNEL(scene_intersect);
- LOAD_KERNEL(lamp_emission);
- if (requested_features.use_volume) {
- LOAD_KERNEL(do_volume);
- }
- LOAD_KERNEL(queue_enqueue);
- LOAD_KERNEL(indirect_background);
- LOAD_KERNEL(shader_setup);
- LOAD_KERNEL(shader_sort);
- LOAD_KERNEL(shader_eval);
- LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
- LOAD_KERNEL(subsurface_scatter);
- LOAD_KERNEL(direct_lighting);
- LOAD_KERNEL(shadow_blocked_ao);
- LOAD_KERNEL(shadow_blocked_dl);
- LOAD_KERNEL(enqueue_inactive);
- LOAD_KERNEL(next_iteration_setup);
- LOAD_KERNEL(indirect_subsurface);
- LOAD_KERNEL(buffer_update);
- LOAD_KERNEL(adaptive_stopping);
- LOAD_KERNEL(adaptive_filter_x);
- LOAD_KERNEL(adaptive_filter_y);
- LOAD_KERNEL(adaptive_adjust_samples);
-
-#undef LOAD_KERNEL
-
-  /* Re-initialize kernel-dependent data when kernels change. */
- kernel_data_initialized = false;
-
- return true;
-}
-
-size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg,
- device_memory &data,
- uint64_t max_buffer_size)
-{
- uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
- VLOG(1) << "Split state element size: " << string_human_readable_number(size_per_element)
- << " bytes. (" << string_human_readable_size(size_per_element) << ").";
- return max_buffer_size / size_per_element;
-}
-
-bool DeviceSplitKernel::path_trace(DeviceTask &task,
- RenderTile &tile,
- device_memory &kgbuffer,
- device_memory &kernel_data)
-{
- if (device->have_error()) {
- return false;
- }
-
- /* Allocate all required global memory once. */
- if (!kernel_data_initialized) {
- kernel_data_initialized = true;
-
- /* Set local size */
- int2 lsize = split_kernel_local_size();
- local_size[0] = lsize[0];
- local_size[1] = lsize[1];
-
- /* Set global size */
- int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
-
-    /* Make sure the global work size is a multiple of the local
-     * work size dimensions. */
- global_size[0] = round_up(gsize[0], local_size[0]);
- global_size[1] = round_up(gsize[1], local_size[1]);
-
- int num_global_elements = global_size[0] * global_size[1];
- assert(num_global_elements % WORK_POOL_SIZE == 0);
-
- /* Calculate max groups */
-
-    /* Denotes the maximum number of work groups possible w.r.t. the currently requested tile size. */
- unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU :
- WORK_POOL_SIZE_GPU;
- unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
-
- /* Allocate work_pool_wgs memory. */
- work_pool_wgs.alloc_to_device(max_work_groups);
- queue_index.alloc_to_device(NUM_QUEUES);
- use_queues_flag.alloc_to_device(1);
- split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
- ray_state.alloc(num_global_elements);
- }
-
- /* Number of elements in the global state buffer */
- int num_global_elements = global_size[0] * global_size[1];
-
-#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
- if (device->have_error()) { \
- return false; \
- } \
- if (!kernel_##name->enqueue( \
- KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
- return false; \
- }
-
- tile.sample = tile.start_sample;
-
- /* for exponential increase between tile updates */
- int time_multiplier = 1;
-
- while (tile.sample < tile.start_sample + tile.num_samples) {
- /* to keep track of how long it takes to run a number of samples */
- double start_time = time_dt();
-
- /* initial guess to start rolling average */
- const int initial_num_samples = 1;
- /* approx number of samples per second */
- const int samples_per_second = (avg_time_per_sample > 0.0) ?
- int(double(time_multiplier) / avg_time_per_sample) + 1 :
- initial_num_samples;
-
- RenderTile subtile = tile;
- subtile.start_sample = tile.sample;
- subtile.num_samples = samples_per_second;
-
- if (task.adaptive_sampling.use) {
- subtile.num_samples = task.adaptive_sampling.align_samples(subtile.start_sample,
- subtile.num_samples);
- }
-
- /* Don't go beyond requested number of samples. */
- subtile.num_samples = min(subtile.num_samples,
- tile.start_sample + tile.num_samples - tile.sample);
-
- if (device->have_error()) {
- return false;
- }
-
-    /* reset state memory here, as the global size for the data_init
-     * kernel might not be large enough to do it in the kernel */
- work_pool_wgs.zero_to_device();
- split_data.zero_to_device();
- ray_state.zero_to_device();
-
- if (!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
- subtile,
- num_global_elements,
- kgbuffer,
- kernel_data,
- split_data,
- ray_state,
- queue_index,
- use_queues_flag,
- work_pool_wgs)) {
- return false;
- }
-
- ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
-
- bool activeRaysAvailable = true;
- double cancel_time = DBL_MAX;
-
- while (activeRaysAvailable) {
-      /* Do path-iteration on the host (enqueue path-iteration kernels). */
- for (int PathIter = 0; PathIter < 16; PathIter++) {
- ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
- if (kernel_do_volume) {
- ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
- }
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(
- holdout_emission_blurring_pathtermination_ao, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
-
- if (task.get_cancel() && cancel_time == DBL_MAX) {
- /* Wait up to twice as many seconds for current samples to finish
- * to avoid artifacts in render result from ending too soon.
- */
- cancel_time = time_dt() + 2.0 * time_multiplier;
- }
-
- if (time_dt() > cancel_time) {
- return true;
- }
- }
-
-      /* Decide if we should exit path-iteration on the host. */
- ray_state.copy_from_device(0, global_size[0] * global_size[1], 1);
-
- activeRaysAvailable = false;
-
- for (int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
- if (!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) {
- if (IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) {
- /* Something went wrong, abort to avoid looping endlessly. */
- device->set_error("Split kernel error: invalid ray state");
- return false;
- }
-
- /* Not all rays are RAY_INACTIVE. */
- activeRaysAvailable = true;
- break;
- }
- }
-
- if (time_dt() > cancel_time) {
- return true;
- }
- }
-
- int filter_sample = tile.sample + subtile.num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- size_t buffer_size[2];
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(tile.h, local_size[1]);
- kernel_adaptive_stopping->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- buffer_size[0] = round_up(tile.h, local_size[0]);
- buffer_size[1] = round_up(1, local_size[1]);
- kernel_adaptive_filter_x->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(1, local_size[1]);
- kernel_adaptive_filter_y->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- }
-
- double time_per_sample = ((time_dt() - start_time) / subtile.num_samples);
-
- if (avg_time_per_sample == 0.0) {
- /* start rolling average */
- avg_time_per_sample = time_per_sample;
- }
- else {
- avg_time_per_sample = alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample;
- }
-
-#undef ENQUEUE_SPLIT_KERNEL
-
- tile.sample += subtile.num_samples;
- task.update_progress(&tile, tile.w * tile.h * subtile.num_samples);
-
- time_multiplier = min(time_multiplier << 1, 10);
-
- if (task.get_cancel()) {
- return true;
- }
- }
-
- if (task.adaptive_sampling.use) {
- /* Reset the start samples. */
- RenderTile subtile = tile;
- subtile.start_sample = tile.start_sample;
- subtile.num_samples = tile.sample - tile.start_sample;
- enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
- subtile,
- num_global_elements,
- kgbuffer,
- kernel_data,
- split_data,
- ray_state,
- queue_index,
- use_queues_flag,
- work_pool_wgs);
- size_t buffer_size[2];
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(tile.h, local_size[1]);
- kernel_adaptive_adjust_samples->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- }
-
- return true;
-}
-
-CCL_NAMESPACE_END
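path_trace() above sizes each batch from an exponential moving average: every measured time-per-sample is blended into the running estimate with weight alpha = 0.1, and the next subtile renders roughly time_multiplier seconds' worth of samples. A minimal restatement of that estimator; the struct and its names are illustrative:

#include <cstdio>

/* Rolling average of per-sample render time, as used in path_trace(). */
struct SampleTimeEstimator {
  double alpha = 0.1;          /* Weight of the newest measurement. */
  double avg_per_sample = 0.0; /* 0.0 means no measurement yet. */

  /* How many samples fit in the given time budget (the time_multiplier). */
  int samples_for_budget(double budget_seconds) const
  {
    if (avg_per_sample <= 0.0) {
      return 1; /* Initial guess to start the rolling average. */
    }
    return int(budget_seconds / avg_per_sample) + 1;
  }

  void record(double elapsed_seconds, int num_samples)
  {
    const double per_sample = elapsed_seconds / num_samples;
    avg_per_sample = (avg_per_sample == 0.0) ?
                         per_sample :
                         alpha * per_sample + (1.0 - alpha) * avg_per_sample;
  }
};

int main()
{
  SampleTimeEstimator est;
  est.record(0.5, 4); /* 4 samples took 0.5s -> 0.125 s/sample. */
  printf("next batch: %d samples\n", est.samples_for_budget(1.0)); /* Prints 9. */
  return 0;
}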
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
deleted file mode 100644
index 07a21b10299..00000000000
--- a/intern/cycles/device/device_split_kernel.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_SPLIT_KERNEL_H__
-#define __DEVICE_SPLIT_KERNEL_H__
-
-#include "device/device.h"
-#include "render/buffers.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* When allocating global memory in chunks, we may not be able to
- * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes, since some
- * bytes may be needed for aligning chunks of memory. This is the
- * amount of memory that we dedicate for that purpose.
- */
-#define DATA_ALLOCATION_MEM_FACTOR 5000000 // 5MB
-
-/* Types used for split kernel */
-
-class KernelDimensions {
- public:
- size_t global_size[2];
- size_t local_size[2];
-
- KernelDimensions(size_t global_size_[2], size_t local_size_[2])
- {
- memcpy(global_size, global_size_, sizeof(global_size));
- memcpy(local_size, local_size_, sizeof(local_size));
- }
-};
-
-class SplitKernelFunction {
- public:
- virtual ~SplitKernelFunction()
- {
- }
-
- /* enqueue the kernel, returns false if there is an error */
- virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) = 0;
-};
-
-class DeviceSplitKernel {
- private:
- Device *device;
-
- SplitKernelFunction *kernel_path_init;
- SplitKernelFunction *kernel_scene_intersect;
- SplitKernelFunction *kernel_lamp_emission;
- SplitKernelFunction *kernel_do_volume;
- SplitKernelFunction *kernel_queue_enqueue;
- SplitKernelFunction *kernel_indirect_background;
- SplitKernelFunction *kernel_shader_setup;
- SplitKernelFunction *kernel_shader_sort;
- SplitKernelFunction *kernel_shader_eval;
- SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
- SplitKernelFunction *kernel_subsurface_scatter;
- SplitKernelFunction *kernel_direct_lighting;
- SplitKernelFunction *kernel_shadow_blocked_ao;
- SplitKernelFunction *kernel_shadow_blocked_dl;
- SplitKernelFunction *kernel_enqueue_inactive;
- SplitKernelFunction *kernel_next_iteration_setup;
- SplitKernelFunction *kernel_indirect_subsurface;
- SplitKernelFunction *kernel_buffer_update;
- SplitKernelFunction *kernel_adaptive_stopping;
- SplitKernelFunction *kernel_adaptive_filter_x;
- SplitKernelFunction *kernel_adaptive_filter_y;
- SplitKernelFunction *kernel_adaptive_adjust_samples;
-
-  /* Global memory variables [porting]. This memory is used for
-   * co-operation between different kernels: data written by one
-   * kernel is available to another kernel via this global
-   * memory.
-   */
- device_only_memory<uchar> split_data;
- device_vector<uchar> ray_state;
- device_only_memory<int>
- queue_index; /* Array of size num_queues that tracks the size of each queue. */
-
-  /* Flag to make the scene_intersect and lamp_emission kernels use queues. */
- device_only_memory<char> use_queues_flag;
-
- /* Approximate time it takes to complete one sample */
- double avg_time_per_sample;
-
- /* Work pool with respect to each work group. */
- device_only_memory<unsigned int> work_pool_wgs;
-
- /* Cached kernel-dependent data, initialized once. */
- bool kernel_data_initialized;
- size_t local_size[2];
- size_t global_size[2];
-
- public:
- explicit DeviceSplitKernel(Device *device);
- virtual ~DeviceSplitKernel();
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features);
- bool path_trace(DeviceTask &task,
- RenderTile &rtile,
- device_memory &kgbuffer,
- device_memory &kernel_data);
-
- virtual uint64_t state_buffer_size(device_memory &kg,
- device_memory &data,
- size_t num_threads) = 0;
- size_t max_elements_for_max_buffer_size(device_memory &kg,
- device_memory &data,
- uint64_t max_buffer_size);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs) = 0;
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &) = 0;
- virtual int2 split_kernel_local_size() = 0;
- virtual int2 split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask &task) = 0;
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_SPLIT_KERNEL_H__ */
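The split kernel rounds its global work size up to a multiple of the local work-group size before allocating state buffers (see path_trace() above), which also keeps num_global_elements divisible by the work-pool size. A small sketch of that rounding, assuming a round_up() equivalent to the utility Cycles uses:

#include <cassert>
#include <cstddef>

/* Round `size` up to the next multiple of `multiple` (multiple > 0),
 * matching what path_trace() does with the local work-group size. */
static size_t round_up(size_t size, size_t multiple)
{
  return ((size + multiple - 1) / multiple) * multiple;
}

int main()
{
  size_t global[2] = {round_up(1000, 64), round_up(537, 16)};
  assert(global[0] == 1024 && global[1] == 544);
  return 0;
}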
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
deleted file mode 100644
index 55fbaa31e42..00000000000
--- a/intern/cycles/device/device_task.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "device/device_task.h"
-
-#include "render/buffers.h"
-
-#include "util/util_algorithm.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Device Task */
-
-DeviceTask::DeviceTask(Type type_)
- : type(type_),
- x(0),
- y(0),
- w(0),
- h(0),
- rgba_byte(0),
- rgba_half(0),
- buffer(0),
- sample(0),
- num_samples(1),
- shader_input(0),
- shader_output(0),
- shader_eval_type(0),
- shader_filter(0),
- shader_x(0),
- shader_w(0),
- buffers(nullptr),
- tile_types(0),
- denoising_from_render(false),
- pass_stride(0),
- frame_stride(0),
- target_pass_stride(0),
- pass_denoising_data(0),
- pass_denoising_clean(0),
- need_finish_queue(false),
- integrator_branched(false)
-{
- last_update_time = time_dt();
-}
-
-int DeviceTask::get_subtask_count(int num, int max_size) const
-{
- if (max_size != 0) {
- int max_size_num;
-
- if (type == SHADER) {
- max_size_num = (shader_w + max_size - 1) / max_size;
- }
- else {
- max_size = max(1, max_size / w);
- max_size_num = (h + max_size - 1) / max_size;
- }
-
- num = max(max_size_num, num);
- }
-
- if (type == SHADER) {
- num = min(shader_w, num);
- }
- else if (type == RENDER) {
- }
- else {
- num = min(h, num);
- }
-
- return num;
-}
-
-void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size) const
-{
- num = get_subtask_count(num, max_size);
-
- if (type == SHADER) {
- for (int i = 0; i < num; i++) {
- int tx = shader_x + (shader_w / num) * i;
- int tw = (i == num - 1) ? shader_w - i * (shader_w / num) : shader_w / num;
-
- DeviceTask task = *this;
-
- task.shader_x = tx;
- task.shader_w = tw;
-
- tasks.push_back(task);
- }
- }
- else if (type == RENDER) {
- for (int i = 0; i < num; i++)
- tasks.push_back(*this);
- }
- else {
- for (int i = 0; i < num; i++) {
- int ty = y + (h / num) * i;
- int th = (i == num - 1) ? h - i * (h / num) : h / num;
-
- DeviceTask task = *this;
-
- task.y = ty;
- task.h = th;
-
- tasks.push_back(task);
- }
- }
-}
-
-void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
-{
- if (type == FILM_CONVERT)
- return;
-
- if (update_progress_sample) {
- if (pixel_samples == -1) {
- pixel_samples = shader_w;
- }
- update_progress_sample(pixel_samples, rtile ? rtile->sample : 0);
- }
-
- if (update_tile_sample) {
- double current_time = time_dt();
-
- if (current_time - last_update_time >= 1.0) {
- update_tile_sample(*rtile);
-
- last_update_time = current_time;
- }
- }
-}
-
-/* Adaptive Sampling */
-
-AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(0)
-{
-}
-
-/* Render samples in steps that align with the adaptive filtering. */
-int AdaptiveSampling::align_samples(int sample, int num_samples) const
-{
- int end_sample = sample + num_samples;
-
- /* Round down end sample to the nearest sample that needs filtering. */
- end_sample &= ~(adaptive_step - 1);
-
- if (end_sample <= sample) {
- /* In order to reach the next sample that needs filtering, we'd need
- * to increase num_samples. We don't do that in this function, so
- * just keep it as is and don't filter this time around. */
- return num_samples;
- }
- return end_sample - sample;
-}
-
-bool AdaptiveSampling::need_filter(int sample) const
-{
- if (sample > min_samples) {
- return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
- }
- else {
- return false;
- }
-}
-
-CCL_NAMESPACE_END
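align_samples() depends on adaptive_step being a power of two: masking with ~(adaptive_step - 1) rounds the tentative end sample down to the nearest filtering boundary. For example, with adaptive_step = 16, sample = 10 and num_samples = 20, the tentative end 30 rounds down to 16, so only 6 samples are rendered and filtering lands exactly on sample 16. A compilable restatement of that logic:

#include <cassert>

/* Mirrors AdaptiveSampling::align_samples(); adaptive_step must be a power of two. */
static int align_samples(int sample, int num_samples, int adaptive_step)
{
  int end_sample = sample + num_samples;
  end_sample &= ~(adaptive_step - 1); /* Round down to a filter boundary. */
  return (end_sample <= sample) ? num_samples : end_sample - sample;
}

int main()
{
  assert(align_samples(10, 20, 16) == 6); /* Stop exactly at sample 16. */
  assert(align_samples(16, 5, 16) == 5);  /* No boundary reachable; keep as-is. */
  return 0;
}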
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
deleted file mode 100644
index 3f7cf47b692..00000000000
--- a/intern/cycles/device/device_task.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_TASK_H__
-#define __DEVICE_TASK_H__
-
-#include "device/device_memory.h"
-
-#include "util/util_function.h"
-#include "util/util_list.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Device Task */
-
-class Device;
-class RenderBuffers;
-class RenderTile;
-class RenderTileNeighbors;
-class Tile;
-
-enum DenoiserType {
- DENOISER_NLM = 1,
- DENOISER_OPTIX = 2,
- DENOISER_OPENIMAGEDENOISE = 4,
- DENOISER_NUM,
-
- DENOISER_NONE = 0,
- DENOISER_ALL = ~0,
-};
-
-enum DenoiserInput {
- DENOISER_INPUT_RGB = 1,
- DENOISER_INPUT_RGB_ALBEDO = 2,
- DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3,
-
- DENOISER_INPUT_NUM,
-};
-
-typedef int DenoiserTypeMask;
-
-class DenoiseParams {
- public:
- /* Apply denoiser to image. */
- bool use;
- /* Output denoising data passes (possibly without applying the denoiser). */
- bool store_passes;
-
- /* Denoiser type. */
- DenoiserType type;
-
- /* Viewport start sample. */
- int start_sample;
-
- /** Native Denoiser. */
-
- /* Pixel radius for neighboring pixels to take into account. */
- int radius;
- /* Controls neighbor pixel weighting for the denoising filter. */
- float strength;
- /* Preserve more or less detail based on feature passes. */
- float feature_strength;
- /* When removing pixels that don't carry information,
- * use a relative threshold instead of an absolute one. */
- bool relative_pca;
- /* How many frames before and after the current center frame are included. */
- int neighbor_frames;
- /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */
- bool clamp_input;
-
- /** OIDN/Optix Denoiser. */
-
- /* Passes handed over to the OIDN/OptiX denoiser (default to color + albedo). */
- DenoiserInput input_passes;
-
- DenoiseParams()
- {
- use = false;
- store_passes = false;
-
- type = DENOISER_NLM;
-
- radius = 8;
- strength = 0.5f;
- feature_strength = 0.5f;
- relative_pca = false;
- neighbor_frames = 2;
- clamp_input = true;
-
- /* Default to color + albedo only, since normal input does not always have the desired effect
- * when denoising with OptiX. */
- input_passes = DENOISER_INPUT_RGB_ALBEDO;
-
- start_sample = 0;
- }
-
- /* Test if a denoising task needs to run, also to prefilter passes for the native
- * denoiser when we are not applying denoising to the combined image. */
- bool need_denoising_task() const
- {
- return (use || (store_passes && type == DENOISER_NLM));
- }
-};
-
-class AdaptiveSampling {
- public:
- AdaptiveSampling();
-
- int align_samples(int sample, int num_samples) const;
- bool need_filter(int sample) const;
-
- bool use;
- int adaptive_step;
- int min_samples;
-};
-
-class DeviceTask {
- public:
- typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type;
- Type type;
-
- int x, y, w, h;
- device_ptr rgba_byte;
- device_ptr rgba_half;
- device_ptr buffer;
- int sample;
- int num_samples;
- int offset, stride;
-
- device_ptr shader_input;
- device_ptr shader_output;
- int shader_eval_type;
- int shader_filter;
- int shader_x, shader_w;
-
- RenderBuffers *buffers;
-
- explicit DeviceTask(Type type = RENDER);
-
- int get_subtask_count(int num, int max_size = 0) const;
- void split(list<DeviceTask> &tasks, int num, int max_size = 0) const;
-
- void update_progress(RenderTile *rtile, int pixel_samples = -1);
-
- function<bool(Device *device, RenderTile &, uint)> acquire_tile;
- function<void(long, int)> update_progress_sample;
- function<void(RenderTile &)> update_tile_sample;
- function<void(RenderTile &)> release_tile;
- function<bool()> get_cancel;
- function<bool()> get_tile_stolen;
- function<void(RenderTileNeighbors &, Device *)> map_neighbor_tiles;
- function<void(RenderTileNeighbors &, Device *)> unmap_neighbor_tiles;
-
- uint tile_types;
- DenoiseParams denoising;
- bool denoising_from_render;
- vector<int> denoising_frames;
-
- int pass_stride;
- int frame_stride;
- int target_pass_stride;
- int pass_denoising_data;
- int pass_denoising_clean;
-
- bool need_finish_queue;
- bool integrator_branched;
- AdaptiveSampling adaptive_sampling;
-
- protected:
- double last_update_time;
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_TASK_H__ */
diff --git a/intern/cycles/device/device_dummy.cpp b/intern/cycles/device/dummy/device.cpp
index 5112fc152e5..678276ed025 100644
--- a/intern/cycles/device/device_dummy.cpp
+++ b/intern/cycles/device/dummy/device.cpp
@@ -14,8 +14,10 @@
* limitations under the License.
*/
+#include "device/dummy/device.h"
+
#include "device/device.h"
-#include "device/device_intern.h"
+#include "device/device_queue.h"
CCL_NAMESPACE_BEGIN
@@ -23,8 +25,8 @@ CCL_NAMESPACE_BEGIN
class DummyDevice : public Device {
public:
- DummyDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : Device(info_, stats_, profiler_, background_)
+ DummyDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : Device(info_, stats_, profiler_)
{
error_msg = info.error_msg;
}
@@ -61,23 +63,11 @@ class DummyDevice : public Device {
virtual void const_copy_to(const char *, void *, size_t) override
{
}
-
- virtual void task_add(DeviceTask &) override
- {
- }
-
- virtual void task_wait() override
- {
- }
-
- virtual void task_cancel() override
- {
- }
};
-Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
- return new DummyDevice(info, stats, profiler, background);
+ return new DummyDevice(info, stats, profiler);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/dummy/device.h b/intern/cycles/device/dummy/device.h
new file mode 100644
index 00000000000..832a9568129
--- /dev/null
+++ b/intern/cycles/device/dummy/device.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
new file mode 100644
index 00000000000..6dbcce2d9a5
--- /dev/null
+++ b/intern/cycles/device/multi/device.cpp
@@ -0,0 +1,423 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/multi/device.h"
+
+#include <sstream>
+#include <stdlib.h>
+
+#include "bvh/bvh_multi.h"
+
+#include "device/device.h"
+#include "device/device_queue.h"
+
+#include "render/buffers.h"
+#include "render/geometry.h"
+
+#include "util/util_foreach.h"
+#include "util/util_list.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+class MultiDevice : public Device {
+ public:
+ struct SubDevice {
+ Stats stats;
+ Device *device;
+ map<device_ptr, device_ptr> ptr_map;
+ int peer_island_index = -1;
+ };
+
+ list<SubDevice> devices;
+ device_ptr unique_key;
+ vector<vector<SubDevice *>> peer_islands;
+
+ MultiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : Device(info, stats, profiler), unique_key(1)
+ {
+ foreach (const DeviceInfo &subinfo, info.multi_devices) {
+      /* Always add CPU devices at the back, since GPU devices can change
+       * host memory pointers, which the CPU uses as device pointers. */
+ SubDevice *sub;
+ if (subinfo.type == DEVICE_CPU) {
+ devices.emplace_back();
+ sub = &devices.back();
+ }
+ else {
+ devices.emplace_front();
+ sub = &devices.front();
+ }
+
+ /* The pointer to 'sub->stats' will stay valid even after new devices
+ * are added, since 'devices' is a linked list. */
+ sub->device = Device::create(subinfo, sub->stats, profiler);
+ }
+
+ /* Build a list of peer islands for the available render devices */
+ foreach (SubDevice &sub, devices) {
+      /* First ensure that every device is in at least one peer island. */
+ if (sub.peer_island_index < 0) {
+ peer_islands.emplace_back();
+ sub.peer_island_index = (int)peer_islands.size() - 1;
+ peer_islands[sub.peer_island_index].push_back(&sub);
+ }
+
+ if (!info.has_peer_memory) {
+ continue;
+ }
+
+      /* Second, check peer access between devices and fill up the islands accordingly. */
+ foreach (SubDevice &peer_sub, devices) {
+ if (peer_sub.peer_island_index < 0 &&
+ peer_sub.device->info.type == sub.device->info.type &&
+ peer_sub.device->check_peer_access(sub.device)) {
+ peer_sub.peer_island_index = sub.peer_island_index;
+ peer_islands[sub.peer_island_index].push_back(&peer_sub);
+ }
+ }
+ }
+ }
+
+ ~MultiDevice()
+ {
+ foreach (SubDevice &sub, devices)
+ delete sub.device;
+ }
+
+ const string &error_message() override
+ {
+ error_msg.clear();
+
+ foreach (SubDevice &sub, devices)
+ error_msg += sub.device->error_message();
+
+ return error_msg;
+ }
+
+ virtual bool show_samples() const override
+ {
+ if (devices.size() > 1) {
+ return false;
+ }
+ return devices.front().device->show_samples();
+ }
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override
+ {
+ BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
+ BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
+ foreach (const SubDevice &sub_device, devices) {
+ BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
+ bvh_layout_mask &= device_bvh_layout_mask;
+ bvh_layout_mask_all |= device_bvh_layout_mask;
+ }
+
+ /* With multiple OptiX devices, every device needs its own acceleration structure */
+ if (bvh_layout_mask == BVH_LAYOUT_OPTIX) {
+ return BVH_LAYOUT_MULTI_OPTIX;
+ }
+
+ /* When devices do not share a common BVH layout, fall back to creating one for each */
+ const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
+ if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
+ return BVH_LAYOUT_MULTI_OPTIX_EMBREE;
+ }
+
+ return bvh_layout_mask;
+ }
+
+ bool load_kernels(const uint kernel_features) override
+ {
+ foreach (SubDevice &sub, devices)
+ if (!sub.device->load_kernels(kernel_features))
+ return false;
+
+ return true;
+ }
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override
+ {
+ /* Try to build and share a single acceleration structure, if possible */
+ if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) {
+ devices.back().device->build_bvh(bvh, progress, refit);
+ return;
+ }
+
+ assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX ||
+ bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE);
+
+ BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh);
+ bvh_multi->sub_bvhs.resize(devices.size());
+
+ vector<BVHMulti *> geom_bvhs;
+ geom_bvhs.reserve(bvh->geometry.size());
+ foreach (Geometry *geom, bvh->geometry) {
+ geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh));
+ }
+
+ /* Broadcast acceleration structure build to all render devices */
+ size_t i = 0;
+ foreach (SubDevice &sub, devices) {
+ /* Change geometry BVH pointers to the sub BVH */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i];
+ }
+
+ if (!bvh_multi->sub_bvhs[i]) {
+ BVHParams params = bvh->params;
+ if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
+ params.bvh_layout = BVH_LAYOUT_OPTIX;
+ else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
+ params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
+ BVH_LAYOUT_EMBREE;
+
+        /* Skip building a bottom-level acceleration structure for non-instanced geometry
+         * on Embree, since it is put into the top level directly (see bvh_embree.cpp). */
+ if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
+ !bvh->geometry[0]->is_instanced()) {
+ i++;
+ continue;
+ }
+
+ bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device);
+ }
+
+ sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit);
+ i++;
+ }
+
+ /* Change geometry BVH pointers back to the multi BVH. */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k];
+ }
+ }
+
+ virtual void *get_cpu_osl_memory() override
+ {
+ if (devices.size() > 1) {
+ return NULL;
+ }
+ return devices.front().device->get_cpu_osl_memory();
+ }
+
+ bool is_resident(device_ptr key, Device *sub_device) override
+ {
+ foreach (SubDevice &sub, devices) {
+ if (sub.device == sub_device) {
+ return find_matching_mem_device(key, sub)->device == sub_device;
+ }
+ }
+ return false;
+ }
+
+ SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
+ {
+ assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end()));
+
+ /* Get the memory owner of this key (first try current device, then peer devices) */
+ SubDevice *owner_sub = &sub;
+ if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) {
+ foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) {
+ if (island_sub != owner_sub &&
+ island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) {
+ owner_sub = island_sub;
+ }
+ }
+ }
+ return owner_sub;
+ }
+
+ SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island)
+ {
+ assert(!island.empty());
+
+ /* Get the memory owner of this key or the device with the lowest memory usage when new */
+ SubDevice *owner_sub = island.front();
+ foreach (SubDevice *island_sub, island) {
+ if (key ? (island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) :
+ (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) {
+ owner_sub = island_sub;
+ }
+ }
+ return owner_sub;
+ }
+
+ inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub)
+ {
+ return find_matching_mem_device(key, sub)->ptr_map[key];
+ }
+
+ void mem_alloc(device_memory &mem) override
+ {
+ device_ptr key = unique_key++;
+
+ assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE || mem.type == MEM_DEVICE_ONLY);
+ /* The remaining memory types can be distributed across devices */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ owner_sub->device->mem_alloc(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size);
+ }
+
+ void mem_copy_to(device_memory &mem) override
+ {
+ device_ptr existing_key = mem.device_pointer;
+ device_ptr key = (existing_key) ? existing_key : unique_key++;
+ size_t existing_size = mem.device_size;
+
+ /* The tile buffers are allocated on each device (see below), so copy to all of them */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_copy_to(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+
+ if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
+ /* Need to create texture objects and update pointer in kernel globals on all devices */
+ foreach (SubDevice *island_sub, island) {
+ if (island_sub != owner_sub) {
+ island_sub->device->mem_copy_to(mem);
+ }
+ }
+ }
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size - existing_size);
+ }
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
+ {
+ device_ptr key = mem.device_pointer;
+ int i = 0, sub_h = h / devices.size();
+
+ foreach (SubDevice &sub, devices) {
+ int sy = y + i * sub_h;
+ int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
+
+ SubDevice *owner_sub = find_matching_mem_device(key, sub);
+ mem.device = owner_sub->device;
+ mem.device_pointer = owner_sub->ptr_map[key];
+
+ owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
+ i++;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ }
+
+ void mem_zero(device_memory &mem) override
+ {
+ device_ptr existing_key = mem.device_pointer;
+ device_ptr key = (existing_key) ? existing_key : unique_key++;
+ size_t existing_size = mem.device_size;
+
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_zero(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size - existing_size);
+ }
+
+ void mem_free(device_memory &mem) override
+ {
+ device_ptr key = mem.device_pointer;
+ size_t existing_size = mem.device_size;
+
+ /* Free memory that was allocated for all devices (see above) on each device */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_matching_mem_device(key, *island.front());
+ mem.device = owner_sub->device;
+ mem.device_pointer = owner_sub->ptr_map[key];
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_free(mem);
+ owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key));
+
+ if (mem.type == MEM_TEXTURE) {
+ /* Free texture objects on all devices */
+ foreach (SubDevice *island_sub, island) {
+ if (island_sub != owner_sub) {
+ island_sub->device->mem_free(mem);
+ }
+ }
+ }
+ }
+
+ mem.device = this;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+ stats.mem_free(existing_size);
+ }
+
+ void const_copy_to(const char *name, void *host, size_t size) override
+ {
+ foreach (SubDevice &sub, devices)
+ sub.device->const_copy_to(name, host, size);
+ }
+
+ int device_number(Device *sub_device) override
+ {
+ int i = 0;
+
+ foreach (SubDevice &sub, devices) {
+ if (sub.device == sub_device)
+ return i;
+ i++;
+ }
+
+ return -1;
+ }
+
+ virtual void foreach_device(const function<void(Device *)> &callback) override
+ {
+ foreach (SubDevice &sub, devices) {
+ sub.device->foreach_device(callback);
+ }
+ }
+};
+
+Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+ return new MultiDevice(info, stats, profiler);
+}
+
+CCL_NAMESPACE_END
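find_suitable_mem_device() above encodes the allocation policy for peer islands: an existing allocation stays with its current owner, while a brand-new one (key not yet mapped) goes to the island member with the lowest reported memory usage. A reduced sketch of that selection with simplified stand-in types:

#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

typedef uint64_t device_ptr;

struct Sub {
  std::map<device_ptr, device_ptr> ptr_map; /* Multi-device key -> real pointer. */
  size_t mem_used = 0;                      /* Bytes currently allocated. */
};

/* Existing key: keep it with its current owner in the island.
 * New key (0): pick the island member with the lowest memory usage. */
static Sub *find_suitable_mem_device(device_ptr key, const std::vector<Sub *> &island)
{
  Sub *owner = island.front();
  for (Sub *sub : island) {
    if (key ? (sub->ptr_map.count(key) != 0) : (sub->mem_used < owner->mem_used)) {
      owner = sub;
    }
  }
  return owner;
}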
diff --git a/intern/cycles/device/multi/device.h b/intern/cycles/device/multi/device.h
new file mode 100644
index 00000000000..6e121014a1f
--- /dev/null
+++ b/intern/cycles/device/multi/device.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h
deleted file mode 100644
index a65e764b0d4..00000000000
--- a/intern/cycles/device/opencl/device_opencl.h
+++ /dev/null
@@ -1,658 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/device.h"
-# include "device/device_denoising.h"
-# include "device/device_split_kernel.h"
-
-# include "util/util_map.h"
-# include "util/util_param.h"
-# include "util/util_string.h"
-# include "util/util_task.h"
-
-# include "clew.h"
-
-# include "device/opencl/memory_manager.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Disable workarounds, seems to be working fine on latest drivers. */
-# define CYCLES_DISABLE_DRIVER_WORKAROUNDS
-
-/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing. */
-# ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
-/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
-# undef clEnqueueNDRangeKernel
-# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-
-# undef clEnqueueWriteBuffer
-# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-
-# undef clEnqueueReadBuffer
-# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-# endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
-
-# define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
-
-struct OpenCLPlatformDevice {
- OpenCLPlatformDevice(cl_platform_id platform_id,
- const string &platform_name,
- cl_device_id device_id,
- cl_device_type device_type,
- const string &device_name,
- const string &hardware_id,
- const string &device_extensions)
- : platform_id(platform_id),
- platform_name(platform_name),
- device_id(device_id),
- device_type(device_type),
- device_name(device_name),
- hardware_id(hardware_id),
- device_extensions(device_extensions)
- {
- }
- cl_platform_id platform_id;
- string platform_name;
- cl_device_id device_id;
- cl_device_type device_type;
- string device_name;
- string hardware_id;
- string device_extensions;
-};
-
-/* Contains all static OpenCL helper functions. */
-class OpenCLInfo {
- public:
- static cl_device_type device_type();
- static bool use_debug();
- static bool device_supported(const string &platform_name, const cl_device_id device_id);
- static bool platform_version_check(cl_platform_id platform, string *error = NULL);
- static bool device_version_check(cl_device_id device, string *error = NULL);
- static bool get_device_version(cl_device_id device,
- int *r_major,
- int *r_minor,
- string *error = NULL);
- static string get_hardware_id(const string &platform_name, cl_device_id device_id);
- static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices);
-
- /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
-
- /* Platform information. */
- static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
- static cl_uint get_num_platforms();
-
- static bool get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error = NULL);
- static vector<cl_platform_id> get_platforms();
-
- static bool get_platform_name(cl_platform_id platform_id, string *platform_name);
- static string get_platform_name(cl_platform_id platform_id);
-
- static bool get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- cl_uint *num_devices,
- cl_int *error = NULL);
- static cl_uint get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type);
-
- static bool get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- vector<cl_device_id> *device_ids,
- cl_int *error = NULL);
- static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type);
-
- /* Device information. */
- static bool get_device_name(cl_device_id device_id, string *device_name, cl_int *error = NULL);
-
- static string get_device_name(cl_device_id device_id);
-
- static bool get_device_extensions(cl_device_id device_id,
- string *device_extensions,
- cl_int *error = NULL);
-
- static string get_device_extensions(cl_device_id device_id);
-
- static bool get_device_type(cl_device_id device_id,
- cl_device_type *device_type,
- cl_int *error = NULL);
- static cl_device_type get_device_type(cl_device_id device_id);
-
- static bool get_driver_version(cl_device_id device_id,
- int *major,
- int *minor,
- cl_int *error = NULL);
-
- static int mem_sub_ptr_alignment(cl_device_id device_id);
-
-  /* Get a somewhat more readable device name.
-   * The main difference is with AMD OpenCL, which only gives a code
-   * name as the regular device name. This returns a more sane device
-   * name using some extensions.
-   */
- static string get_readable_device_name(cl_device_id device_id);
-};
-
-/* Thread safe cache for contexts and programs.
- */
-class OpenCLCache {
- struct Slot {
- struct ProgramEntry {
- ProgramEntry();
- ProgramEntry(const ProgramEntry &rhs);
- ~ProgramEntry();
- cl_program program;
- thread_mutex *mutex;
- };
-
- Slot();
- Slot(const Slot &rhs);
- ~Slot();
-
- thread_mutex *context_mutex;
- cl_context context;
- typedef map<ustring, ProgramEntry> EntryMap;
- EntryMap programs;
- };
-
- /* key is combination of platform ID and device ID */
- typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
-
- /* map of Slot objects */
- typedef map<PlatformDevicePair, Slot> CacheMap;
- CacheMap cache;
-
- /* MD5 hash of the kernel source. */
- string kernel_md5;
-
- thread_mutex cache_lock;
- thread_mutex kernel_md5_lock;
-
- /* lazy instantiate */
- static OpenCLCache &global_instance();
-
- public:
- enum ProgramName {
- OCL_DEV_BASE_PROGRAM,
- OCL_DEV_MEGAKERNEL_PROGRAM,
- };
-
- /* Look up a context in the cache. If this returns NULL, slot_locker
- * will be holding a lock on the cache. slot_locker should refer to a
- * default-constructed thread_scoped_lock. */
- static cl_context get_context(cl_platform_id platform,
- cl_device_id device,
- thread_scoped_lock &slot_locker);
- /* Same as above. */
- static cl_program get_program(cl_platform_id platform,
- cl_device_id device,
- ustring key,
- thread_scoped_lock &slot_locker);
-
- /* Store a context in the cache. You MUST have tried to get the item before storing it. */
- static void store_context(cl_platform_id platform,
- cl_device_id device,
- cl_context context,
- thread_scoped_lock &slot_locker);
- /* Same as above. */
- static void store_program(cl_platform_id platform,
- cl_device_id device,
- cl_program program,
- ustring key,
- thread_scoped_lock &slot_locker);
-
- static string get_kernel_md5();
-};
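-
-/* A minimal sketch of the get/store protocol described above, assuming a
- * valid `platform` and `device` and a hypothetical `context_props` array;
- * the slot lock is only held when the lookup misses:
- *
- *   thread_scoped_lock cache_locker;
- *   cl_context context = OpenCLCache::get_context(platform, device, cache_locker);
- *   if (context == NULL) {
- *     cl_int err;
- *     context = clCreateContext(context_props, 1, &device, NULL, NULL, &err);
- *     OpenCLCache::store_context(platform, device, context, cache_locker);
- *   }
- */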
-
-# define opencl_device_assert(device, stmt) \
- { \
- cl_int err = stmt; \
-\
- if (err != CL_SUCCESS) { \
- string message = string_printf( \
- "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
- if ((device)->error_message() == "") { \
- (device)->set_error(message); \
- } \
- fprintf(stderr, "%s\n", message.c_str()); \
- } \
- } \
- (void)0
-
-# define opencl_assert(stmt) \
- { \
- cl_int err = stmt; \
-\
- if (err != CL_SUCCESS) { \
- string message = string_printf( \
- "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
- if (error_msg == "") { \
- error_msg = message; \
- } \
- fprintf(stderr, "%s\n", message.c_str()); \
- } \
- } \
- (void)0
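-
-/* A minimal usage sketch, assuming a valid cl_command_queue `queue`: each
- * macro evaluates its statement once, records the first error message and
- * prints every failure to stderr; the trailing (void)0 forces the usual
- * statement semicolon at the call site.
- *
- *   opencl_assert(clFlush(queue));
- *   opencl_assert(clFinish(queue));
- */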
-
-class OpenCLDevice : public Device {
- public:
- DedicatedTaskPool task_pool;
-
- /* Task pool for required kernels (base, AO kernels during foreground rendering) */
- TaskPool load_required_kernel_task_pool;
- /* Task pool for optional kernels (feature kernels during foreground rendering) */
- TaskPool load_kernel_task_pool;
- std::atomic<int> load_kernel_num_compiling;
-
- cl_context cxContext;
- cl_command_queue cqCommandQueue;
- cl_platform_id cpPlatform;
- cl_device_id cdDevice;
- cl_int ciErr;
- int device_num;
-
- class OpenCLProgram {
- public:
- OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL)
- {
- }
- OpenCLProgram(OpenCLDevice *device,
- const string &program_name,
- const string &kernel_name,
- const string &kernel_build_options,
- bool use_stdout = true);
- ~OpenCLProgram();
-
- void add_kernel(ustring name);
-
- /* Try to load the program from the device cache or disk. */
- bool load();
- /* Compile the kernel (first in a separate process, falling back to in-process). */
- void compile();
- /* Create the OpenCL kernels after loading or compiling. */
- void create_kernels();
-
- bool is_loaded() const
- {
- return loaded;
- }
- const string &get_log() const
- {
- return log;
- }
- void report_error();
-
- /* Wait until this kernel is available to be used.
- * Returns true when the kernel is available, and false when it is
- * not available or could not be loaded. */
- bool wait_for_availability();
-
- cl_kernel operator()();
- cl_kernel operator()(ustring name);
-
- void release();
-
- private:
- bool build_kernel(const string *debug_src);
- /* Build the program by launching a separate process.
- * This is required for multithreaded OpenCL compilation, since most frameworks
- * serialize build calls internally if they come from the same process.
- * If that is not supported, this function just returns false.
- */
- bool compile_separate(const string &clbin);
- /* Build the program by calling OpenCL directly. */
- bool compile_kernel(const string *debug_src);
- /* Loading and saving the program from/to disk. */
- bool load_binary(const string &clbin, const string *debug_src = NULL);
- bool save_binary(const string &clbin);
-
- void add_log(const string &msg, bool is_debug);
- void add_error(const string &msg);
-
- bool loaded;
- bool needs_compiling;
-
- cl_program program;
- OpenCLDevice *device;
-
- /* Used for the OpenCLCache key. */
- string program_name;
-
- string kernel_file, kernel_build_options, device_md5;
-
- bool use_stdout;
- string log, error_msg;
- string compile_output;
-
- map<ustring, cl_kernel> kernels;
- };
-
- /* Container for all types of split programs. */
- class OpenCLSplitPrograms {
- public:
- OpenCLDevice *device;
- OpenCLProgram program_split;
- OpenCLProgram program_lamp_emission;
- OpenCLProgram program_do_volume;
- OpenCLProgram program_indirect_background;
- OpenCLProgram program_shader_eval;
- OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
- OpenCLProgram program_subsurface_scatter;
- OpenCLProgram program_direct_lighting;
- OpenCLProgram program_shadow_blocked_ao;
- OpenCLProgram program_shadow_blocked_dl;
-
- OpenCLSplitPrograms(OpenCLDevice *device);
- ~OpenCLSplitPrograms();
-
- /* Load the kernels and put the created kernels in the given
- * `programs` parameter. */
- void load_kernels(vector<OpenCLProgram *> &programs,
- const DeviceRequestedFeatures &requested_features);
- };
-
- DeviceSplitKernel *split_kernel;
-
- OpenCLProgram base_program;
- OpenCLProgram bake_program;
- OpenCLProgram displace_program;
- OpenCLProgram background_program;
- OpenCLProgram denoising_program;
-
- OpenCLSplitPrograms kernel_programs;
-
- typedef map<string, device_vector<uchar> *> ConstMemMap;
- typedef map<string, device_ptr> MemMap;
-
- ConstMemMap const_mem_map;
- MemMap mem_map;
-
- bool device_initialized;
- string platform_name;
- string device_name;
-
- bool opencl_error(cl_int err);
- void opencl_error(const string &message);
- void opencl_assert_err(cl_int err, const char *where);
-
- OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
- ~OpenCLDevice();
-
- static void CL_CALLBACK context_notify_callback(const char *err_info,
- const void * /*private_info*/,
- size_t /*cb*/,
- void *user_data);
-
- bool opencl_version_check();
- OpenCLSplitPrograms *get_split_programs();
-
- string device_md5_hash(string kernel_custom_build_options = "");
- bool load_kernels(const DeviceRequestedFeatures &requested_features);
- void load_required_kernels(const DeviceRequestedFeatures &requested_features);
-
- bool wait_for_availability(const DeviceRequestedFeatures &requested_features);
- DeviceKernelStatus get_active_kernel_switch_state();
-
- /* Get the name of the OpenCL program for the given kernel. */
- const string get_opencl_program_name(const string &kernel_name);
- /* Get the program file name (*.cl) to compile for the given kernel. */
- const string get_opencl_program_filename(const string &kernel_name);
- string get_build_options(const DeviceRequestedFeatures &requested_features,
- const string &opencl_program_name);
- /* Enable the default features to reduce recompilation events */
- void enable_default_features(DeviceRequestedFeatures &features);
-
- void mem_alloc(device_memory &mem);
- void mem_copy_to(device_memory &mem);
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem);
- void mem_zero(device_memory &mem);
- void mem_free(device_memory &mem);
-
- int mem_sub_ptr_alignment();
-
- void const_copy_to(const char *name, void *host, size_t size);
- void global_alloc(device_memory &mem);
- void global_free(device_memory &mem);
- void tex_alloc(device_texture &mem);
- void tex_free(device_texture &mem);
-
- size_t global_size_round_up(int group_size, int global_size);
- void enqueue_kernel(cl_kernel kernel,
- size_t w,
- size_t h,
- bool x_workgroups = false,
- size_t max_workgroup_size = -1);
- void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
- void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
-
- void film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half);
- void shader(DeviceTask &task);
- void update_adaptive(DeviceTask &task, RenderTile &tile, int sample);
- void bake(DeviceTask &task, RenderTile &tile);
-
- void denoise(RenderTile &tile, DenoisingTask &denoising);
-
- int get_split_task_count(DeviceTask & /*task*/)
- {
- return 1;
- }
-
- void task_add(DeviceTask &task)
- {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
-
- void task_wait()
- {
- task_pool.wait();
- }
-
- void task_cancel()
- {
- task_pool.cancel();
- }
-
- void thread_run(DeviceTask &task);
-
- virtual BVHLayoutMask get_bvh_layout_mask() const
- {
- return BVH_LAYOUT_BVH2;
- }
-
- virtual bool show_samples() const
- {
- return true;
- }
-
- protected:
- string kernel_build_options(const string *debug_src = NULL);
-
- void mem_zero_kernel(device_ptr ptr, size_t size);
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task);
- bool denoising_construct_transform(DenoisingTask *task);
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task);
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task);
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task);
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task);
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task);
- bool denoising_write_feature(int to_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task);
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task);
-
- device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int size);
- void mem_free_sub_ptr(device_ptr ptr);
-
- class ArgumentWrapper {
- public:
- ArgumentWrapper() : size(0), pointer(NULL)
- {
- }
-
- ArgumentWrapper(device_memory &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
-
- template<typename T>
- ArgumentWrapper(device_vector<T> &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
-
- template<typename T>
- ArgumentWrapper(device_only_memory<T> &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
- template<typename T> ArgumentWrapper(T &argument) : size(sizeof(argument)), pointer(&argument)
- {
- }
-
- ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), pointer(&int_value)
- {
- }
-
- ArgumentWrapper(float argument)
- : size(sizeof(float)), float_value(argument), pointer(&float_value)
- {
- }
-
- size_t size;
- int int_value;
- float float_value;
- void *pointer;
- };
-
- /* TODO(sergey): In the future we can use variadic templates, once
- * C++0x is allowed. That should allow us to clean this up a bit.
- */
- int kernel_set_args(cl_kernel kernel,
- int start_argument_index,
- const ArgumentWrapper &arg1 = ArgumentWrapper(),
- const ArgumentWrapper &arg2 = ArgumentWrapper(),
- const ArgumentWrapper &arg3 = ArgumentWrapper(),
- const ArgumentWrapper &arg4 = ArgumentWrapper(),
- const ArgumentWrapper &arg5 = ArgumentWrapper(),
- const ArgumentWrapper &arg6 = ArgumentWrapper(),
- const ArgumentWrapper &arg7 = ArgumentWrapper(),
- const ArgumentWrapper &arg8 = ArgumentWrapper(),
- const ArgumentWrapper &arg9 = ArgumentWrapper(),
- const ArgumentWrapper &arg10 = ArgumentWrapper(),
- const ArgumentWrapper &arg11 = ArgumentWrapper(),
- const ArgumentWrapper &arg12 = ArgumentWrapper(),
- const ArgumentWrapper &arg13 = ArgumentWrapper(),
- const ArgumentWrapper &arg14 = ArgumentWrapper(),
- const ArgumentWrapper &arg15 = ArgumentWrapper(),
- const ArgumentWrapper &arg16 = ArgumentWrapper(),
- const ArgumentWrapper &arg17 = ArgumentWrapper(),
- const ArgumentWrapper &arg18 = ArgumentWrapper(),
- const ArgumentWrapper &arg19 = ArgumentWrapper(),
- const ArgumentWrapper &arg20 = ArgumentWrapper(),
- const ArgumentWrapper &arg21 = ArgumentWrapper(),
- const ArgumentWrapper &arg22 = ArgumentWrapper(),
- const ArgumentWrapper &arg23 = ArgumentWrapper(),
- const ArgumentWrapper &arg24 = ArgumentWrapper(),
- const ArgumentWrapper &arg25 = ArgumentWrapper(),
- const ArgumentWrapper &arg26 = ArgumentWrapper(),
- const ArgumentWrapper &arg27 = ArgumentWrapper(),
- const ArgumentWrapper &arg28 = ArgumentWrapper(),
- const ArgumentWrapper &arg29 = ArgumentWrapper(),
- const ArgumentWrapper &arg30 = ArgumentWrapper(),
- const ArgumentWrapper &arg31 = ArgumentWrapper(),
- const ArgumentWrapper &arg32 = ArgumentWrapper(),
- const ArgumentWrapper &arg33 = ArgumentWrapper());
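-
- /* A hypothetical variadic-template replacement for the 33-argument form
- * above, as hinted at in the TODO (sketch only; `set_one_arg` is an assumed
- * helper that forwards a single ArgumentWrapper to clSetKernelArg):
- *
- *   template<typename... Args>
- *   int kernel_set_args(cl_kernel kernel, int start_index, const Args &... args)
- *   {
- *     int narg = start_index;
- *     /* Expand the pack left-to-right, one clSetKernelArg per argument. */
- *     int expand[] = {0, (set_one_arg(kernel, narg++, ArgumentWrapper(args)), 0)...};
- *     (void)expand;
- *     return narg - start_index;
- *   }
- */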
-
- void release_kernel_safe(cl_kernel kernel);
- void release_mem_object_safe(cl_mem mem);
- void release_program_safe(cl_program program);
-
- /* ** These helpers exist to work around some compiler-specific bugs. ** */
-
- cl_program load_cached_kernel(ustring key, thread_scoped_lock &cache_locker);
-
- void store_cached_kernel(cl_program program, ustring key, thread_scoped_lock &cache_locker);
-
- private:
- MemoryManager memory_manager;
- friend class MemoryManager;
-
- static_assert_align(TextureInfo, 16);
- device_vector<TextureInfo> texture_info;
-
- typedef map<string, device_memory *> TexturesMap;
- TexturesMap textures;
-
- bool textures_need_update;
-
- protected:
- void flush_texture_buffers();
-
- friend class OpenCLSplitKernel;
- friend class OpenCLSplitKernelFunction;
-};
-
-Device *opencl_create_split_device(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background);
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp
deleted file mode 100644
index 31a2265700c..00000000000
--- a/intern/cycles/device/opencl/device_opencl_impl.cpp
+++ /dev/null
@@ -1,2113 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/opencl/device_opencl.h"
-
-# include "kernel/kernel_types.h"
-# include "kernel/split/kernel_split_data_types.h"
-
-# include "util/util_algorithm.h"
-# include "util/util_debug.h"
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-struct texture_slot_t {
- texture_slot_t(const string &name, int slot) : name(name), slot(slot)
- {
- }
- string name;
- int slot;
-};
-
-static const string NON_SPLIT_KERNELS =
- "denoising "
- "base "
- "background "
- "displace ";
-
-static const string SPLIT_BUNDLE_KERNELS =
- "data_init "
- "path_init "
- "state_buffer_size "
- "scene_intersect "
- "queue_enqueue "
- "shader_setup "
- "shader_sort "
- "enqueue_inactive "
- "next_iteration_setup "
- "indirect_subsurface "
- "buffer_update "
- "adaptive_stopping "
- "adaptive_filter_x "
- "adaptive_filter_y "
- "adaptive_adjust_samples";
-
-const string OpenCLDevice::get_opencl_program_name(const string &kernel_name)
-{
- if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) {
- return kernel_name;
- }
- else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
- return "split_bundle";
- }
- else {
- return "split_" + kernel_name;
- }
-}
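-
-/* For example: "denoising" maps to itself, "scene_intersect" maps to
- * "split_bundle", and "do_volume" maps to "split_do_volume". */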
-
-const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name)
-{
- if (kernel_name == "denoising") {
- return "filter.cl";
- }
- else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
- return "kernel_split_bundle.cl";
- }
- else {
- return "kernel_" + kernel_name + ".cl";
- }
-}
-
-/* Enable features that we always want to compile in, to reduce recompilation events. */
-void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features)
-{
- features.use_transparent = true;
- features.use_shadow_tricks = true;
- features.use_principled = true;
- features.use_denoising = true;
-
- if (!background) {
- features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
- features.nodes_features = NODE_FEATURE_ALL;
- features.use_hair = true;
- features.use_subsurface = true;
- features.use_camera_motion = false;
- features.use_object_motion = false;
- }
-}
-
-string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features,
- const string &opencl_program_name)
-{
- /* first check for non-split kernel programs */
- if (opencl_program_name == "base" || opencl_program_name == "denoising") {
- return "";
- }
- else if (opencl_program_name == "bake") {
- /* Note: get_build_options() for bake is only requested when baking is enabled;
- * displace and background are always requested.
- * `__SPLIT_KERNEL__` must not be present in the compile directives for bake. */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_denoising = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_hair = true;
- features.use_subsurface = true;
- features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
- features.nodes_features = NODE_FEATURE_ALL;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
- else if (opencl_program_name == "displace") {
- /* As displacement does not use any nodes from the Shading group (e.g. BSDF),
- * we disable all features related to shading. */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_denoising = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_baking = false;
- features.use_transparent = false;
- features.use_shadow_tricks = false;
- features.use_subsurface = false;
- features.use_volume = false;
- features.nodes_features &= ~NODE_FEATURE_VOLUME;
- features.use_denoising = false;
- features.use_principled = false;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
- else if (opencl_program_name == "background") {
- /* Background uses Background shading.
- * It is safe to disable shadow features, subsurface and volumetrics. */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_baking = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_transparent = false;
- features.use_shadow_tricks = false;
- features.use_denoising = false;
- /* NOTE: It is currently possible to use surface nodes like `Hair Info` and
- * `Bump`. Perhaps we should remove them from the UI, as they do not make
- * any sense when rendering the background. */
- features.nodes_features &= ~NODE_FEATURE_VOLUME;
- features.use_subsurface = false;
- features.use_volume = false;
- features.use_shader_raytrace = false;
- features.use_patch_evaluation = false;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
-
- string build_options = "-D__SPLIT_KERNEL__ ";
- /* Set compute device build option. */
- cl_device_type device_type;
- OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr);
- assert(this->ciErr == CL_SUCCESS);
- if (device_type == CL_DEVICE_TYPE_GPU) {
- build_options += "-D__COMPUTE_DEVICE_GPU__ ";
- }
-
- DeviceRequestedFeatures nofeatures;
- enable_default_features(nofeatures);
-
- /* Add program specific optimized compile directives */
- if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) {
- build_options += nofeatures.get_build_options();
- }
- else {
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
-
- /* Always turn off baking at this point. Baking is only useful when building the bake kernel.
- * This also makes sure that the kernels built during baking can be reused
- * when not doing any baking. */
- features.use_baking = false;
-
- /* Do not vary on shaders when the program doesn't do any shading.
- * We have bundled those kernels in a single program. */
- if (opencl_program_name == "split_bundle") {
- features.max_nodes_group = 0;
- features.nodes_features = 0;
- features.use_shader_raytrace = false;
- }
-
- /* No specific settings, just add the regular ones */
- build_options += features.get_build_options();
- }
-
- return build_options;
-}
-
-OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_)
-{
- device = device_;
-}
-
-OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms()
-{
- program_split.release();
- program_lamp_emission.release();
- program_do_volume.release();
- program_indirect_background.release();
- program_shader_eval.release();
- program_holdout_emission_blurring_pathtermination_ao.release();
- program_subsurface_scatter.release();
- program_direct_lighting.release();
- program_shadow_blocked_ao.release();
- program_shadow_blocked_dl.release();
-}
-
-void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
- vector<OpenCLProgram *> &programs, const DeviceRequestedFeatures &requested_features)
-{
- if (!requested_features.use_baking) {
-# define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \
- program_split.add_kernel(ustring("path_trace_" #kernel_name));
-# define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \
- const string program_name_##kernel_name = "split_" #kernel_name; \
- program_##kernel_name = OpenCLDevice::OpenCLProgram( \
- device, \
- program_name_##kernel_name, \
- "kernel_" #kernel_name ".cl", \
- device->get_build_options(requested_features, program_name_##kernel_name)); \
- program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \
- programs.push_back(&program_##kernel_name);
-
- /* Ordered with most complex kernels first, to reduce overall compile time. */
- ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter);
- ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
- ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
- if (requested_features.use_volume) {
- ADD_SPLIT_KERNEL_PROGRAM(do_volume);
- }
- ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
- ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
- ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
- ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl);
- ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao);
-
- /* Quick kernels bundled in a single program to reduce overhead of starting
- * Blender processes. */
- program_split = OpenCLDevice::OpenCLProgram(
- device,
- "split_bundle",
- "kernel_split_bundle.cl",
- device->get_build_options(requested_features, "split_bundle"));
-
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples);
- programs.push_back(&program_split);
-
-# undef ADD_SPLIT_KERNEL_PROGRAM
-# undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM
- }
-}
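-
-/* For reference, ADD_SPLIT_KERNEL_PROGRAM(do_volume) above expands to:
- *
- *   const string program_name_do_volume = "split_do_volume";
- *   program_do_volume = OpenCLDevice::OpenCLProgram(
- *       device,
- *       program_name_do_volume,
- *       "kernel_do_volume.cl",
- *       device->get_build_options(requested_features, program_name_do_volume));
- *   program_do_volume.add_kernel(ustring("path_trace_do_volume"));
- *   programs.push_back(&program_do_volume);
- */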
-
-namespace {
-
-/* Dummy copy of the OpenCL-related KernelGlobals from kernel_globals.h,
- * used only to fetch its size.
- */
-typedef struct KernelGlobalsDummy {
- ccl_constant KernelData *data;
- ccl_global char *buffers[8];
-
-# define KERNEL_TEX(type, name) TextureInfo name;
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
- SplitData split_data;
- SplitParams split_param_data;
-} KernelGlobalsDummy;
-
-} // namespace
-
-struct CachedSplitMemory {
- int id;
- device_memory *split_data;
- device_memory *ray_state;
- device_memory *queue_index;
- device_memory *use_queues_flag;
- device_memory *work_pools;
- device_ptr *buffer;
-};
-
-class OpenCLSplitKernelFunction : public SplitKernelFunction {
- public:
- OpenCLDevice *device;
- OpenCLDevice::OpenCLProgram program;
- CachedSplitMemory &cached_memory;
- int cached_id;
-
- OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory)
- : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1)
- {
- }
-
- ~OpenCLSplitKernelFunction()
- {
- program.release();
- }
-
- virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data)
- {
- if (cached_id != cached_memory.id) {
- cl_uint start_arg_index = device->kernel_set_args(
- program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state);
-
- device->set_kernel_arg_buffers(program(), &start_arg_index);
-
- start_arg_index += device->kernel_set_args(program(),
- start_arg_index,
- *cached_memory.queue_index,
- *cached_memory.use_queues_flag,
- *cached_memory.work_pools,
- *cached_memory.buffer);
-
- cached_id = cached_memory.id;
- }
-
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- program(),
- 2,
- NULL,
- dim.global_size,
- dim.local_size,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return false;
- }
-
- return true;
- }
-};
-
-class OpenCLSplitKernel : public DeviceSplitKernel {
- OpenCLDevice *device;
- CachedSplitMemory cached_memory;
-
- public:
- explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device)
- {
- }
-
- virtual SplitKernelFunction *get_split_kernel_function(
- const string &kernel_name, const DeviceRequestedFeatures &requested_features)
- {
- OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory);
-
- const string program_name = device->get_opencl_program_name(kernel_name);
- kernel->program = OpenCLDevice::OpenCLProgram(
- device,
- program_name,
- device->get_opencl_program_filename(kernel_name),
- device->get_build_options(requested_features, program_name));
-
- kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
- kernel->program.load();
-
- if (!kernel->program.is_loaded()) {
- delete kernel;
- return NULL;
- }
-
- return kernel;
- }
-
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads)
- {
- device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
- size_buffer.alloc(1);
- size_buffer.zero_to_device();
-
- uint threads = num_threads;
- OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
- cl_kernel kernel_state_buffer_size = programs->program_split(
- ustring("path_trace_state_buffer_size"));
- device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer);
-
- size_t global_size = 64;
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- kernel_state_buffer_size,
- 1,
- NULL,
- &global_size,
- NULL,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- size_buffer.copy_from_device(0, 1, 1);
- size_t size = size_buffer[0];
- size_buffer.free();
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return 0;
- }
-
- return size;
- }
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs)
- {
- cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
-
- /* Set the range of samples to be processed for every ray in
- * path-regeneration logic.
- */
- cl_int start_sample = rtile.start_sample;
- cl_int end_sample = rtile.start_sample + rtile.num_samples;
-
- OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
- cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init"));
-
- cl_uint start_arg_index = device->kernel_set_args(kernel_data_init,
- 0,
- kernel_globals,
- kernel_data,
- split_data,
- num_global_elements,
- ray_state);
-
- device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index);
-
- start_arg_index += device->kernel_set_args(kernel_data_init,
- start_arg_index,
- start_sample,
- end_sample,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- queue_index,
- dQueue_size,
- use_queues_flag,
- work_pool_wgs,
- rtile.num_samples,
- rtile.buffer);
-
- /* Enqueue ckPathTraceKernel_data_init kernel. */
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- kernel_data_init,
- 2,
- NULL,
- dim.global_size,
- dim.local_size,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return false;
- }
-
- cached_memory.split_data = &split_data;
- cached_memory.ray_state = &ray_state;
- cached_memory.queue_index = &queue_index;
- cached_memory.use_queues_flag = &use_queues_flag;
- cached_memory.work_pools = &work_pool_wgs;
- cached_memory.buffer = &rtile.buffer;
- cached_memory.id++;
-
- return true;
- }
-
- virtual int2 split_kernel_local_size()
- {
- return make_int2(64, 1);
- }
-
- virtual int2 split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask & /*task*/)
- {
- cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
- /* Use a small global size on CPU devices, as it seems to be much faster. */
- if (type == CL_DEVICE_TYPE_CPU) {
- VLOG(1) << "Global size: (64, 64).";
- return make_int2(64, 64);
- }
-
- cl_ulong max_buffer_size;
- clGetDeviceInfo(
- device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
-
- if (DebugFlags().opencl.mem_limit) {
- max_buffer_size = min(max_buffer_size,
- cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
- }
-
- VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size)
- << " bytes. (" << string_human_readable_size(max_buffer_size) << ").";
-
- /* Limit to 2 GB, as we shouldn't need more than that and some devices may support much more. */
- max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024);
-
- size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
- int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64),
- (int)sqrt(num_elements));
-
- if (device->info.description.find("Intel") != string::npos) {
- global_size = make_int2(min(512, global_size.x), min(512, global_size.y));
- }
-
- VLOG(1) << "Global size: " << global_size << ".";
- return global_size;
- }
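-
- /* A worked example, assuming max_elements_for_max_buffer_size() yields
- * about one million elements: (int)sqrt(1048576) = 1024 and
- * round_down(1024, 64) = 1024, so the global size is 1024x1024
- * (clamped to 512x512 on Intel devices). */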
-};
-
-bool OpenCLDevice::opencl_error(cl_int err)
-{
- if (err != CL_SUCCESS) {
- string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
- return true;
- }
-
- return false;
-}
-
-void OpenCLDevice::opencl_error(const string &message)
-{
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
-}
-
-void OpenCLDevice::opencl_assert_err(cl_int err, const char *where)
-{
- if (err != CL_SUCCESS) {
- string message = string_printf(
- "OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
-# ifndef NDEBUG
- abort();
-# endif
- }
-}
-
-OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
- : Device(info, stats, profiler, background),
- load_kernel_num_compiling(0),
- kernel_programs(this),
- memory_manager(this),
- texture_info(this, "__texture_info", MEM_GLOBAL)
-{
- cpPlatform = NULL;
- cdDevice = NULL;
- cxContext = NULL;
- cqCommandQueue = NULL;
- device_initialized = false;
- textures_need_update = true;
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- if (usable_devices.size() == 0) {
- opencl_error("OpenCL: no devices found.");
- return;
- }
- assert(info.num < usable_devices.size());
- OpenCLPlatformDevice &platform_device = usable_devices[info.num];
- device_num = info.num;
- cpPlatform = platform_device.platform_id;
- cdDevice = platform_device.device_id;
- platform_name = platform_device.platform_name;
- device_name = platform_device.device_name;
- VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device "
- << device_name << ".";
-
- {
- /* try to use cached context */
- thread_scoped_lock cache_locker;
- cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
-
- if (cxContext == NULL) {
- /* create context properties array to specify platform */
- const cl_context_properties context_props[] = {
- CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0};
-
- /* create context */
- cxContext = clCreateContext(
- context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr);
-
- if (opencl_error(ciErr)) {
- opencl_error("OpenCL: clCreateContext failed");
- return;
- }
-
- /* cache it */
- OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
- }
- }
-
- cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
- if (opencl_error(ciErr)) {
- opencl_error("OpenCL: Error creating command queue");
- return;
- }
-
- /* Allocate this right away so that texture_info
- * is placed at offset 0 in the device memory buffers. */
- texture_info.resize(1);
- memory_manager.alloc("texture_info", texture_info);
-
- device_initialized = true;
-
- split_kernel = new OpenCLSplitKernel(this);
-}
-
-OpenCLDevice::~OpenCLDevice()
-{
- task_pool.cancel();
- load_required_kernel_task_pool.cancel();
- load_kernel_task_pool.cancel();
-
- memory_manager.free();
-
- ConstMemMap::iterator mt;
- for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
- delete mt->second;
- }
-
- base_program.release();
- bake_program.release();
- displace_program.release();
- background_program.release();
- denoising_program.release();
-
- if (cqCommandQueue)
- clReleaseCommandQueue(cqCommandQueue);
- if (cxContext)
- clReleaseContext(cxContext);
-
- delete split_kernel;
-}
-
-void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info,
- const void * /*private_info*/,
- size_t /*cb*/,
- void *user_data)
-{
- string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
- fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
-}
-
-bool OpenCLDevice::opencl_version_check()
-{
- string error;
- if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
- opencl_error(error);
- return false;
- }
- if (!OpenCLInfo::device_version_check(cdDevice, &error)) {
- opencl_error(error);
- return false;
- }
- return true;
-}
-
-string OpenCLDevice::device_md5_hash(string kernel_custom_build_options)
-{
- MD5Hash md5;
- char version[256], driver[256], name[256], vendor[256];
-
- clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
- clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
- clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
- clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
-
- md5.append((uint8_t *)vendor, strlen(vendor));
- md5.append((uint8_t *)version, strlen(version));
- md5.append((uint8_t *)name, strlen(name));
- md5.append((uint8_t *)driver, strlen(driver));
-
- string options = kernel_build_options();
- options += kernel_custom_build_options;
- md5.append((uint8_t *)options.c_str(), options.size());
-
- return md5.get_hex();
-}
-
-bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
- VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << ".";
- /* Verify if device was initialized. */
- if (!device_initialized) {
- fprintf(stderr, "OpenCL: failed to initialize device.\n");
- return false;
- }
-
- /* Verify we have right opencl version. */
- if (!opencl_version_check())
- return false;
-
- load_required_kernels(requested_features);
-
- vector<OpenCLProgram *> programs;
- kernel_programs.load_kernels(programs, requested_features);
-
- if (!requested_features.use_baking && requested_features.use_denoising) {
- denoising_program = OpenCLProgram(
- this, "denoising", "filter.cl", get_build_options(requested_features, "denoising"));
- denoising_program.add_kernel(ustring("filter_divide_shadow"));
- denoising_program.add_kernel(ustring("filter_get_feature"));
- denoising_program.add_kernel(ustring("filter_write_feature"));
- denoising_program.add_kernel(ustring("filter_detect_outliers"));
- denoising_program.add_kernel(ustring("filter_combine_halves"));
- denoising_program.add_kernel(ustring("filter_construct_transform"));
- denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
- denoising_program.add_kernel(ustring("filter_nlm_blur"));
- denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
- denoising_program.add_kernel(ustring("filter_nlm_update_output"));
- denoising_program.add_kernel(ustring("filter_nlm_normalize"));
- denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
- denoising_program.add_kernel(ustring("filter_finalize"));
- programs.push_back(&denoising_program);
- }
-
- load_required_kernel_task_pool.wait_work();
-
- /* Parallel compilation of Cycles kernels: this launches multiple
- * processes to work around OpenCL frameworks serializing the calls
- * internally within a single process. */
- foreach (OpenCLProgram *program, programs) {
- if (!program->load()) {
- load_kernel_num_compiling++;
- load_kernel_task_pool.push([=] {
- program->compile();
- load_kernel_num_compiling--;
- });
- }
- }
- return true;
-}
-
-void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features)
-{
- vector<OpenCLProgram *> programs;
- base_program = OpenCLProgram(
- this, "base", "kernel_base.cl", get_build_options(requested_features, "base"));
- base_program.add_kernel(ustring("convert_to_byte"));
- base_program.add_kernel(ustring("convert_to_half_float"));
- base_program.add_kernel(ustring("zero_buffer"));
- programs.push_back(&base_program);
-
- if (requested_features.use_true_displacement) {
- displace_program = OpenCLProgram(
- this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace"));
- displace_program.add_kernel(ustring("displace"));
- programs.push_back(&displace_program);
- }
-
- if (requested_features.use_background_light) {
- background_program = OpenCLProgram(this,
- "background",
- "kernel_background.cl",
- get_build_options(requested_features, "background"));
- background_program.add_kernel(ustring("background"));
- programs.push_back(&background_program);
- }
-
- if (requested_features.use_baking) {
- bake_program = OpenCLProgram(
- this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake"));
- bake_program.add_kernel(ustring("bake"));
- programs.push_back(&bake_program);
- }
-
- foreach (OpenCLProgram *program, programs) {
- if (!program->load()) {
- load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
- }
- }
-}
-
-bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features)
-{
- if (requested_features.use_baking) {
- /* For baking, kernels have already been loaded in load_required_kernels(). */
- return true;
- }
-
- load_kernel_task_pool.wait_work();
- return split_kernel->load_kernels(requested_features);
-}
-
-OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs()
-{
- return &kernel_programs;
-}
-
-DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state()
-{
- return DEVICE_KERNEL_USING_FEATURE_KERNEL;
-}
-
-void OpenCLDevice::mem_alloc(device_memory &mem)
-{
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- size_t size = mem.memory_size();
-
- /* check there is enough memory available for the allocation */
- cl_ulong max_alloc_size = 0;
- clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
-
- if (DebugFlags().opencl.mem_limit) {
- max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
- }
-
- if (size > max_alloc_size) {
- string error = "Scene too complex to fit in available memory.";
- if (mem.name != NULL) {
- error += string_printf(" (allocating buffer %s failed.)", mem.name);
- }
- set_error(error);
-
- return;
- }
-
- cl_mem_flags mem_flag;
- void *mem_ptr = NULL;
-
- if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
- mem_flag = CL_MEM_READ_ONLY;
- else
- mem_flag = CL_MEM_READ_WRITE;
-
- /* A zero-size allocation might be invoked by the render, but is not really
- * supported by OpenCL. Using NULL as the device pointer also doesn't really
- * work for some reason, so for the time being we'll special-case it with
- * the null_mem buffer.
- */
- if (size != 0) {
- mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
- opencl_assert_err(ciErr, "clCreateBuffer");
- }
- else {
- mem.device_pointer = 0;
- }
-
- stats.mem_alloc(size);
- mem.device_size = size;
-}
-
-void OpenCLDevice::mem_copy_to(device_memory &mem)
-{
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- /* This write is blocking. */
- size_t size = mem.memory_size();
- if (size != 0) {
- opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- 0,
- size,
- mem.host_pointer,
- 0,
- NULL,
- NULL));
- }
- }
-}
-
-void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
-{
- size_t offset = elem * y * w;
- size_t size = elem * w * h;
- assert(size != 0);
- opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- offset,
- size,
- (uchar *)mem.host_pointer + offset,
- 0,
- NULL,
- NULL));
-}
-
-void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size)
-{
- base_program.wait_for_availability();
- cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
-
- size_t global_size[] = {1024, 1024};
- size_t num_threads = global_size[0] * global_size[1];
-
- cl_mem d_buffer = CL_MEM_PTR(mem);
- cl_ulong d_offset = 0;
- cl_ulong d_size = 0;
-
- while (d_offset < size) {
- d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset);
-
- kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
-
- ciErr = clEnqueueNDRangeKernel(
- cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL);
- opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
-
- d_offset += d_size;
- }
-}
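-
-/* A worked example of the loop above: global_size is 1024x1024 = 1,048,576
- * work items, so each pass zeroes at most num_threads * sizeof(float4) =
- * 16 MiB; zeroing a 40 MiB buffer therefore enqueues three kernels
- * (16 + 16 + 8 MiB). */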
-
-void OpenCLDevice::mem_zero(device_memory &mem)
-{
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- if (mem.device_pointer) {
- if (base_program.is_loaded()) {
- mem_zero_kernel(mem.device_pointer, mem.memory_size());
- }
-
- if (mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
-
- if (!base_program.is_loaded()) {
- void *zero = mem.host_pointer;
-
- if (!mem.host_pointer) {
- zero = util_aligned_malloc(mem.memory_size(), 16);
- memset(zero, 0, mem.memory_size());
- }
-
- opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- 0,
- mem.memory_size(),
- zero,
- 0,
- NULL,
- NULL));
-
- if (!mem.host_pointer) {
- util_aligned_free(zero);
- }
- }
- }
-}
-
-void OpenCLDevice::mem_free(device_memory &mem)
-{
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else {
- if (mem.device_pointer) {
- opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
- mem.device_pointer = 0;
-
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-}
-
-int OpenCLDevice::mem_sub_ptr_alignment()
-{
- return OpenCLInfo::mem_sub_ptr_alignment(cdDevice);
-}
-
-device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size)
-{
- cl_mem_flags mem_flag;
- if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
- mem_flag = CL_MEM_READ_ONLY;
- else
- mem_flag = CL_MEM_READ_WRITE;
-
- cl_buffer_region info;
- info.origin = mem.memory_elements_size(offset);
- info.size = mem.memory_elements_size(size);
-
- device_ptr sub_buf = (device_ptr)clCreateSubBuffer(
- CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr);
- opencl_assert_err(ciErr, "clCreateSubBuffer");
- return sub_buf;
-}
-
-void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer)
-{
- if (device_pointer != 0) {
- opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
- }
-}
-
-void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size)
-{
- ConstMemMap::iterator i = const_mem_map.find(name);
- device_vector<uchar> *data;
-
- if (i == const_mem_map.end()) {
- data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
- data->alloc(size);
- const_mem_map.insert(ConstMemMap::value_type(name, data));
- }
- else {
- data = i->second;
- }
-
- memcpy(data->data(), host, size);
- data->copy_to_device();
-}
-
-void OpenCLDevice::global_alloc(device_memory &mem)
-{
- VLOG(1) << "Global memory allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- memory_manager.alloc(mem.name, mem);
- /* Set the pointer to non-null to keep code that inspects its value from
- * thinking it is unallocated. */
- mem.device_pointer = 1;
- textures[mem.name] = &mem;
- textures_need_update = true;
-}
-
-void OpenCLDevice::global_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- mem.device_pointer = 0;
-
- if (memory_manager.free(mem)) {
- textures_need_update = true;
- }
-
- foreach (TexturesMap::value_type &value, textures) {
- if (value.second == &mem) {
- textures.erase(value.first);
- break;
- }
- }
- }
-}
-
-void OpenCLDevice::tex_alloc(device_texture &mem)
-{
- VLOG(1) << "Texture allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- memory_manager.alloc(mem.name, mem);
- /* Set the pointer to non-null to keep code that inspects its value from
- * thinking it is unallocated. */
- mem.device_pointer = 1;
- textures[mem.name] = &mem;
- textures_need_update = true;
-}
-
-void OpenCLDevice::tex_free(device_texture &mem)
-{
- global_free(mem);
-}
-
-size_t OpenCLDevice::global_size_round_up(int group_size, int global_size)
-{
- int r = global_size % group_size;
- return global_size + ((r == 0) ? 0 : group_size - r);
-}
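-
-/* For example, with group_size 64: a global_size of 100 is rounded up to
- * 128, while an already-aligned 128 is returned unchanged. */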
-
-void OpenCLDevice::enqueue_kernel(
- cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
-{
- size_t workgroup_size, max_work_items[3];
-
- clGetKernelWorkGroupInfo(
- kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
- clGetDeviceInfo(
- cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL);
-
- if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
- workgroup_size = max_workgroup_size;
- }
-
- /* Try to divide evenly over 2 dimensions. */
- size_t local_size[2];
- if (x_workgroups) {
- local_size[0] = workgroup_size;
- local_size[1] = 1;
- }
- else {
- size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
- local_size[0] = local_size[1] = sqrt_workgroup_size;
- }
-
- /* Some implementations have max size 1 on 2nd dimension. */
- if (local_size[1] > max_work_items[1]) {
- local_size[0] = workgroup_size / max_work_items[1];
- local_size[1] = max_work_items[1];
- }
-
- size_t global_size[2] = {global_size_round_up(local_size[0], w),
- global_size_round_up(local_size[1], h)};
-
- /* A vertical size of 1 comes from the bake/shade kernels, where we should
- * not round anything up, because otherwise we would either do too much
- * work per pixel (if we don't check the global ID on the Y axis) or have
- * to check that the global ID always has Y == 0.
- */
- if (h == 1) {
- global_size[1] = 1;
- }
-
- /* run kernel */
- opencl_assert(
- clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
- opencl_assert(clFlush(cqCommandQueue));
-}
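-
-/* A worked example, assuming CL_KERNEL_WORK_GROUP_SIZE reports 256 and no
- * max_workgroup_size cap: sqrt(256) = 16 gives a 16x16 local size, so a
- * 1000x800 launch is rounded up to a 1008x800 global size. */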
-
-void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
-{
- cl_mem ptr;
-
- MemMap::iterator i = mem_map.find(name);
- if (i != mem_map.end()) {
- ptr = CL_MEM_PTR(i->second);
- }
- else {
- ptr = 0;
- }
-
- opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr));
-}
-
-void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
-{
- flush_texture_buffers();
-
- memory_manager.set_kernel_arg_buffers(kernel, narg);
-}
-
-void OpenCLDevice::flush_texture_buffers()
-{
- if (!textures_need_update) {
- return;
- }
- textures_need_update = false;
-
- /* Setup slots for textures. */
- int num_slots = 0;
-
- vector<texture_slot_t> texture_slots;
-
-# define KERNEL_TEX(type, name) \
- if (textures.find(#name) != textures.end()) { \
- texture_slots.push_back(texture_slot_t(#name, num_slots)); \
- } \
- num_slots++;
-# include "kernel/kernel_textures.h"
-
- int num_data_slots = num_slots;
-
- foreach (TexturesMap::value_type &tex, textures) {
- string name = tex.first;
- device_memory *mem = tex.second;
-
- if (mem->type == MEM_TEXTURE) {
- const uint id = ((device_texture *)mem)->slot;
- texture_slots.push_back(texture_slot_t(name, num_data_slots + id));
- num_slots = max(num_slots, num_data_slots + id + 1);
- }
- }
-
- /* Realloc texture descriptors buffer. */
- memory_manager.free(texture_info);
- texture_info.resize(num_slots);
- memory_manager.alloc("texture_info", texture_info);
-
- /* Fill in descriptors */
- foreach (texture_slot_t &slot, texture_slots) {
- device_memory *mem = textures[slot.name];
- TextureInfo &info = texture_info[slot.slot];
-
- MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
-
- if (mem->type == MEM_TEXTURE) {
- info = ((device_texture *)mem)->info;
- }
- else {
- memset(&info, 0, sizeof(TextureInfo));
- }
-
- info.data = desc.offset;
- info.cl_buffer = desc.device_buffer;
- }
-
- /* Force write of descriptors. */
- memory_manager.free(texture_info);
- memory_manager.alloc("texture_info", texture_info);
-}
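-
-/* A worked example of the slot layout above: with N names coming from
- * kernel/kernel_textures.h occupying slots [0, N), an image texture whose
- * device_texture slot is 3 lands at slot N + 3, and num_slots grows to at
- * least N + 4. */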
-
-void OpenCLDevice::thread_run(DeviceTask &task)
-{
- flush_texture_buffers();
-
- if (task.type == DeviceTask::RENDER) {
- RenderTile tile;
- DenoisingTask denoising(this, task);
-
- /* Allocate buffer for kernel globals */
- device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
- kgbuffer.alloc_to_device(1);
-
- /* Keep rendering tiles until done. */
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- scoped_timer timer(&tile.buffers->render_time);
-
- split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]);
-
- /* Complete kernel execution before releasing the tile. */
- /* This helps in multi-device rendering:
- * the device that reaches the critical-section function
- * release_tile waits (stalling other devices from entering
- * release_tile) for all kernels to complete. If device1 (a
- * slow-render device) reached release_tile first, it would
- * stall device2 (a fast-render device) from proceeding to render
- * the next tile.
- */
- clFinish(cqCommandQueue);
- }
- else if (tile.task == RenderTile::BAKE) {
- bake(task, tile);
- }
- else if (tile.task == RenderTile::DENOISE) {
- tile.sample = tile.start_sample + tile.num_samples;
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
- }
-
- kgbuffer.free();
- }
- else if (task.type == DeviceTask::SHADER) {
- shader(task);
- }
- else if (task.type == DeviceTask::FILM_CONVERT) {
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- DenoisingTask denoising(this, task);
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-}
-
-void OpenCLDevice::film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half)
-{
- /* cast arguments to cl types */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half);
- cl_mem d_buffer = CL_MEM_PTR(buffer);
- cl_int d_x = task.x;
- cl_int d_y = task.y;
- cl_int d_w = task.w;
- cl_int d_h = task.h;
- cl_float d_sample_scale = 1.0f / (task.sample + 1);
- cl_int d_offset = task.offset;
- cl_int d_stride = task.stride;
-
- cl_kernel ckFilmConvertKernel = (rgba_byte) ? base_program(ustring("convert_to_byte")) :
- base_program(ustring("convert_to_half_float"));
-
- cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer);
-
- set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(ckFilmConvertKernel,
- start_arg_index,
- d_sample_scale,
- d_x,
- d_y,
- d_w,
- d_h,
- d_offset,
- d_stride);
-
- enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
-}
-
-bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
-{
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
-
- device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
- device_sub_ptr blurDifference(
- task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
- device_sub_ptr weightAccum(
- task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride);
- cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum);
- cl_mem difference_mem = CL_MEM_PTR(*difference);
- cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
-
- cl_mem image_mem = CL_MEM_PTR(image_ptr);
- cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
- cl_mem out_mem = CL_MEM_PTR(out_ptr);
- cl_mem scale_mem = NULL;
-
- mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride);
- mem_zero_kernel(out_ptr, sizeof(float) * pass_stride);
-
- cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
- cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
- cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
- cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
- cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
-
- kernel_set_args(ckNLMCalcDifference,
- 0,
- guide_mem,
- variance_mem,
- scale_mem,
- difference_mem,
- w,
- h,
- stride,
- pass_stride,
- r,
- channel_offset,
- 0,
- a,
- k_2);
- kernel_set_args(
- ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f);
- kernel_set_args(
- ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f);
- kernel_set_args(ckNLMUpdateOutput,
- 0,
- blurDifference_mem,
- image_mem,
- out_mem,
- weightAccum_mem,
- w,
- h,
- stride,
- pass_stride,
- channel_offset,
- r,
- f);
-
- enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true);
-
- kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride);
- enqueue_kernel(ckNLMNormalize, w, h);
-
- return true;
-}
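
A worked reading of the launches above: the NLM kernels process every window offset in a single dispatch, so a radius r gives num_shifts = (2r + 1)^2 shifts, e.g. 81 for r = 4. The temporary buffer is carved into three consecutive sub-ranges via device_sub_ptr: difference and blurDifference of pass_stride * num_shifts floats each, then a pass_stride-sized weightAccum, which is zeroed (together with the output) before the update and normalize passes accumulate into it.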
-
-bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task)
-{
- cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
- cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- char use_time = task->buffer.use_time ? 1 : 0;
-
- cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
-
- int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterConstructTransform,
- arg_ofs,
- transform_mem,
- rank_mem,
- task->filter_area,
- task->rect,
- task->buffer.pass_stride,
- task->buffer.frame_stride,
- use_time,
- task->radius,
- task->pca_threshold);
-
- enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
-{
- cl_mem color_mem = CL_MEM_PTR(color_ptr);
- cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
- cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
-
- cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
- cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
- cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
-
- cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
- cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
- cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
- cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
- char use_time = task->buffer.use_time ? 1 : 0;
-
- int r = task->radius;
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
-
- device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
- device_sub_ptr blurDifference(
- task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
- cl_mem difference_mem = CL_MEM_PTR(*difference);
- cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
-
- kernel_set_args(ckNLMCalcDifference,
- 0,
- color_mem,
- color_variance_mem,
- scale_mem,
- difference_mem,
- w,
- h,
- stride,
- pass_stride,
- r,
- pass_stride,
- frame_offset,
- 1.0f,
- task->nlm_k_2);
- kernel_set_args(
- ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4);
- kernel_set_args(
- ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4);
- kernel_set_args(ckNLMConstructGramian,
- 0,
- t,
- blurDifference_mem,
- buffer_mem,
- transform_mem,
- rank_mem,
- XtWX_mem,
- XtWY_mem,
- task->reconstruction_state.filter_window,
- w,
- h,
- stride,
- pass_stride,
- r,
- 4,
- frame_offset,
- use_time);
-
- enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, true, 256);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
-{
- cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
-
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
- cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
-
- kernel_set_args(ckFinalize,
- 0,
- output_mem,
- rank_mem,
- XtWX_mem,
- XtWY_mem,
- task->filter_area,
- task->reconstruction_state.buffer_params,
- task->render_buffer.samples);
- enqueue_kernel(ckFinalize, w, h);
-
- return true;
-}
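
Judging by the buffer names, the accumulate/solve pair implements per-pixel weighted least squares: filter_nlm_construct_gramian accumulates a Gramian X^T W X and right-hand side X^T W y over the NLM-weighted neighborhood, and filter_finalize then solves (X^T W X) beta = X^T W y, with the rank buffer limiting the solve to the feature dimensions kept by the PCA-style transform.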
-
-bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
-{
- cl_mem a_mem = CL_MEM_PTR(a_ptr);
- cl_mem b_mem = CL_MEM_PTR(b_ptr);
- cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
-
- cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
-
- kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r);
- enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
-{
- cl_mem a_mem = CL_MEM_PTR(a_ptr);
- cl_mem b_mem = CL_MEM_PTR(b_ptr);
- cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
- cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
- cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
-
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
-
- int arg_ofs = kernel_set_args(
- ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterDivideShadow,
- arg_ofs,
- a_mem,
- b_mem,
- sample_variance_mem,
- sv_variance_mem,
- buffer_variance_mem,
- task->rect,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
-{
- cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
-
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
-
- int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterGetFeature,
- arg_ofs,
- mean_offset,
- variance_offset,
- mean_mem,
- variance_mem,
- scale,
- task->rect,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
-{
- cl_mem from_mem = CL_MEM_PTR(from_ptr);
- cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
-
- cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
-
- kernel_set_args(ckFilterWriteFeature,
- 0,
- task->render_buffer.samples,
- task->reconstruction_state.buffer_params,
- task->filter_area,
- from_mem,
- buffer_mem,
- out_offset,
- task->rect);
- enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
-{
- cl_mem image_mem = CL_MEM_PTR(image_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
- cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
-
- cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
-
- kernel_set_args(ckFilterDetectOutliers,
- 0,
- image_mem,
- variance_mem,
- depth_mem,
- output_mem,
- task->rect,
- task->buffer.pass_stride);
- enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
-{
- denoising.functions.construct_transform = function_bind(
- &OpenCLDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = rtile.sample;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(rtile);
-}
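
For readers unfamiliar with the function_bind placeholders above: _1.._N forward the callback's positional arguments and the trailing &denoising pointer is bound once. A minimal equivalent sketch with a C++11 lambda, assuming the functions.* slots accept any compatible callable (illustration only, not the original code):

denoising.functions.accumulate = [this, &denoising](device_ptr color,
                                                    device_ptr color_variance,
                                                    device_ptr scale,
                                                    int frame) {
  /* _1.._4 become the lambda parameters; &denoising is captured. */
  return denoising_accumulate(color, color_variance, scale, frame, &denoising);
};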
-
-void OpenCLDevice::shader(DeviceTask &task)
-{
- /* cast arguments to cl types */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_input = CL_MEM_PTR(task.shader_input);
- cl_mem d_output = CL_MEM_PTR(task.shader_output);
- cl_int d_shader_eval_type = task.shader_eval_type;
- cl_int d_shader_filter = task.shader_filter;
- cl_int d_shader_x = task.shader_x;
- cl_int d_shader_w = task.shader_w;
- cl_int d_offset = task.offset;
-
- OpenCLDevice::OpenCLProgram *program = &background_program;
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
- program = &displace_program;
- }
- program->wait_for_availability();
- cl_kernel kernel = (*program)();
-
- cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output);
-
- set_kernel_arg_buffers(kernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type);
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter);
- }
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset);
-
- for (int sample = 0; sample < task.num_samples; sample++) {
-
- if (task.get_cancel())
- break;
-
- kernel_set_args(kernel, start_arg_index, sample);
-
- enqueue_kernel(kernel, task.shader_w, 1);
-
- clFinish(cqCommandQueue);
-
- task.update_progress(NULL);
- }
-}
-
-void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile)
-{
- scoped_timer timer(&rtile.buffers->render_time);
-
- /* Cast arguments to cl types. */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
- cl_int d_x = rtile.x;
- cl_int d_y = rtile.y;
- cl_int d_w = rtile.w;
- cl_int d_h = rtile.h;
- cl_int d_offset = rtile.offset;
- cl_int d_stride = rtile.stride;
-
- bake_program.wait_for_availability();
- cl_kernel kernel = bake_program();
-
- cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer);
-
- set_kernel_arg_buffers(kernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(
- kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride);
-
- int start_sample = rtile.start_sample;
- int end_sample = rtile.start_sample + rtile.num_samples;
-
- for (int sample = start_sample; sample < end_sample; sample++) {
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
-
- kernel_set_args(kernel, start_arg_index, sample);
-
- enqueue_kernel(kernel, d_w, d_h);
- clFinish(cqCommandQueue);
-
- rtile.sample = sample + 1;
-
- task.update_progress(&rtile, rtile.w * rtile.h);
- }
-}
-
-static bool kernel_build_opencl_2(cl_device_id cdDevice)
-{
- /* Build with OpenCL 2.0 if available; this improves performance
- * with AMD OpenCL drivers on Windows and Linux (legacy drivers).
- * Note that OpenCL selects the highest 1.x version by default,
- * only for 2.0 do we need the explicit compiler flag. */
- int version_major, version_minor;
- if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) {
- if (version_major >= 2) {
- /* This appears to trigger a driver bug in Radeon RX cards with certain
- * driver versions, so don't use OpenCL 2.0 for those. */
- string device_name = OpenCLInfo::get_readable_device_name(cdDevice);
- if (string_startswith(device_name, "Radeon RX 4") ||
- string_startswith(device_name, "Radeon (TM) RX 4") ||
- string_startswith(device_name, "Radeon RX 5") ||
- string_startswith(device_name, "Radeon (TM) RX 5")) {
- char version[256] = "";
- int driver_major, driver_minor;
- clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) {
- return !(driver_major == 3075 && driver_minor <= 12);
- }
- }
-
- return true;
- }
- }
-
- return false;
-}
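
As a concrete reading of the gate above: a Radeon RX device whose CL_DEVICE_VERSION reads, say, "OpenCL 2.0 AMD-APP (3075.12)" parses as driver 3075.12; driver_major == 3075 and driver_minor <= 12, so the function returns false and the 2.0 flag is withheld, whereas a later driver such as 3075.13 (version numbers illustrative) would return true.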
-
-string OpenCLDevice::kernel_build_options(const string *debug_src)
-{
- string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
-
- if (kernel_build_opencl_2(cdDevice)) {
- build_options += "-cl-std=CL2.0 ";
- }
-
- if (platform_name == "NVIDIA CUDA") {
- build_options +=
- "-D__KERNEL_OPENCL_NVIDIA__ "
- "-cl-nv-maxrregcount=32 "
- "-cl-nv-verbose ";
-
- uint compute_capability_major, compute_capability_minor;
- clGetDeviceInfo(cdDevice,
- CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
- sizeof(cl_uint),
- &compute_capability_major,
- NULL);
- clGetDeviceInfo(cdDevice,
- CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
- sizeof(cl_uint),
- &compute_capability_minor,
- NULL);
-
- build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
- compute_capability_major * 100 + compute_capability_minor * 10);
- }
-
- else if (platform_name == "Apple")
- build_options += "-D__KERNEL_OPENCL_APPLE__ ";
-
- else if (platform_name == "AMD Accelerated Parallel Processing")
- build_options += "-D__KERNEL_OPENCL_AMD__ ";
-
- else if (platform_name == "Intel(R) OpenCL") {
- build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
-
- /* Options for gdb source-level kernel debugging.
- * This currently segfaults on Linux.
- */
- if (OpenCLInfo::use_debug() && debug_src)
- build_options += "-g -s \"" + *debug_src + "\" ";
- }
-
- if (info.has_half_images) {
- build_options += "-D__KERNEL_CL_KHR_FP16__ ";
- }
-
- if (OpenCLInfo::use_debug()) {
- build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
- }
-
-# ifdef WITH_NANOVDB
- if (info.has_nanovdb) {
- build_options += "-DWITH_NANOVDB ";
- }
-# endif
-
- return build_options;
-}
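
As an illustration, on an AMD GPU whose driver reports OpenCL 2.0, the function above yields a string along the lines of "-cl-no-signed-zeros -cl-mad-enable -cl-std=CL2.0 -D__KERNEL_OPENCL_AMD__ ", with the half-float, debug and NanoVDB defines appended when those features apply.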
-
-/* TODO(sergey): In the future we can use variadic templates, once
- * C++0x is allowed. That should let us clean this up a bit; a sketch follows this function.
- */
-int OpenCLDevice::kernel_set_args(cl_kernel kernel,
- int start_argument_index,
- const ArgumentWrapper &arg1,
- const ArgumentWrapper &arg2,
- const ArgumentWrapper &arg3,
- const ArgumentWrapper &arg4,
- const ArgumentWrapper &arg5,
- const ArgumentWrapper &arg6,
- const ArgumentWrapper &arg7,
- const ArgumentWrapper &arg8,
- const ArgumentWrapper &arg9,
- const ArgumentWrapper &arg10,
- const ArgumentWrapper &arg11,
- const ArgumentWrapper &arg12,
- const ArgumentWrapper &arg13,
- const ArgumentWrapper &arg14,
- const ArgumentWrapper &arg15,
- const ArgumentWrapper &arg16,
- const ArgumentWrapper &arg17,
- const ArgumentWrapper &arg18,
- const ArgumentWrapper &arg19,
- const ArgumentWrapper &arg20,
- const ArgumentWrapper &arg21,
- const ArgumentWrapper &arg22,
- const ArgumentWrapper &arg23,
- const ArgumentWrapper &arg24,
- const ArgumentWrapper &arg25,
- const ArgumentWrapper &arg26,
- const ArgumentWrapper &arg27,
- const ArgumentWrapper &arg28,
- const ArgumentWrapper &arg29,
- const ArgumentWrapper &arg30,
- const ArgumentWrapper &arg31,
- const ArgumentWrapper &arg32,
- const ArgumentWrapper &arg33)
-{
- int current_arg_index = 0;
-# define FAKE_VARARG_HANDLE_ARG(arg) \
- do { \
- if (arg.pointer != NULL) { \
- opencl_assert(clSetKernelArg( \
- kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \
- ++current_arg_index; \
- } \
- else { \
- return current_arg_index; \
- } \
- } while (false)
- FAKE_VARARG_HANDLE_ARG(arg1);
- FAKE_VARARG_HANDLE_ARG(arg2);
- FAKE_VARARG_HANDLE_ARG(arg3);
- FAKE_VARARG_HANDLE_ARG(arg4);
- FAKE_VARARG_HANDLE_ARG(arg5);
- FAKE_VARARG_HANDLE_ARG(arg6);
- FAKE_VARARG_HANDLE_ARG(arg7);
- FAKE_VARARG_HANDLE_ARG(arg8);
- FAKE_VARARG_HANDLE_ARG(arg9);
- FAKE_VARARG_HANDLE_ARG(arg10);
- FAKE_VARARG_HANDLE_ARG(arg11);
- FAKE_VARARG_HANDLE_ARG(arg12);
- FAKE_VARARG_HANDLE_ARG(arg13);
- FAKE_VARARG_HANDLE_ARG(arg14);
- FAKE_VARARG_HANDLE_ARG(arg15);
- FAKE_VARARG_HANDLE_ARG(arg16);
- FAKE_VARARG_HANDLE_ARG(arg17);
- FAKE_VARARG_HANDLE_ARG(arg18);
- FAKE_VARARG_HANDLE_ARG(arg19);
- FAKE_VARARG_HANDLE_ARG(arg20);
- FAKE_VARARG_HANDLE_ARG(arg21);
- FAKE_VARARG_HANDLE_ARG(arg22);
- FAKE_VARARG_HANDLE_ARG(arg23);
- FAKE_VARARG_HANDLE_ARG(arg24);
- FAKE_VARARG_HANDLE_ARG(arg25);
- FAKE_VARARG_HANDLE_ARG(arg26);
- FAKE_VARARG_HANDLE_ARG(arg27);
- FAKE_VARARG_HANDLE_ARG(arg28);
- FAKE_VARARG_HANDLE_ARG(arg29);
- FAKE_VARARG_HANDLE_ARG(arg30);
- FAKE_VARARG_HANDLE_ARG(arg31);
- FAKE_VARARG_HANDLE_ARG(arg32);
- FAKE_VARARG_HANDLE_ARG(arg33);
-# undef FAKE_VARARG_HANDLE_ARG
- return current_arg_index;
-}
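
As the TODO notes, variadic templates make the 33-argument fake-varargs pattern unnecessary. A minimal C++11 sketch of the same contract, reusing ArgumentWrapper and returning the number of arguments set (error checking via opencl_assert omitted; illustration only, not part of the original source):

static int kernel_set_args_variadic(cl_kernel /*kernel*/, int /*start_index*/)
{
  return 0; /* Base case: no arguments left to set. */
}

template<typename First, typename... Rest>
static int kernel_set_args_variadic(cl_kernel kernel,
                                    int start_index,
                                    const First &first,
                                    const Rest &...rest)
{
  const ArgumentWrapper arg(first); /* Same implicit wrapping as above. */
  clSetKernelArg(kernel, start_index, arg.size, arg.pointer);
  return 1 + kernel_set_args_variadic(kernel, start_index + 1, rest...);
}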
-
-void OpenCLDevice::release_kernel_safe(cl_kernel kernel)
-{
- if (kernel) {
- clReleaseKernel(kernel);
- }
-}
-
-void OpenCLDevice::release_mem_object_safe(cl_mem mem)
-{
- if (mem != NULL) {
- clReleaseMemObject(mem);
- }
-}
-
-void OpenCLDevice::release_program_safe(cl_program program)
-{
- if (program) {
- clReleaseProgram(program);
- }
-}
-
-/* ** These helpers work around some compiler-specific bugs. ** */
-
-cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker)
-{
- return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker);
-}
-
-void OpenCLDevice::store_cached_kernel(cl_program program,
- ustring key,
- thread_scoped_lock &cache_locker)
-{
- OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker);
-}
-
-Device *opencl_create_split_device(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background)
-{
- return new OpenCLDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp
deleted file mode 100644
index 4330e07cb37..00000000000
--- a/intern/cycles/device/opencl/memory_manager.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "util/util_foreach.h"
-
-# include "device/opencl/device_opencl.h"
-# include "device/opencl/memory_manager.h"
-
-CCL_NAMESPACE_BEGIN
-
-void MemoryManager::DeviceBuffer::add_allocation(Allocation &allocation)
-{
- allocations.push_back(&allocation);
-}
-
-void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device)
-{
- bool need_realloc = false;
-
- /* Calculate total size and remove any freed. */
- size_t total_size = 0;
-
- for (int i = allocations.size() - 1; i >= 0; i--) {
- Allocation *allocation = allocations[i];
-
- /* Remove allocations that have been freed. */
- if (!allocation->mem || allocation->mem->memory_size() == 0) {
- allocation->device_buffer = NULL;
- allocation->size = 0;
-
- allocations.erase(allocations.begin() + i);
-
- need_realloc = true;
-
- continue;
- }
-
- /* Get actual size for allocation. */
- size_t alloc_size = align_up(allocation->mem->memory_size(), 16);
-
- if (allocation->size != alloc_size) {
- /* Allocation is either new or resized. */
- allocation->size = alloc_size;
- allocation->needs_copy_to_device = true;
-
- need_realloc = true;
- }
-
- total_size += alloc_size;
- }
-
- /* Always allocate a non-empty buffer; NULL pointers cause problems with some drivers. */
- total_size = std::max(total_size, (size_t)16);
-
- if (need_realloc) {
- cl_ulong max_buffer_size;
- clGetDeviceInfo(
- device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
-
- if (total_size > max_buffer_size) {
- device->set_error("Scene too complex to fit in available memory.");
- return;
- }
-
- device_only_memory<uchar> *new_buffer = new device_only_memory<uchar>(device,
- "memory manager buffer");
-
- new_buffer->alloc_to_device(total_size);
-
- size_t offset = 0;
-
- foreach (Allocation *allocation, allocations) {
- if (allocation->needs_copy_to_device) {
- /* Copy from host to device. */
- opencl_device_assert(device,
- clEnqueueWriteBuffer(device->cqCommandQueue,
- CL_MEM_PTR(new_buffer->device_pointer),
- CL_FALSE,
- offset,
- allocation->mem->memory_size(),
- allocation->mem->host_pointer,
- 0,
- NULL,
- NULL));
-
- allocation->needs_copy_to_device = false;
- }
- else {
- /* Fast copy from memory already on device. */
- opencl_device_assert(device,
- clEnqueueCopyBuffer(device->cqCommandQueue,
- CL_MEM_PTR(buffer->device_pointer),
- CL_MEM_PTR(new_buffer->device_pointer),
- allocation->desc.offset,
- offset,
- allocation->mem->memory_size(),
- 0,
- NULL,
- NULL));
- }
-
- allocation->desc.offset = offset;
- offset += allocation->size;
- }
-
- delete buffer;
-
- buffer = new_buffer;
- }
- else {
- assert(total_size == buffer->data_size);
-
- size_t offset = 0;
-
- foreach (Allocation *allocation, allocations) {
- if (allocation->needs_copy_to_device) {
- /* Copy from host to device. */
- opencl_device_assert(device,
- clEnqueueWriteBuffer(device->cqCommandQueue,
- CL_MEM_PTR(buffer->device_pointer),
- CL_FALSE,
- offset,
- allocation->mem->memory_size(),
- allocation->mem->host_pointer,
- 0,
- NULL,
- NULL));
-
- allocation->needs_copy_to_device = false;
- }
-
- offset += allocation->size;
- }
- }
-
- /* Not really necessary, but seems to improve responsiveness for some reason. */
- clFinish(device->cqCommandQueue);
-}
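
Note the 16-byte rounding above: align_up(size, 16) rounds a request up to the next multiple of 16, so a 100-byte allocation, for example, occupies 112 bytes in the pooled buffer. Since every sub-allocation size is a multiple of 16 and offsets are their running sum, each allocation's offset stays 16-byte aligned.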
-
-void MemoryManager::DeviceBuffer::free(OpenCLDevice *)
-{
- buffer->free();
-}
-
-MemoryManager::DeviceBuffer *MemoryManager::smallest_device_buffer()
-{
- DeviceBuffer *smallest = device_buffers;
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- if (device_buffer.size < smallest->size) {
- smallest = &device_buffer;
- }
- }
-
- return smallest;
-}
-
-MemoryManager::MemoryManager(OpenCLDevice *device) : device(device), need_update(false)
-{
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.buffer = new device_only_memory<uchar>(device, "memory manager buffer");
- }
-}
-
-void MemoryManager::free()
-{
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.free(device);
- }
-}
-
-void MemoryManager::alloc(const char *name, device_memory &mem)
-{
- Allocation &allocation = allocations[name];
-
- allocation.mem = &mem;
- allocation.needs_copy_to_device = true;
-
- if (!allocation.device_buffer) {
- DeviceBuffer *device_buffer = smallest_device_buffer();
- allocation.device_buffer = device_buffer;
-
- allocation.desc.device_buffer = device_buffer - device_buffers;
-
- device_buffer->add_allocation(allocation);
-
- device_buffer->size += mem.memory_size();
- }
-
- need_update = true;
-}
-
-bool MemoryManager::free(device_memory &mem)
-{
- foreach (AllocationsMap::value_type &value, allocations) {
- Allocation &allocation = value.second;
- if (allocation.mem == &mem) {
-
- allocation.device_buffer->size -= mem.memory_size();
-
- allocation.mem = NULL;
- allocation.needs_copy_to_device = false;
-
- need_update = true;
- return true;
- }
- }
-
- return false;
-}
-
-MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name)
-{
- update_device_memory();
-
- Allocation &allocation = allocations[name];
- return allocation.desc;
-}
-
-void MemoryManager::update_device_memory()
-{
- if (!need_update) {
- return;
- }
-
- need_update = false;
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.update_device_memory(device);
- }
-}
-
-void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
-{
- update_device_memory();
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- if (device_buffer.buffer->device_pointer) {
- device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer);
- }
- else {
- device->kernel_set_args(kernel, (*narg)++);
- }
- }
-}
-
-CCL_NAMESPACE_END
-
-#endif /* WITH_OPENCL */
diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h
deleted file mode 100644
index 23624f837a6..00000000000
--- a/intern/cycles/device/opencl/memory_manager.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "device/device.h"
-
-#include "util/util_map.h"
-#include "util/util_string.h"
-#include "util/util_vector.h"
-
-#include "clew.h"
-
-CCL_NAMESPACE_BEGIN
-
-class OpenCLDevice;
-
-class MemoryManager {
- public:
- static const int NUM_DEVICE_BUFFERS = 8;
-
- struct BufferDescriptor {
- uint device_buffer;
- cl_ulong offset;
- };
-
- private:
- struct DeviceBuffer;
-
- struct Allocation {
- device_memory *mem;
-
- DeviceBuffer *device_buffer;
- size_t size; /* Size of actual allocation, may be larger than requested. */
-
- BufferDescriptor desc;
-
- bool needs_copy_to_device;
-
- Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false)
- {
- }
- };
-
- struct DeviceBuffer {
- device_only_memory<uchar> *buffer;
- vector<Allocation *> allocations;
- size_t size; /* Size of all allocations. */
-
- DeviceBuffer() : buffer(NULL), size(0)
- {
- }
-
- ~DeviceBuffer()
- {
- delete buffer;
- buffer = NULL;
- }
-
- void add_allocation(Allocation &allocation);
-
- void update_device_memory(OpenCLDevice *device);
-
- void free(OpenCLDevice *device);
- };
-
- OpenCLDevice *device;
-
- DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS];
-
- typedef unordered_map<string, Allocation> AllocationsMap;
- AllocationsMap allocations;
-
- bool need_update;
-
- DeviceBuffer *smallest_device_buffer();
-
- public:
- MemoryManager(OpenCLDevice *device);
-
- void free(); /* Free all memory. */
-
- void alloc(const char *name, device_memory &mem);
- bool free(device_memory &mem);
-
- BufferDescriptor get_descriptor(string name);
-
- void update_device_memory();
- void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
-};
-
-CCL_NAMESPACE_END
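
A short usage sketch of this interface (mem_a, mem_b and the surrounding setup are hypothetical, not from the original source). alloc() only queues work; the pooled buffers are packed and uploaded lazily by update_device_memory(), which both get_descriptor() and set_kernel_arg_buffers() trigger:

/* Assume device, kernel, and device_memory instances mem_a/mem_b exist. */
MemoryManager manager(device);
manager.alloc("tex_image_0", mem_a); /* Queued; no device traffic yet. */
manager.alloc("lookup_table", mem_b);

/* Forces update_device_memory(): data is packed into pooled buffers. */
MemoryManager::BufferDescriptor desc = manager.get_descriptor("tex_image_0");
/* desc.device_buffer indexes one of NUM_DEVICE_BUFFERS pools;
 * desc.offset is the byte offset of the data within that pool. */

cl_uint narg = 0;
manager.set_kernel_arg_buffers(kernel, &narg); /* Bind every pool to the kernel. */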
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
deleted file mode 100644
index 3929cf77f15..00000000000
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ /dev/null
@@ -1,1326 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/device_intern.h"
-# include "device/opencl/device_opencl.h"
-
-# include "util/util_debug.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_semaphore.h"
-# include "util/util_system.h"
-# include "util/util_time.h"
-
-using std::cerr;
-using std::endl;
-
-CCL_NAMESPACE_BEGIN
-
-OpenCLCache::Slot::ProgramEntry::ProgramEntry() : program(NULL), mutex(NULL)
-{
-}
-
-OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry &rhs)
- : program(rhs.program), mutex(NULL)
-{
-}
-
-OpenCLCache::Slot::ProgramEntry::~ProgramEntry()
-{
- delete mutex;
-}
-
-OpenCLCache::Slot::Slot() : context_mutex(NULL), context(NULL)
-{
-}
-
-OpenCLCache::Slot::Slot(const Slot &rhs)
- : context_mutex(NULL), context(NULL), programs(rhs.programs)
-{
-}
-
-OpenCLCache::Slot::~Slot()
-{
- delete context_mutex;
-}
-
-OpenCLCache &OpenCLCache::global_instance()
-{
- static OpenCLCache instance;
- return instance;
-}
-
-cl_context OpenCLCache::get_context(cl_platform_id platform,
- cl_device_id device,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- pair<CacheMap::iterator, bool> ins = self.cache.insert(
- CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
-
- Slot &slot = ins.first->second;
-
- /* create slot lock only while holding cache lock */
- if (!slot.context_mutex)
- slot.context_mutex = new thread_mutex;
-
- /* need to unlock cache before locking slot, to allow store to complete */
- cache_lock.unlock();
-
- /* lock the slot */
- slot_locker = thread_scoped_lock(*slot.context_mutex);
-
- /* If the context isn't cached yet. */
- if (slot.context == NULL) {
- /* Return with the caller's slot_locker still holding the slot lock. */
- return NULL;
- }
-
- /* the item was already cached, release the slot lock */
- slot_locker.unlock();
-
- cl_int ciErr = clRetainContext(slot.context);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-
- return slot.context;
-}
-
-cl_program OpenCLCache::get_program(cl_platform_id platform,
- cl_device_id device,
- ustring key,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- pair<CacheMap::iterator, bool> ins = self.cache.insert(
- CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
-
- Slot &slot = ins.first->second;
-
- pair<Slot::EntryMap::iterator, bool> ins2 = slot.programs.insert(
- Slot::EntryMap::value_type(key, Slot::ProgramEntry()));
-
- Slot::ProgramEntry &entry = ins2.first->second;
-
- /* create slot lock only while holding cache lock */
- if (!entry.mutex)
- entry.mutex = new thread_mutex;
-
- /* need to unlock cache before locking slot, to allow store to complete */
- cache_lock.unlock();
-
- /* lock the slot */
- slot_locker = thread_scoped_lock(*entry.mutex);
-
- /* If the program isn't cached yet. */
- if (entry.program == NULL) {
- /* Return with the caller's slot_locker still holding the slot lock. */
- return NULL;
- }
-
- /* the item was already cached, release the slot lock */
- slot_locker.unlock();
-
- cl_int ciErr = clRetainProgram(entry.program);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-
- return entry.program;
-}
-
-void OpenCLCache::store_context(cl_platform_id platform,
- cl_device_id device,
- cl_context context,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
- assert(device != NULL);
- assert(context != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
- CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
- cache_lock.unlock();
-
- /* Sanity check: the slot must exist and must not be filled yet. */
- assert(i != self.cache.end());
-
- Slot &slot = i->second;
- assert(slot.context == NULL);
-
- slot.context = context;
-
- /* unlock the slot */
- slot_locker.unlock();
-
- /* increment reference count in OpenCL.
- * The caller is going to release the object when done with it. */
- cl_int ciErr = clRetainContext(context);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-}
-
-void OpenCLCache::store_program(cl_platform_id platform,
- cl_device_id device,
- cl_program program,
- ustring key,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
- assert(device != NULL);
- assert(program != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
- assert(i != self.cache.end());
- Slot &slot = i->second;
-
- Slot::EntryMap::iterator i2 = slot.programs.find(key);
- assert(i2 != slot.programs.end());
- Slot::ProgramEntry &entry = i2->second;
-
- assert(entry.program == NULL);
-
- cache_lock.unlock();
-
- entry.program = program;
-
- /* unlock the slot */
- slot_locker.unlock();
-
- /* Increment reference count in OpenCL.
- * The caller is going to release the object when done with it.
- */
- cl_int ciErr = clRetainProgram(program);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-}
-
-string OpenCLCache::get_kernel_md5()
-{
- OpenCLCache &self = global_instance();
- thread_scoped_lock lock(self.kernel_md5_lock);
-
- if (self.kernel_md5.empty()) {
- self.kernel_md5 = path_files_md5_hash(path_get("source"));
- }
- return self.kernel_md5;
-}
-
-static string get_program_source(const string &kernel_file)
-{
- string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
- /* We compile kernels consisting of many files. Unfortunately, OpenCL
- * kernel caches do not seem to recognize changes in included files,
- * so we force a recompile on changes by adding the md5 hash of all files.
- */
- source = path_source_replace_includes(source, path_get("source"));
- source += "\n// " + util_md5_string(source) + "\n";
- return source;
-}
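
For instance, with kernel_file "filter.cl" the seed source is the single line #include "kernel/kernels/opencl/filter.cl"; after path_source_replace_includes() inlines every header, a trailing comment such as "// 2f3a9c..." (the md5 of the expanded source; digest value illustrative) is appended, so any edit to an included file changes the program string and defeats stale driver-side caches.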
-
-OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device,
- const string &program_name,
- const string &kernel_file,
- const string &kernel_build_options,
- bool use_stdout)
- : device(device),
- program_name(program_name),
- kernel_file(kernel_file),
- kernel_build_options(kernel_build_options),
- use_stdout(use_stdout)
-{
- loaded = false;
- needs_compiling = true;
- program = NULL;
-}
-
-OpenCLDevice::OpenCLProgram::~OpenCLProgram()
-{
- release();
-}
-
-void OpenCLDevice::OpenCLProgram::release()
-{
- for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end();
- ++kernel) {
- if (kernel->second) {
- clReleaseKernel(kernel->second);
- kernel->second = NULL;
- }
- }
- if (program) {
- clReleaseProgram(program);
- program = NULL;
- }
-}
-
-void OpenCLDevice::OpenCLProgram::add_log(const string &msg, bool debug)
-{
- if (!use_stdout) {
- log += msg + "\n";
- }
- else if (!debug) {
- printf("%s\n", msg.c_str());
- fflush(stdout);
- }
- else {
- VLOG(2) << msg;
- }
-}
-
-void OpenCLDevice::OpenCLProgram::add_error(const string &msg)
-{
- if (use_stdout) {
- fprintf(stderr, "%s\n", msg.c_str());
- }
- if (error_msg == "") {
- error_msg += "\n";
- }
- error_msg += msg;
-}
-
-void OpenCLDevice::OpenCLProgram::add_kernel(ustring name)
-{
- if (!kernels.count(name)) {
- kernels[name] = NULL;
- }
-}
-
-bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src)
-{
- string build_options;
- build_options = device->kernel_build_options(debug_src) + kernel_build_options;
-
- VLOG(1) << "Build options passed to clBuildProgram: '" << build_options << "'.";
- cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
-
- /* show warnings even if build is successful */
- size_t ret_val_size = 0;
-
- clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
-
- if (ciErr != CL_SUCCESS) {
- add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) +
- ", errors in console.");
- }
-
- if (ret_val_size > 1) {
- vector<char> build_log(ret_val_size + 1);
- clGetProgramBuildInfo(
- program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
-
- build_log[ret_val_size] = '\0';
- /* Skip meaningless empty output from the NVidia compiler. */
- if (!(ret_val_size == 2 && build_log[0] == '\n')) {
- add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]),
- ciErr == CL_SUCCESS);
- }
- }
-
- return (ciErr == CL_SUCCESS);
-}
-
-bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src)
-{
- string source = get_program_source(kernel_file);
-
- if (debug_src) {
- path_write_text(*debug_src, source);
- }
-
- size_t source_len = source.size();
- const char *source_str = source.c_str();
- cl_int ciErr;
-
- program = clCreateProgramWithSource(device->cxContext, 1, &source_str, &source_len, &ciErr);
-
- if (ciErr != CL_SUCCESS) {
- add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr));
- return false;
- }
-
- double starttime = time_dt();
- add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
- add_log(string("Build flags: ") + kernel_build_options, true);
-
- if (!build_kernel(debug_src))
- return false;
-
- double elapsed = time_dt() - starttime;
- add_log(
- string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed),
- false);
-
- return true;
-}
-
-static void escape_python_string(string &str)
-{
- /* Escape string to be passed as a Python raw string with '' quotes. */
- string_replace(str, "'", "\\'");
-}
-
-static int opencl_compile_process_limit()
-{
- /* Limit the number of concurrent compile processes, with a heuristic based
- * on total physical RAM and an estimate of the memory needed when compiling
- * with all Cycles features enabled.
- *
- * This is somewhat arbitrary, as we don't know the actual available RAM or
- * how much memory the kernel compilation will need depending on the features,
- * but it is better than not limiting at all. */
- static const int64_t GB = 1024LL * 1024LL * 1024LL;
- static const int64_t process_memory = 2 * GB;
- static const int64_t base_memory = 2 * GB;
- static const int64_t system_memory = system_physical_ram();
- static const int64_t process_limit = (system_memory - base_memory) / process_memory;
-
- return max((int)process_limit, 1);
-}
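
Worked example of the heuristic: on a 16 GB machine, (16 GB - 2 GB base) / 2 GB per process = 7 concurrent compile processes; on a 4 GB machine the quotient is 1, and the max() clamp guarantees at least one process even when the quotient would be zero or negative.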
-
-bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin)
-{
- /* Construct arguments. */
- vector<string> args;
- args.push_back("--background");
- args.push_back("--factory-startup");
- args.push_back("--python-expr");
-
- int device_platform_id = device->device_num;
- string device_name = device->device_name;
- string platform_name = device->platform_name;
- string build_options = device->kernel_build_options(NULL) + kernel_build_options;
- string kernel_file_escaped = kernel_file;
- string clbin_escaped = clbin;
-
- escape_python_string(device_name);
- escape_python_string(platform_name);
- escape_python_string(build_options);
- escape_python_string(kernel_file_escaped);
- escape_python_string(clbin_escaped);
-
- args.push_back(string_printf(
- "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')",
- device_platform_id,
- device_name.c_str(),
- platform_name.c_str(),
- build_options.c_str(),
- kernel_file_escaped.c_str(),
- clbin_escaped.c_str()));
-
- /* Limit number of concurrent processes compiling. */
- static thread_counting_semaphore semaphore(opencl_compile_process_limit());
- semaphore.acquire();
-
- /* Compile. */
- const double starttime = time_dt();
- add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
- add_log(string("Build flags: ") + kernel_build_options, true);
- const bool success = system_call_self(args);
- const double elapsed = time_dt() - starttime;
-
- semaphore.release();
-
- if (!success || !path_exists(clbin)) {
- return false;
- }
-
- add_log(
- string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed),
- false);
-
- return load_binary(clbin);
-}
-
-/* Compile OpenCL kernel. This method is called from the _cycles Python
- * module to compile kernels. Parameters must match the function above. */
-bool device_opencl_compile_kernel(const vector<string> &parameters)
-{
- int device_platform_id = std::stoi(parameters[0]);
- const string &device_name = parameters[1];
- const string &platform_name = parameters[2];
- const string &build_options = parameters[3];
- const string &kernel_file = parameters[4];
- const string &binary_path = parameters[5];
-
- if (clewInit() != CLEW_SUCCESS) {
- return false;
- }
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- if (device_platform_id >= usable_devices.size()) {
- return false;
- }
-
- OpenCLPlatformDevice &platform_device = usable_devices[device_platform_id];
- if (platform_device.platform_name != platform_name ||
- platform_device.device_name != device_name) {
- return false;
- }
-
- cl_platform_id platform = platform_device.platform_id;
- cl_device_id device = platform_device.device_id;
- const cl_context_properties context_props[] = {
- CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0, 0};
-
- cl_int err;
- cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err);
- if (err != CL_SUCCESS) {
- return false;
- }
-
- string source = get_program_source(kernel_file);
- size_t source_len = source.size();
- const char *source_str = source.c_str();
- cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err);
- bool result = false;
-
- if (err == CL_SUCCESS) {
- err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
-
- if (err == CL_SUCCESS) {
- size_t size = 0;
- clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
- if (size > 0) {
- vector<uint8_t> binary(size);
- uint8_t *bytes = &binary[0];
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL);
- result = path_write_binary(binary_path, binary);
- }
- }
- clReleaseProgram(program);
- }
-
- clReleaseContext(context);
-
- return result;
-}
-
-bool OpenCLDevice::OpenCLProgram::load_binary(const string &clbin, const string *debug_src)
-{
- /* read binary into memory */
- vector<uint8_t> binary;
-
- if (!path_read_binary(clbin, binary)) {
- add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
- return false;
- }
-
- /* create program */
- cl_int status, ciErr;
- size_t size = binary.size();
- const uint8_t *bytes = &binary[0];
-
- program = clCreateProgramWithBinary(
- device->cxContext, 1, &device->cdDevice, &size, &bytes, &status, &ciErr);
-
- if (status != CL_SUCCESS || ciErr != CL_SUCCESS) {
- add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " +
- clewErrorString(status) + " " + clewErrorString(ciErr));
- return false;
- }
-
- if (!build_kernel(debug_src))
- return false;
-
- return true;
-}
-
-bool OpenCLDevice::OpenCLProgram::save_binary(const string &clbin)
-{
- size_t size = 0;
- clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
-
- if (!size)
- return false;
-
- vector<uint8_t> binary(size);
- uint8_t *bytes = &binary[0];
-
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL);
-
- return path_write_binary(clbin, binary);
-}
-
-bool OpenCLDevice::OpenCLProgram::load()
-{
- loaded = false;
- string device_md5 = device->device_md5_hash(kernel_build_options);
-
- /* Try to use cached kernel. */
- thread_scoped_lock cache_locker;
- ustring cache_key(program_name + device_md5);
- program = device->load_cached_kernel(cache_key, cache_locker);
- if (!program) {
- add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
-
- /* need to create source to get md5 */
- string source = get_program_source(kernel_file);
-
- string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" +
- util_md5_string(source);
- basename = path_cache_get(path_join("kernels", basename));
- string clbin = basename + ".clbin";
-
- /* If binary kernel exists already, try use it. */
- if (path_exists(clbin) && load_binary(clbin)) {
- /* Kernel loaded from binary, nothing to do. */
- add_log(string("Loaded program from ") + clbin + ".", true);
-
- /* Cache the program. */
- device->store_cached_kernel(program, cache_key, cache_locker);
- }
- else {
- add_log(string("OpenCL program ") + program_name + " not found on disk.", true);
- cache_locker.unlock();
- }
- }
-
- if (program) {
- create_kernels();
- loaded = true;
- needs_compiling = false;
- }
-
- return loaded;
-}
-
-void OpenCLDevice::OpenCLProgram::compile()
-{
- assert(device);
-
- string device_md5 = device->device_md5_hash(kernel_build_options);
-
- /* Try to use cached kernel. */
- thread_scoped_lock cache_locker;
- ustring cache_key(program_name + device_md5);
- program = device->load_cached_kernel(cache_key, cache_locker);
-
- if (!program) {
-
- add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
-
- /* need to create source to get md5 */
- string source = get_program_source(kernel_file);
-
- string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" +
- util_md5_string(source);
- basename = path_cache_get(path_join("kernels", basename));
- string clbin = basename + ".clbin";
-
- /* path to preprocessed source for debugging */
- string clsrc, *debug_src = NULL;
-
- if (OpenCLInfo::use_debug()) {
- clsrc = basename + ".cl";
- debug_src = &clsrc;
- }
-
- if (DebugFlags().running_inside_blender && compile_separate(clbin)) {
- add_log(string("Built and loaded program from ") + clbin + ".", true);
- loaded = true;
- }
- else {
- if (DebugFlags().running_inside_blender) {
- add_log(string("Separate-process building of ") + clbin +
- " failed, will fall back to regular building.",
- true);
- }
-
- /* If the binary does not exist or loading it failed, compile the kernel. */
- if (!compile_kernel(debug_src)) {
- needs_compiling = false;
- return;
- }
-
- /* Save binary for reuse. */
- if (!save_binary(clbin)) {
- add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true);
- }
- }
-
- /* Cache the program. */
- device->store_cached_kernel(program, cache_key, cache_locker);
- }
-
- create_kernels();
- needs_compiling = false;
- loaded = true;
-}
-
-void OpenCLDevice::OpenCLProgram::create_kernels()
-{
- for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end();
- ++kernel) {
- assert(kernel->second == NULL);
- cl_int ciErr;
- string name = "kernel_ocl_" + kernel->first.string();
- kernel->second = clCreateKernel(program, name.c_str(), &ciErr);
- if (device->opencl_error(ciErr)) {
- add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " +
- clewErrorString(ciErr));
- return;
- }
- }
-}
-
-bool OpenCLDevice::OpenCLProgram::wait_for_availability()
-{
- add_log(string("Waiting for availability of ") + program_name + ".", true);
- while (needs_compiling) {
- time_sleep(0.1);
- }
- return loaded;
-}
-
-void OpenCLDevice::OpenCLProgram::report_error()
-{
- /* If loaded is true, there was no error. */
- if (loaded)
- return;
- /* If use_stdout is true, the error was already reported. */
- if (use_stdout)
- return;
-
- cerr << error_msg << endl;
- if (!compile_output.empty()) {
- cerr << "OpenCL kernel build output for " << program_name << ":" << endl;
- cerr << compile_output << endl;
- }
-}
-
-cl_kernel OpenCLDevice::OpenCLProgram::operator()()
-{
- assert(kernels.size() == 1);
- return kernels.begin()->second;
-}
-
-cl_kernel OpenCLDevice::OpenCLProgram::operator()(ustring name)
-{
- assert(kernels.count(name));
- return kernels[name];
-}
-
-cl_device_type OpenCLInfo::device_type()
-{
- switch (DebugFlags().opencl.device_type) {
- case DebugFlags::OpenCL::DEVICE_NONE:
- return 0;
- case DebugFlags::OpenCL::DEVICE_ALL:
- return CL_DEVICE_TYPE_ALL;
- case DebugFlags::OpenCL::DEVICE_DEFAULT:
- return CL_DEVICE_TYPE_DEFAULT;
- case DebugFlags::OpenCL::DEVICE_CPU:
- return CL_DEVICE_TYPE_CPU;
- case DebugFlags::OpenCL::DEVICE_GPU:
- return CL_DEVICE_TYPE_GPU;
- case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
- return CL_DEVICE_TYPE_ACCELERATOR;
- default:
- return CL_DEVICE_TYPE_ALL;
- }
-}
-
-bool OpenCLInfo::use_debug()
-{
- return DebugFlags().opencl.debug;
-}
-
-bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_id device_id)
-{
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type)) {
- return false;
- }
- string device_name;
- if (!get_device_name(device_id, &device_name)) {
- return false;
- }
-
- int driver_major = 0;
- int driver_minor = 0;
- if (!get_driver_version(device_id, &driver_major, &driver_minor)) {
- return false;
- }
- VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor;
-
- if (getenv("CYCLES_OPENCL_TEST")) {
- return true;
- }
-
- /* Allow Intel GPUs on Intel OpenCL platform. */
- if (platform_name.find("Intel") != string::npos) {
- if (device_type != CL_DEVICE_TYPE_GPU) {
- /* OpenCL on Intel CPU is not an officially supported configuration.
- * Use hybrid CPU+GPU rendering to utilize both GPU and CPU. */
- return false;
- }
-
-# ifdef __APPLE__
- /* Apple uses its own framework, which can also put Iris onto the AMD framework.
- * This isn't a supported configuration. */
- return false;
-# else
- if (device_name.find("Iris") != string::npos || device_name.find("Xe") != string::npos) {
- return true;
- }
-# endif
- }
-
- if (platform_name == "AMD Accelerated Parallel Processing" &&
- device_type == CL_DEVICE_TYPE_GPU) {
- if (driver_major < 2236) {
- VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported.";
- return false;
- }
- const char *blacklist[] = {/* GCN 1 */
- "Tahiti",
- "Pitcairn",
- "Capeverde",
- "Oland",
- "Hainan",
- NULL};
- for (int i = 0; blacklist[i] != NULL; i++) {
- if (device_name == blacklist[i]) {
- VLOG(1) << "AMD device " << device_name << " not supported";
- return false;
- }
- }
- return true;
- }
- if (platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
- return false;
- }
- return false;
-}
-
-bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error)
-{
- const int req_major = 1, req_minor = 1;
- int major, minor;
- char version[256];
- clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
- if (error != NULL) {
- *error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
- }
- return false;
- }
- if (!((major == req_major && minor >= req_minor) || (major > req_major))) {
- if (error != NULL) {
- *error = string_printf(
- "OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
-
-bool OpenCLInfo::get_device_version(cl_device_id device, int *r_major, int *r_minor, string *error)
-{
- char version[256];
- clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL C %d.%d", r_major, r_minor) < 2) {
- if (error != NULL) {
- *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
-
-bool OpenCLInfo::device_version_check(cl_device_id device, string *error)
-{
- const int req_major = 1, req_minor = 1;
- int major, minor;
- if (!get_device_version(device, &major, &minor, error)) {
- return false;
- }
-
- if (!((major == req_major && minor >= req_minor) || (major > req_major))) {
- if (error != NULL) {
- *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
-
-string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id device_id)
-{
- if (platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") {
- /* Use cl_amd_device_topology extension. */
- cl_char topology[24];
- if (clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS &&
- topology[0] == 1) {
- return string_printf("%02x:%02x.%01x",
- (unsigned int)topology[21],
- (unsigned int)topology[22],
- (unsigned int)topology[23]);
- }
- }
- else if (platform_name == "NVIDIA CUDA") {
- /* Use two undocumented options of the cl_nv_device_attribute_query extension. */
- cl_int bus_id, slot_id;
- if (clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS &&
- clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) {
- return string_printf("%02x:%02x.%01x",
- (unsigned int)(bus_id),
- (unsigned int)(slot_id >> 3),
- (unsigned int)(slot_id & 0x7));
- }
- }
- /* No general way to get a hardware ID from OpenCL => give up. */
- return "";
-}
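
For example, on the NVIDIA path a device reporting bus_id 0x02 and slot_id 0x1A (values illustrative) decodes to PCI device 0x1A >> 3 = 0x03 and function 0x1A & 0x7 = 0x2, so the returned identifier is "02:03.2"; the AMD path reads the same triple directly from the last three bytes of the topology struct.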
-
-void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
-{
- const cl_device_type device_type = OpenCLInfo::device_type();
- static bool first_time = true;
-# define FIRST_VLOG(severity) \
- if (first_time) \
- VLOG(severity)
-
- usable_devices->clear();
-
- if (device_type == 0) {
- FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
- first_time = false;
- return;
- }
-
- cl_int error;
- vector<cl_device_id> device_ids;
- vector<cl_platform_id> platform_ids;
-
- /* Get platforms. */
- if (!get_platforms(&platform_ids, &error)) {
- FIRST_VLOG(2) << "Error fetching platforms:" << string(clewErrorString(error));
- first_time = false;
- return;
- }
- if (platform_ids.size() == 0) {
- FIRST_VLOG(2) << "No OpenCL platforms were found.";
- first_time = false;
- return;
- }
- /* Devices are numbered consecutively across platforms. */
- for (int platform = 0; platform < platform_ids.size(); platform++) {
- cl_platform_id platform_id = platform_ids[platform];
- string platform_name;
- if (!get_platform_name(platform_id, &platform_name)) {
- FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
- continue;
- }
- FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << ".";
- if (!platform_version_check(platform_id)) {
-      FIRST_VLOG(2) << "Ignoring platform " << platform_name
-                    << " because its OpenCL platform version is too old.";
- continue;
- }
- if (!get_platform_devices(platform_id, device_type, &device_ids, &error)) {
-      FIRST_VLOG(2) << "Ignoring platform " << platform_name
-                    << ", failed to fetch devices: " << string(clewErrorString(error));
- continue;
- }
- if (device_ids.size() == 0) {
- FIRST_VLOG(2) << "Ignoring platform " << platform_name << ", it has no devices.";
- continue;
- }
- for (int num = 0; num < device_ids.size(); num++) {
- const cl_device_id device_id = device_ids[num];
- string device_name;
- if (!get_device_name(device_id, &device_name, &error)) {
- FIRST_VLOG(2) << "Failed to fetch device name: " << string(clewErrorString(error))
- << ", ignoring.";
- continue;
- }
- if (!device_version_check(device_id)) {
- FIRST_VLOG(2) << "Ignoring device " << device_name << " due to old compiler version.";
- continue;
- }
- if (device_supported(platform_name, device_id)) {
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type, &error)) {
-          FIRST_VLOG(2) << "Ignoring device " << device_name
-                        << ", failed to fetch device type: " << string(clewErrorString(error));
- continue;
- }
- string readable_device_name = get_readable_device_name(device_id);
- if (readable_device_name != device_name) {
- FIRST_VLOG(2) << "Using more readable device name: " << readable_device_name;
- }
- FIRST_VLOG(2) << "Adding new device " << readable_device_name << ".";
- string hardware_id = get_hardware_id(platform_name, device_id);
- string device_extensions = get_device_extensions(device_id);
- usable_devices->push_back(OpenCLPlatformDevice(platform_id,
- platform_name,
- device_id,
- device_type,
- readable_device_name,
- hardware_id,
- device_extensions));
- }
- else {
- FIRST_VLOG(2) << "Ignoring device " << device_name << ", not officially supported yet.";
- }
- }
- }
- first_time = false;
-}
-
-bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error)
-{
- /* Reset from possible previous state. */
- platform_ids->resize(0);
- cl_uint num_platforms;
- if (!get_num_platforms(&num_platforms, error)) {
- return false;
- }
- /* Get actual platforms. */
- cl_int err;
- platform_ids->resize(num_platforms);
- if ((err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-vector<cl_platform_id> OpenCLInfo::get_platforms()
-{
- vector<cl_platform_id> platform_ids;
- get_platforms(&platform_ids);
- return platform_ids;
-}
-
-bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error)
-{
- cl_int err;
- if ((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *num_platforms = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_uint OpenCLInfo::get_num_platforms()
-{
- cl_uint num_platforms;
- if (!get_num_platforms(&num_platforms)) {
- return 0;
- }
- return num_platforms;
-}
-
-bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string *platform_name)
-{
- char buffer[256];
- if (clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(buffer), &buffer, NULL) !=
- CL_SUCCESS) {
- *platform_name = "";
- return false;
- }
- *platform_name = buffer;
- return true;
-}
-
-string OpenCLInfo::get_platform_name(cl_platform_id platform_id)
-{
- string platform_name;
- if (!get_platform_name(platform_id, &platform_name)) {
- return "";
- }
- return platform_name;
-}
-
-bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- cl_uint *num_devices,
- cl_int *error)
-{
- cl_int err;
- if ((err = clGetDeviceIDs(platform_id, device_type, 0, NULL, num_devices)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *num_devices = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type)
-{
- cl_uint num_devices;
- if (!get_num_platform_devices(platform_id, device_type, &num_devices)) {
- return 0;
- }
- return num_devices;
-}
-
-bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- vector<cl_device_id> *device_ids,
- cl_int *error)
-{
- /* Reset from possible previous state. */
- device_ids->resize(0);
- /* Get number of devices to pre-allocate memory. */
- cl_uint num_devices;
- if (!get_num_platform_devices(platform_id, device_type, &num_devices, error)) {
- return false;
- }
- /* Get actual device list. */
- device_ids->resize(num_devices);
- cl_int err;
- if ((err = clGetDeviceIDs(platform_id, device_type, num_devices, &device_ids->at(0), NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type)
-{
- vector<cl_device_id> devices;
- get_platform_devices(platform_id, device_type, &devices);
- return devices;
-}
-
-bool OpenCLInfo::get_device_name(cl_device_id device_id, string *device_name, cl_int *error)
-{
- char buffer[1024];
- cl_int err;
- if ((err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), &buffer, NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_name = "";
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- *device_name = buffer;
- return true;
-}
-
-string OpenCLInfo::get_device_name(cl_device_id device_id)
-{
- string device_name;
- if (!get_device_name(device_id, &device_name)) {
- return "";
- }
- return device_name;
-}
-
-bool OpenCLInfo::get_device_extensions(cl_device_id device_id,
- string *device_extensions,
- cl_int *error)
-{
- size_t extension_length = 0;
- cl_int err;
- /* Determine the size of the extension string. */
- if ((err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, 0, &extension_length)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_extensions = "";
- return false;
- }
- vector<char> buffer(extension_length);
- if ((err = clGetDeviceInfo(
- device_id, CL_DEVICE_EXTENSIONS, extension_length, buffer.data(), NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_extensions = "";
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- *device_extensions = string(buffer.data());
- return true;
-}
-
-string OpenCLInfo::get_device_extensions(cl_device_id device_id)
-{
- string device_extensions;
- if (!get_device_extensions(device_id, &device_extensions)) {
- return "";
- }
- return device_extensions;
-}
-
-bool OpenCLInfo::get_device_type(cl_device_id device_id,
- cl_device_type *device_type,
- cl_int *error)
-{
- cl_int err;
- if ((err = clGetDeviceInfo(
- device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), device_type, NULL)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_type = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id)
-{
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type)) {
- return 0;
- }
- return device_type;
-}
-
-string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
-{
- string name = "";
- char board_name[1024];
- size_t length = 0;
- if (clGetDeviceInfo(
- device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, &length) ==
- CL_SUCCESS) {
- if (length != 0 && board_name[0] != '\0') {
- name = board_name;
- }
- }
-
- /* Fallback to standard device name API. */
- if (name.empty()) {
- name = get_device_name(device_id);
- }
-
-  /* Special exception for AMD Vega: we need to be able to tell
-   * Vega 56 and Vega 64 apart.
-   */
- if (name == "Radeon RX Vega") {
- cl_int max_compute_units = 0;
- if (clGetDeviceInfo(device_id,
- CL_DEVICE_MAX_COMPUTE_UNITS,
- sizeof(max_compute_units),
- &max_compute_units,
- NULL) == CL_SUCCESS) {
- name += " " + to_string(max_compute_units);
- }
- }
-
- /* Distinguish from our native CPU device. */
- if (get_device_type(device_id) & CL_DEVICE_TYPE_CPU) {
- name += " (OpenCL)";
- }
-
- return name;
-}
-
-bool OpenCLInfo::get_driver_version(cl_device_id device_id, int *major, int *minor, cl_int *error)
-{
- char buffer[1024];
- cl_int err;
- if ((err = clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), &buffer, NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- if (sscanf(buffer, "%d.%d", major, minor) < 2) {
- VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer);
- return false;
- }
- return true;
-}
-
-int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id)
-{
- int base_align_bits;
- if (clGetDeviceInfo(
- device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &base_align_bits, NULL) ==
- CL_SUCCESS) {
- return base_align_bits / 8;
- }
- return 1;
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp
new file mode 100644
index 00000000000..13f23bd229a
--- /dev/null
+++ b/intern/cycles/device/optix/device.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/optix/device.h"
+
+#include "device/cuda/device.h"
+#include "device/optix/device_impl.h"
+#include "util/util_logging.h"
+
+#ifdef WITH_OPTIX
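+/* NOTE: Including this header defines the global OptiX function table
+ * (g_optixFunctionTable), which must happen in exactly one translation unit; optixInit()
+ * fills the table in at runtime from the OptiX library shipped with the display driver. */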
+# include <optix_function_table_definition.h>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+bool device_optix_init()
+{
+#ifdef WITH_OPTIX
+ if (g_optixFunctionTable.optixDeviceContextCreate != NULL) {
+ /* Already initialized function table. */
+ return true;
+ }
+
+ /* Need to initialize CUDA as well. */
+ if (!device_cuda_init()) {
+ return false;
+ }
+
+ const OptixResult result = optixInit();
+
+ if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
+ VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
+ "Please update to the latest driver first!";
+ return false;
+ }
+ else if (result != OPTIX_SUCCESS) {
+ VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
+ return false;
+ }
+
+ /* Loaded OptiX successfully! */
+ return true;
+#else
+ return false;
+#endif
+}
+
+void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
+{
+#ifdef WITH_OPTIX
+ devices.reserve(cuda_devices.size());
+
+ /* Simply add all supported CUDA devices as OptiX devices again. */
+ for (DeviceInfo info : cuda_devices) {
+ assert(info.type == DEVICE_CUDA);
+
+ int major;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
+ if (major < 5) {
+ /* Only Maxwell and up are supported by OptiX. */
+ continue;
+ }
+
+ info.type = DEVICE_OPTIX;
+ info.id += "_OptiX";
+ info.denoisers |= DENOISER_OPTIX;
+
+ devices.push_back(info);
+ }
+#else
+ (void)cuda_devices;
+ (void)devices;
+#endif
+}
+
+Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+#ifdef WITH_OPTIX
+ return new OptiXDevice(info, stats, profiler);
+#else
+ (void)info;
+ (void)stats;
+ (void)profiler;
+
+ LOG(FATAL) << "Request to create OptiX device without compiled-in support. Should never happen.";
+
+ return nullptr;
+#endif
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/optix/device.h b/intern/cycles/device/optix/device.h
new file mode 100644
index 00000000000..29fa729c2e4
--- /dev/null
+++ b/intern/cycles/device/optix/device.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_optix_init();
+
+Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
new file mode 100644
index 00000000000..b54d423a183
--- /dev/null
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -0,0 +1,1573 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "device/optix/device_impl.h"
+
+# include "bvh/bvh.h"
+# include "bvh/bvh_optix.h"
+# include "integrator/pass_accessor_gpu.h"
+# include "render/buffers.h"
+# include "render/hair.h"
+# include "render/mesh.h"
+# include "render/object.h"
+# include "render/pass.h"
+# include "render/scene.h"
+
+# include "util/util_debug.h"
+# include "util/util_logging.h"
+# include "util/util_md5.h"
+# include "util/util_path.h"
+# include "util/util_progress.h"
+# include "util/util_time.h"
+
+# undef __KERNEL_CPU__
+# define __KERNEL_OPTIX__
+# include "kernel/device/optix/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+OptiXDevice::Denoiser::Denoiser(OptiXDevice *device)
+ : device(device), queue(device), state(device, "__denoiser_state")
+{
+}
+
+OptiXDevice::Denoiser::~Denoiser()
+{
+ const CUDAContextScope scope(device);
+ if (optix_denoiser != nullptr) {
+ optixDenoiserDestroy(optix_denoiser);
+ }
+}
+
+OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : CUDADevice(info, stats, profiler),
+ sbt_data(this, "__sbt", MEM_READ_ONLY),
+ launch_params(this, "__params"),
+ denoiser_(this)
+{
+ /* Make the CUDA context current. */
+ if (!cuContext) {
+ /* Do not initialize if CUDA context creation failed already. */
+ return;
+ }
+ const CUDAContextScope scope(this);
+
+ /* Create OptiX context for this device. */
+ OptixDeviceContextOptions options = {};
+# ifdef WITH_CYCLES_LOGGING
+ options.logCallbackLevel = 4; /* Fatal = 1, Error = 2, Warning = 3, Print = 4. */
+ options.logCallbackFunction = [](unsigned int level, const char *, const char *message, void *) {
+ switch (level) {
+ case 1:
+ LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
+ break;
+ case 2:
+ LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
+ break;
+ case 3:
+ LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
+ break;
+ case 4:
+ LOG_IF(INFO, VLOG_IS_ON(1)) << message;
+ break;
+ }
+ };
+# endif
+ if (DebugFlags().optix.use_debug) {
+ options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
+ }
+ optix_assert(optixDeviceContextCreate(cuContext, &options, &context));
+# ifdef WITH_CYCLES_LOGGING
+ optix_assert(optixDeviceContextSetLogCallback(
+ context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
+# endif
+
+ /* Fix weird compiler bug that assigns wrong size. */
+ launch_params.data_elements = sizeof(KernelParamsOptiX);
+
+ /* Allocate launch parameter buffer memory on device. */
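+  /* A single element of type KernelParamsOptiX; device-side code accesses it through the
+   * "__params" variable named in the pipeline compile options (see load_kernels()). */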
+ launch_params.alloc_to_device(1);
+}
+
+OptiXDevice::~OptiXDevice()
+{
+ /* Make CUDA context current. */
+ const CUDAContextScope scope(this);
+
+ free_bvh_memory_delayed();
+
+ sbt_data.free();
+ texture_info.free();
+ launch_params.free();
+
+ /* Unload modules. */
+ if (optix_module != NULL) {
+ optixModuleDestroy(optix_module);
+ }
+ for (unsigned int i = 0; i < 2; ++i) {
+ if (builtin_modules[i] != NULL) {
+ optixModuleDestroy(builtin_modules[i]);
+ }
+ }
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ if (pipelines[i] != NULL) {
+ optixPipelineDestroy(pipelines[i]);
+ }
+ }
+
+ optixDeviceContextDestroy(context);
+}
+
+unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create()
+{
+ return make_unique<OptiXDeviceQueue>(this);
+}
+
+BVHLayoutMask OptiXDevice::get_bvh_layout_mask() const
+{
+ /* OptiX has its own internal acceleration structure format. */
+ return BVH_LAYOUT_OPTIX;
+}
+
+string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features)
+{
+ string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features);
+
+ /* Add OptiX SDK include directory to include paths. */
+ const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
+ if (optix_sdk_path) {
+ common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
+ }
+
+ /* Specialization for shader raytracing. */
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ common_cflags += " --keep-device-functions";
+ }
+
+ return common_cflags;
+}
+
+bool OptiXDevice::load_kernels(const uint kernel_features)
+{
+ if (have_error()) {
+ /* Abort early if context creation failed already. */
+ return false;
+ }
+
+ /* Load CUDA modules because we need some of the utility kernels. */
+ if (!CUDADevice::load_kernels(kernel_features)) {
+ return false;
+ }
+
+ /* Skip creating OptiX module if only doing denoising. */
+ if (!(kernel_features & (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING))) {
+ return true;
+ }
+
+ const CUDAContextScope scope(this);
+
+ /* Unload existing OptiX module and pipelines first. */
+ if (optix_module != NULL) {
+ optixModuleDestroy(optix_module);
+ optix_module = NULL;
+ }
+ for (unsigned int i = 0; i < 2; ++i) {
+ if (builtin_modules[i] != NULL) {
+ optixModuleDestroy(builtin_modules[i]);
+ builtin_modules[i] = NULL;
+ }
+ }
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ if (pipelines[i] != NULL) {
+ optixPipelineDestroy(pipelines[i]);
+ pipelines[i] = NULL;
+ }
+ }
+
+ OptixModuleCompileOptions module_options = {};
+ module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */
+
+ if (DebugFlags().optix.use_debug) {
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+ }
+ else {
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+ }
+
+ module_options.boundValues = nullptr;
+ module_options.numBoundValues = 0;
+
+ OptixPipelineCompileOptions pipeline_options = {};
+ /* Default to no motion blur and two-level graph, since it is the fastest option. */
+ pipeline_options.usesMotionBlur = false;
+ pipeline_options.traversableGraphFlags =
+ OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
+ pipeline_options.numPayloadValues = 6;
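+  /* Payload values are the 32-bit registers shared between an optixTrace() call and the
+   * programs it invokes; the kernels here declare six of them. */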
+ pipeline_options.numAttributeValues = 2; /* u, v */
+ pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
+ pipeline_options.pipelineLaunchParamsVariableName = "__params"; /* See globals.h */
+
+ pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
+ if (kernel_features & KERNEL_FEATURE_HAIR) {
+ if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+ pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
+ }
+    else {
+      pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
+    }
+ }
+
+  /* Keep track of whether motion blur is enabled, so as to enable/disable motion in BVH builds.
+   * This is necessary since objects may be reported to have motion if the Vector pass is
+   * active, but may still need to be rendered without motion blur if that isn't active as well. */
+ motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0;
+
+ if (motion_blur) {
+ pipeline_options.usesMotionBlur = true;
+ /* Motion blur can insert motion transforms into the traversal graph.
+ * It is no longer a two-level graph then, so need to set flags to allow any configuration. */
+ pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
+ }
+
+ { /* Load and compile PTX module with OptiX kernels. */
+ string ptx_data, ptx_filename = path_get((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
+ "lib/kernel_optix_shader_raytrace.ptx" :
+ "lib/kernel_optix.ptx");
+ if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
+ if (!getenv("OPTIX_ROOT_DIR")) {
+      set_error(
+          "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
+          "the OptiX SDK to be able to compile OptiX kernels on demand).");
+ return false;
+ }
+ ptx_filename = compile_kernel(
+ kernel_features,
+ (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? "kernel_shader_raytrace" : "kernel",
+ "optix",
+ true);
+ }
+ if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
+ set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
+ return false;
+ }
+
+ const OptixResult result = optixModuleCreateFromPTX(context,
+ &module_options,
+ &pipeline_options,
+ ptx_data.data(),
+ ptx_data.size(),
+ nullptr,
+ 0,
+ &optix_module);
+ if (result != OPTIX_SUCCESS) {
+ set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)",
+ ptx_filename.c_str(),
+ optixGetErrorName(result)));
+ return false;
+ }
+ }
+
+ /* Create program groups. */
+ OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_closest";
+ group_descs[PG_RGEN_INTERSECT_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_shadow";
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_subsurface";
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_volume_stack";
+ group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
+ group_descs[PG_MISS].miss.module = optix_module;
+ group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
+ group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
+ group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
+ group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
+
+ if (kernel_features & KERNEL_FEATURE_HAIR) {
+ if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+ /* Built-in thick curve intersection. */
+ OptixBuiltinISOptions builtin_options = {};
+ builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+ builtin_options.usesMotionBlur = false;
+
+ optix_assert(optixBuiltinISModuleGet(
+ context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
+
+ group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
+ group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
+
+ if (motion_blur) {
+ builtin_options.usesMotionBlur = true;
+
+ optix_assert(optixBuiltinISModuleGet(
+ context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
+
+ group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
+ group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
+ group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
+ group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
+ }
+ }
+ else {
+ /* Custom ribbon intersection. */
+ group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ }
+ }
+
+ if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) {
+ /* Add hit group for local intersections. */
+ group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
+ }
+
+ /* Shader raytracing replaces some functions with direct callables. */
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module;
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_shade_surface_raytrace";
+ group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
+ group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
+ "__direct_callable__svm_node_bevel";
+ group_descs[PG_CALL_AO_PASS].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_AO_PASS].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_AO_PASS].callables.entryFunctionNameDC = "__direct_callable__ao_pass";
+ }
+
+ optix_assert(optixProgramGroupCreate(
+ context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
+
+ /* Get program stack sizes. */
+ OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
+ /* Set up SBT, which in this case is used only to select between different programs. */
+ sbt_data.alloc(NUM_PROGRAM_GROUPS);
+ memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+ optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
+ }
+ sbt_data.copy_to_device(); /* Upload SBT to device. */
+
+ /* Calculate maximum trace continuation stack size. */
+ unsigned int trace_css = stack_size[PG_HITD].cssCH;
+ /* This is based on the maximum of closest-hit and any-hit/intersection programs. */
+ trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
+ trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
+ trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+ trace_css = std::max(trace_css,
+ stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
+ trace_css = std::max(trace_css,
+ stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
+
+ OptixPipelineLinkOptions link_options = {};
+ link_options.maxTraceDepth = 1;
+
+ if (DebugFlags().optix.use_debug) {
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+ }
+ else {
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+ }
+
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ /* Create shader raytracing pipeline. */
+ vector<OptixProgramGroup> pipeline_groups;
+ pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+ pipeline_groups.push_back(groups[PG_MISS]);
+ pipeline_groups.push_back(groups[PG_HITD]);
+ pipeline_groups.push_back(groups[PG_HITS]);
+ pipeline_groups.push_back(groups[PG_HITL]);
+ if (motion_blur) {
+ pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+ pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+ }
+ pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
+ pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
+
+ optix_assert(optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups.data(),
+ pipeline_groups.size(),
+ nullptr,
+ 0,
+ &pipelines[PIP_SHADE_RAYTRACE]));
+
+ /* Combine ray generation and trace continuation stack size. */
+ const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG +
+ link_options.maxTraceDepth * trace_css;
+ const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
+ stack_size[PG_CALL_SVM_BEVEL].dssDC);
+
+ /* Set stack size depending on pipeline options. */
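+    /* The final argument is the maximum traversable graph depth: two levels (instance AS ->
+     * geometry AS) by default, or three when motion transforms sit between the two. */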
+ optix_assert(optixPipelineSetStackSize(
+ pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2));
+ }
+
+ { /* Create intersection-only pipeline. */
+ vector<OptixProgramGroup> pipeline_groups;
+ pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
+ pipeline_groups.push_back(groups[PG_MISS]);
+ pipeline_groups.push_back(groups[PG_HITD]);
+ pipeline_groups.push_back(groups[PG_HITS]);
+ pipeline_groups.push_back(groups[PG_HITL]);
+ if (motion_blur) {
+ pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+ pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+ }
+
+ optix_assert(optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups.data(),
+ pipeline_groups.size(),
+ nullptr,
+ 0,
+ &pipelines[PIP_INTERSECT]));
+
+ /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
+ const unsigned int css =
+ std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
+ std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
+ std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
+ stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
+ link_options.maxTraceDepth * trace_css;
+
+ optix_assert(
+ optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2));
+ }
+
+ /* Clean up program group objects. */
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optixProgramGroupDestroy(groups[i]);
+ }
+
+ return true;
+}
+
+/* --------------------------------------------------------------------
+ * Buffer denoising.
+ */
+
+class OptiXDevice::DenoiseContext {
+ public:
+ explicit DenoiseContext(OptiXDevice *device, const DeviceDenoiseTask &task)
+ : denoise_params(task.params),
+ render_buffers(task.render_buffers),
+ buffer_params(task.buffer_params),
+ guiding_buffer(device, "denoiser guiding passes buffer"),
+ num_samples(task.num_samples)
+ {
+ num_input_passes = 1;
+ if (denoise_params.use_pass_albedo) {
+ num_input_passes += 1;
+ use_pass_albedo = true;
+ pass_denoising_albedo = buffer_params.get_pass_offset(PASS_DENOISING_ALBEDO);
+ if (denoise_params.use_pass_normal) {
+ num_input_passes += 1;
+ use_pass_normal = true;
+ pass_denoising_normal = buffer_params.get_pass_offset(PASS_DENOISING_NORMAL);
+ }
+ }
+
+ const int num_guiding_passes = num_input_passes - 1;
+
+ if (num_guiding_passes) {
+ if (task.allow_inplace_modification) {
+ guiding_params.device_pointer = render_buffers->buffer.device_pointer;
+
+ guiding_params.pass_albedo = pass_denoising_albedo;
+ guiding_params.pass_normal = pass_denoising_normal;
+
+ guiding_params.stride = buffer_params.stride;
+ guiding_params.pass_stride = buffer_params.pass_stride;
+ }
+ else {
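+        /* Pack only the enabled guiding passes into a separate buffer: each enabled pass
+         * takes three float channels per pixel, so pass_stride ends up as the total channel
+         * count per pixel. */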
+ guiding_params.pass_stride = 0;
+ if (use_pass_albedo) {
+ guiding_params.pass_albedo = guiding_params.pass_stride;
+ guiding_params.pass_stride += 3;
+ }
+ if (use_pass_normal) {
+ guiding_params.pass_normal = guiding_params.pass_stride;
+ guiding_params.pass_stride += 3;
+ }
+
+ guiding_params.stride = buffer_params.width;
+
+ guiding_buffer.alloc_to_device(buffer_params.width * buffer_params.height *
+ guiding_params.pass_stride);
+ guiding_params.device_pointer = guiding_buffer.device_pointer;
+ }
+ }
+
+ pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT);
+ }
+
+ const DenoiseParams &denoise_params;
+
+ RenderBuffers *render_buffers = nullptr;
+ const BufferParams &buffer_params;
+
+ /* Device-side storage of the guiding passes. */
+ device_only_memory<float> guiding_buffer;
+
+ struct {
+ device_ptr device_pointer = 0;
+
+    /* NOTE: These are only initialized when the corresponding guiding pass is enabled. */
+ int pass_albedo = PASS_UNUSED;
+ int pass_normal = PASS_UNUSED;
+
+ int stride = -1;
+ int pass_stride = -1;
+ } guiding_params;
+
+ /* Number of input passes. Including the color and extra auxiliary passes. */
+ int num_input_passes = 0;
+ bool use_pass_albedo = false;
+ bool use_pass_normal = false;
+
+ int num_samples = 0;
+
+ int pass_sample_count = PASS_UNUSED;
+
+  /* NOTE: These are only initialized when the corresponding guiding pass is enabled. */
+ int pass_denoising_albedo = PASS_UNUSED;
+ int pass_denoising_normal = PASS_UNUSED;
+
+  /* For passes which don't need the albedo channel for denoising we replace the actual albedo
+   * with (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with
+   * the fake values, so denoising of passes which do need the real albedo can no longer happen. */
+ bool albedo_replaced_with_fake = false;
+};
+
+class OptiXDevice::DenoisePass {
+ public:
+ DenoisePass(const PassType type, const BufferParams &buffer_params) : type(type)
+ {
+ noisy_offset = buffer_params.get_pass_offset(type, PassMode::NOISY);
+ denoised_offset = buffer_params.get_pass_offset(type, PassMode::DENOISED);
+
+ const PassInfo pass_info = Pass::get_info(type);
+ num_components = pass_info.num_components;
+ use_compositing = pass_info.use_compositing;
+ use_denoising_albedo = pass_info.use_denoising_albedo;
+ }
+
+ PassType type;
+
+ int noisy_offset;
+ int denoised_offset;
+
+ int num_components;
+ bool use_compositing;
+ bool use_denoising_albedo;
+};
+
+bool OptiXDevice::denoise_buffer(const DeviceDenoiseTask &task)
+{
+ const CUDAContextScope scope(this);
+
+ DenoiseContext context(this, task);
+
+ if (!denoise_ensure(context)) {
+ return false;
+ }
+
+ if (!denoise_filter_guiding_preprocess(context)) {
+ LOG(ERROR) << "Error preprocessing guiding passes.";
+ return false;
+ }
+
+ /* Passes which will use real albedo when it is available. */
+ denoise_pass(context, PASS_COMBINED);
+ denoise_pass(context, PASS_SHADOW_CATCHER_MATTE);
+
+  /* Passes which do not need albedo: if the real albedo is present, it has to be replaced with
+   * the fake one before these are denoised, which is why they are handled last. */
+ denoise_pass(context, PASS_SHADOW_CATCHER);
+
+ return true;
+}
+
+DeviceQueue *OptiXDevice::get_denoise_queue()
+{
+ return &denoiser_.queue;
+}
+
+bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
+ const_cast<int *>(&context.guiding_params.pass_stride),
+ const_cast<int *>(&context.guiding_params.pass_albedo),
+ const_cast<int *>(&context.guiding_params.pass_normal),
+ &context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&context.pass_sample_count),
+ const_cast<int *>(&context.pass_denoising_albedo),
+ const_cast<int *>(&context.pass_denoising_normal),
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&context.num_samples)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_filter_guiding_set_fake_albedo(DenoiseContext &context)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
+ const_cast<int *>(&context.guiding_params.pass_stride),
+ const_cast<int *>(&context.guiding_params.pass_albedo),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args);
+}
+
+void OptiXDevice::denoise_pass(DenoiseContext &context, PassType pass_type)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const DenoisePass pass(pass_type, buffer_params);
+
+ if (pass.noisy_offset == PASS_UNUSED) {
+ return;
+ }
+ if (pass.denoised_offset == PASS_UNUSED) {
+ LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type);
+ return;
+ }
+
+ if (pass.use_denoising_albedo) {
+ if (context.albedo_replaced_with_fake) {
+ LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set.";
+ return;
+ }
+ }
+ else if (!context.albedo_replaced_with_fake) {
+ context.albedo_replaced_with_fake = true;
+ if (!denoise_filter_guiding_set_fake_albedo(context)) {
+ LOG(ERROR) << "Error replacing real albedo with the fake one.";
+ return;
+ }
+ }
+
+ /* Read and preprocess noisy color input pass. */
+ denoise_color_read(context, pass);
+ if (!denoise_filter_color_preprocess(context, pass)) {
+    LOG(ERROR) << "Error converting denoising passes to RGB buffer.";
+ return;
+ }
+
+ if (!denoise_run(context, pass)) {
+ LOG(ERROR) << "Error running OptiX denoiser.";
+ return;
+ }
+
+ /* Store result in the combined pass of the render buffer.
+ *
+   * This will scale the denoiser result up to match the (possibly per-pixel) number of samples. */
+ if (!denoise_filter_color_postprocess(context, pass)) {
+ LOG(ERROR) << "Error copying denoiser result to the denoised pass.";
+ return;
+ }
+
+ denoiser_.queue.synchronize();
+}
+
+void OptiXDevice::denoise_color_read(DenoiseContext &context, const DenoisePass &pass)
+{
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = pass.type;
+ pass_access_info.mode = PassMode::NOISY;
+ pass_access_info.offset = pass.noisy_offset;
+
+  /* The denoiser operates on the passes which are used to calculate the shadow catcher
+   * approximation, and is never run on the approximated result itself. The latter is not even
+   * possible, because OptiX does not support denoising of semi-transparent pixels. */
+ pass_access_info.use_approximate_shadow_catcher = false;
+ pass_access_info.use_approximate_shadow_catcher_background = false;
+ pass_access_info.show_active_pixels = false;
+
+  /* TODO(sergey): Consider adding support for actual exposure, to avoid clamping in extreme
+   * cases. */
+ const PassAccessorGPU pass_accessor(
+ &denoiser_.queue, pass_access_info, 1.0f, context.num_samples);
+
+ PassAccessor::Destination destination(pass_access_info.type);
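+  /* NOTE: The noisy color input is written into the denoised pass slot here; the denoiser
+   * then runs in place on that same slot (see denoise_run()). */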
+ destination.d_pixels = context.render_buffers->buffer.device_pointer +
+ pass.denoised_offset * sizeof(float);
+ destination.num_components = 3;
+ destination.pixel_stride = context.buffer_params.pass_stride;
+
+ pass_accessor.get_render_tile_pixels(context.render_buffers, context.buffer_params, destination);
+}
+
+bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {&context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&pass.denoised_offset)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_filter_color_postprocess(DenoiseContext &context,
+ const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {&context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&context.num_samples),
+ const_cast<int *>(&pass.noisy_offset),
+ const_cast<int *>(&pass.denoised_offset),
+ const_cast<int *>(&context.pass_sample_count),
+ const_cast<int *>(&pass.num_components),
+ const_cast<bool *>(&pass.use_compositing)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_ensure(DenoiseContext &context)
+{
+ if (!denoise_create_if_needed(context)) {
+ LOG(ERROR) << "OptiX denoiser creation has failed.";
+ return false;
+ }
+
+ if (!denoise_configure_if_needed(context)) {
+ LOG(ERROR) << "OptiX denoiser configuration has failed.";
+ return false;
+ }
+
+ return true;
+}
+
+bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context)
+{
+ const bool recreate_denoiser = (denoiser_.optix_denoiser == nullptr) ||
+ (denoiser_.use_pass_albedo != context.use_pass_albedo) ||
+ (denoiser_.use_pass_normal != context.use_pass_normal);
+ if (!recreate_denoiser) {
+ return true;
+ }
+
+ /* Destroy existing handle before creating new one. */
+ if (denoiser_.optix_denoiser) {
+ optixDenoiserDestroy(denoiser_.optix_denoiser);
+ }
+
+ /* Create OptiX denoiser handle on demand when it is first used. */
+ OptixDenoiserOptions denoiser_options = {};
+ denoiser_options.guideAlbedo = context.use_pass_albedo;
+ denoiser_options.guideNormal = context.use_pass_normal;
+ const OptixResult result = optixDenoiserCreate(
+ this->context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser_.optix_denoiser);
+
+ if (result != OPTIX_SUCCESS) {
+ set_error("Failed to create OptiX denoiser");
+ return false;
+ }
+
+ /* OptiX denoiser handle was created with the requested number of input passes. */
+ denoiser_.use_pass_albedo = context.use_pass_albedo;
+ denoiser_.use_pass_normal = context.use_pass_normal;
+
+ /* OptiX denoiser has been created, but it needs configuration. */
+ denoiser_.is_configured = false;
+
+ return true;
+}
+
+bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context)
+{
+ if (denoiser_.is_configured && (denoiser_.configured_size.x == context.buffer_params.width &&
+ denoiser_.configured_size.y == context.buffer_params.height)) {
+ return true;
+ }
+
+ const BufferParams &buffer_params = context.buffer_params;
+
+ OptixDenoiserSizes sizes = {};
+ optix_assert(optixDenoiserComputeMemoryResources(
+ denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes));
+
+ denoiser_.scratch_size = sizes.withOverlapScratchSizeInBytes;
+ denoiser_.scratch_offset = sizes.stateSizeInBytes;
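+  /* The denoiser state and scratch area share a single device allocation: the state comes
+   * first (scratch_offset bytes), immediately followed by the scratch space. */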
+
+ /* Allocate denoiser state if tile size has changed since last setup. */
+ denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size);
+
+ /* Initialize denoiser state for the current tile size. */
+ const OptixResult result = optixDenoiserSetup(denoiser_.optix_denoiser,
+ denoiser_.queue.stream(),
+ buffer_params.width,
+ buffer_params.height,
+ denoiser_.state.device_pointer,
+ denoiser_.scratch_offset,
+ denoiser_.state.device_pointer +
+ denoiser_.scratch_offset,
+ denoiser_.scratch_size);
+ if (result != OPTIX_SUCCESS) {
+ set_error("Failed to set up OptiX denoiser");
+ return false;
+ }
+
+ denoiser_.is_configured = true;
+ denoiser_.configured_size.x = buffer_params.width;
+ denoiser_.configured_size.y = buffer_params.height;
+
+ return true;
+}
+
+bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+ const int width = buffer_params.width;
+ const int height = buffer_params.height;
+
+ /* Set up input and output layer information. */
+ OptixImage2D color_layer = {0};
+ OptixImage2D albedo_layer = {0};
+ OptixImage2D normal_layer = {0};
+
+ OptixImage2D output_layer = {0};
+
+ /* Color pass. */
+ {
+ const int pass_denoised = pass.denoised_offset;
+ const int64_t pass_stride_in_bytes = context.buffer_params.pass_stride * sizeof(float);
+
+ color_layer.data = context.render_buffers->buffer.device_pointer +
+ pass_denoised * sizeof(float);
+ color_layer.width = width;
+ color_layer.height = height;
+ color_layer.rowStrideInBytes = pass_stride_in_bytes * context.buffer_params.stride;
+ color_layer.pixelStrideInBytes = pass_stride_in_bytes;
+ color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+
+ device_vector<float> fake_albedo(this, "fake_albedo", MEM_READ_WRITE);
+
+  /* Optional albedo and normal guiding passes. */
+ if (context.num_input_passes > 1) {
+ const device_ptr d_guiding_buffer = context.guiding_params.device_pointer;
+ const int64_t pixel_stride_in_bytes = context.guiding_params.pass_stride * sizeof(float);
+ const int64_t row_stride_in_bytes = context.guiding_params.stride * pixel_stride_in_bytes;
+
+ if (context.use_pass_albedo) {
+ albedo_layer.data = d_guiding_buffer + context.guiding_params.pass_albedo * sizeof(float);
+ albedo_layer.width = width;
+ albedo_layer.height = height;
+ albedo_layer.rowStrideInBytes = row_stride_in_bytes;
+ albedo_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+ albedo_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+
+ if (context.use_pass_normal) {
+ normal_layer.data = d_guiding_buffer + context.guiding_params.pass_normal * sizeof(float);
+ normal_layer.width = width;
+ normal_layer.height = height;
+ normal_layer.rowStrideInBytes = row_stride_in_bytes;
+ normal_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+ normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+ }
+
+ /* Denoise in-place of the noisy input in the render buffers. */
+ output_layer = color_layer;
+
+ /* Finally run denoising. */
+ OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */
+ OptixDenoiserLayer image_layers = {};
+ image_layers.input = color_layer;
+ image_layers.output = output_layer;
+
+ OptixDenoiserGuideLayer guide_layers = {};
+ guide_layers.albedo = albedo_layer;
+ guide_layers.normal = normal_layer;
+
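+  /* In the optixDenoiserInvoke() call below, the literal arguments are: a single denoiser
+   * layer, and zero X/Y input offsets, since the whole buffer is denoised rather than a
+   * tile of a larger image. */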
+ optix_assert(optixDenoiserInvoke(denoiser_.optix_denoiser,
+ denoiser_.queue.stream(),
+ &params,
+ denoiser_.state.device_pointer,
+ denoiser_.scratch_offset,
+ &guide_layers,
+ &image_layers,
+ 1,
+ 0,
+ 0,
+ denoiser_.state.device_pointer + denoiser_.scratch_offset,
+ denoiser_.scratch_size));
+
+ return true;
+}
+
+bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
+ OptixBuildOperation operation,
+ const OptixBuildInput &build_input,
+ uint16_t num_motion_steps)
+{
+ const CUDAContextScope scope(this);
+
+ const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
+
+ /* Compute memory usage. */
+ OptixAccelBufferSizes sizes = {};
+ OptixAccelBuildOptions options = {};
+ options.operation = operation;
+ if (use_fast_trace_bvh) {
+    VLOG(2) << "Using fast-to-trace OptiX BVH";
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
+ }
+ else {
+    VLOG(2) << "Using fast-to-update OptiX BVH";
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
+ }
+
+ options.motionOptions.numKeys = num_motion_steps;
+ options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
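+  /* The vanish flags hide primitives for ray times outside the [timeBegin, timeEnd]
+   * interval, rather than clamping their motion to the first/last key. */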
+ options.motionOptions.timeBegin = 0.0f;
+ options.motionOptions.timeEnd = 1.0f;
+
+ optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
+
+ /* Allocate required output buffers. */
+ device_only_memory<char> temp_mem(this, "optix temp as build mem");
+ temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
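+  /* The extra aligned 8 bytes at the end of the temporary buffer are reserved for the
+   * emitted compacted-size property (see compacted_size_prop below). */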
+ if (!temp_mem.device_pointer) {
+ /* Make sure temporary memory allocation succeeded. */
+ return false;
+ }
+
+ device_only_memory<char> &out_data = bvh->as_data;
+ if (operation == OPTIX_BUILD_OPERATION_BUILD) {
+ assert(out_data.device == this);
+ out_data.alloc_to_device(sizes.outputSizeInBytes);
+ if (!out_data.device_pointer) {
+ return false;
+ }
+ }
+ else {
+ assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
+ }
+
+ /* Finally build the acceleration structure. */
+ OptixAccelEmitDesc compacted_size_prop = {};
+ compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
+ /* A tiny space was allocated for this property at the end of the temporary buffer above.
+ * Make sure this pointer is 8-byte aligned. */
+ compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
+
+ OptixTraversableHandle out_handle = 0;
+ optix_assert(optixAccelBuild(context,
+ NULL,
+ &options,
+ &build_input,
+ 1,
+ temp_mem.device_pointer,
+ sizes.tempSizeInBytes,
+ out_data.device_pointer,
+ sizes.outputSizeInBytes,
+ &out_handle,
+ use_fast_trace_bvh ? &compacted_size_prop : NULL,
+ use_fast_trace_bvh ? 1 : 0));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
+
+ /* Wait for all operations to finish. */
+ cuda_assert(cuStreamSynchronize(NULL));
+
+ /* Compact acceleration structure to save memory (do not do this in viewport for faster builds).
+ */
+ if (use_fast_trace_bvh) {
+ uint64_t compacted_size = sizes.outputSizeInBytes;
+ cuda_assert(cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
+
+ /* Temporary memory is no longer needed, so free it now to make space. */
+ temp_mem.free();
+
+ /* There is no point compacting if the size does not change. */
+ if (compacted_size < sizes.outputSizeInBytes) {
+ device_only_memory<char> compacted_data(this, "optix compacted as");
+ compacted_data.alloc_to_device(compacted_size);
+      if (!compacted_data.device_pointer) {
+        /* Do not compact if memory allocation for compacted acceleration structure fails.
+         * Can just use the uncompacted one then, so succeed here regardless. */
+        return !have_error();
+      }
+
+ optix_assert(optixAccelCompact(
+ context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
+
+ /* Wait for compaction to finish. */
+ cuda_assert(cuStreamSynchronize(NULL));
+
+ std::swap(out_data.device_size, compacted_data.device_size);
+ std::swap(out_data.device_pointer, compacted_data.device_pointer);
+ }
+ }
+
+ return !have_error();
+}
+
+void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+ const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
+
+ free_bvh_memory_delayed();
+
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
+
+ progress.set_substatus("Building OptiX acceleration structure");
+
+ if (!bvh->params.top_level) {
+ assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
+
+ /* Refit is only possible in viewport for now (because AS is built with
+ * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */
+ OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
+ if (refit && !use_fast_trace_bvh) {
+ assert(bvh_optix->traversable_handle != 0);
+ operation = OPTIX_BUILD_OPERATION_UPDATE;
+ }
+ else {
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ }
+
+ /* Build bottom level acceleration structures (BLAS). */
+ Geometry *const geom = bvh->geometry[0];
+ if (geom->geometry_type == Geometry::HAIR) {
+ /* Build BLAS for curve primitives. */
+ Hair *const hair = static_cast<Hair *const>(geom);
+ if (hair->num_curves() == 0) {
+ return;
+ }
+
+ const size_t num_segments = hair->num_segments();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
+ num_motion_steps = hair->get_motion_steps();
+ }
+
+ device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
+ /* Four control points for each curve segment. */
+ const size_t num_vertices = num_segments * 4;
+ if (hair->curve_shape == CURVE_THICK) {
+ index_data.alloc(num_segments);
+ vertex_data.alloc(num_vertices * num_motion_steps);
+ }
+      else {
+        aabb_data.alloc(num_segments * num_motion_steps);
+      }
+
+ /* Get AABBs for each motion step. */
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ /* The center step for motion vertices is not stored in the attribute. */
+ const float3 *keys = hair->get_curve_keys().data();
+ size_t center_step = (num_motion_steps - 1) / 2;
+ if (step != center_step) {
+ size_t attr_offset = (step > center_step) ? step - 1 : step;
+ /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). */
+ keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
+ }
+
+ for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
+ const Hair::Curve curve = hair->get_curve(j);
+ const array<float> &curve_radius = hair->get_curve_radius();
+
+ for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
+ if (hair->curve_shape == CURVE_THICK) {
+ int k0 = curve.first_key + segment;
+ int k1 = k0 + 1;
+ int ka = max(k0 - 1, curve.first_key);
+ int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
+
+ const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
+ const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
+ const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
+ const float4 pw = make_float4(
+ curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
+
+          /* Convert the Catmull-Rom curve data to cubic B-spline control points, which is the
+           * basis OptiX's built-in curve primitive expects. Each cr2bsp row below is one row
+           * of the 4x4 basis-change matrix, applied per coordinate via dot products. */
+ static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
+ static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
+ static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
+ static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
+
+ index_data[i] = i * 4;
+ float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
+ v[0] = make_float4(
+ dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
+ v[1] = make_float4(
+ dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
+ v[2] = make_float4(
+ dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
+ v[3] = make_float4(
+ dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
+ }
+ else {
+ BoundBox bounds = BoundBox::empty;
+ curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
+
+ const size_t index = step * num_segments + i;
+ aabb_data[index].minX = bounds.min.x;
+ aabb_data[index].minY = bounds.min.y;
+ aabb_data[index].minZ = bounds.min.z;
+ aabb_data[index].maxX = bounds.max.x;
+ aabb_data[index].maxY = bounds.max.y;
+ aabb_data[index].maxZ = bounds.max.z;
+ }
+ }
+ }
+ }
+
+ /* Upload AABB data to GPU. */
+ aabb_data.copy_to_device();
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+
+ vector<device_ptr> aabb_ptrs;
+ aabb_ptrs.reserve(num_motion_steps);
+ vector<device_ptr> width_ptrs;
+ vector<device_ptr> vertex_ptrs;
+ width_ptrs.reserve(num_motion_steps);
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
+ const device_ptr base_ptr = vertex_data.device_pointer +
+ step * num_vertices * sizeof(float4);
+        width_ptrs.push_back(base_ptr + 3 * sizeof(float)); /* Offset to the radius in w. */
+ vertex_ptrs.push_back(base_ptr);
+ }
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly. */
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+ OptixBuildInput build_input = {};
+ if (hair->curve_shape == CURVE_THICK) {
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
+ build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+ build_input.curveArray.numPrimitives = num_segments;
+ build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.curveArray.numVertices = num_vertices;
+ build_input.curveArray.vertexStrideInBytes = sizeof(float4);
+ build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
+ build_input.curveArray.widthStrideInBytes = sizeof(float4);
+ build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
+ build_input.curveArray.indexStrideInBytes = sizeof(int);
+ build_input.curveArray.flag = build_flags;
+ build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
+ }
+ else {
+      /* Disable the visibility test any-hit program, since visibility is already checked
+       * during intersection. Trace calls that require any-hit can force it with a ray flag. */
+ build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
+ build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+ build_input.customPrimitiveArray.numPrimitives = num_segments;
+ build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
+ build_input.customPrimitiveArray.flags = &build_flags;
+ build_input.customPrimitiveArray.numSbtRecords = 1;
+ build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
+ }
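+
+      /* To recap the two paths above: thick curves use the OptiX built-in round B-spline
+       * primitive, while other curve shapes fall back to custom primitives, with one AABB
+       * per segment and Cycles' own intersection program. */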
+
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ }
+ else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
+ /* Build BLAS for triangle primitives. */
+ Mesh *const mesh = static_cast<Mesh *const>(geom);
+ if (mesh->num_triangles() == 0) {
+ return;
+ }
+
+ const size_t num_verts = mesh->get_verts().size();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
+ num_motion_steps = mesh->get_motion_steps();
+ }
+
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ index_data.alloc(mesh->get_triangles().size());
+ memcpy(index_data.data(),
+ mesh->get_triangles().data(),
+ mesh->get_triangles().size() * sizeof(int));
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
+ vertex_data.alloc(num_verts * num_motion_steps);
+
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ const float3 *verts = mesh->get_verts().data();
+
+ size_t center_step = (num_motion_steps - 1) / 2;
+ /* The center step for motion vertices is not stored in the attribute. */
+ if (step != center_step) {
+ verts = motion_keys->data_float3() + (step > center_step ? step - 1 : step) * num_verts;
+ }
+
+ memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
+ }
+
+ /* Upload triangle data to GPU. */
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+
+ vector<device_ptr> vertex_ptrs;
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
+ }
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly. */
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
+ build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.triangleArray.numVertices = num_verts;
+ build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
+ build_input.triangleArray.vertexStrideInBytes = sizeof(float4);
+ build_input.triangleArray.indexBuffer = index_data.device_pointer;
+ build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
+ build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
+ build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
+ build_input.triangleArray.flags = &build_flags;
+    /* The SBT does not store per-primitive data, since Cycles already allocates separate
+     * buffers for that purpose. OptiX does not allow this to be zero though, so just pass
+     * in one and rely on that having the same meaning in this case. */
+ build_input.triangleArray.numSbtRecords = 1;
+ build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
+
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ }
+ }
+ else {
+ unsigned int num_instances = 0;
+ unsigned int max_num_instances = 0xFFFFFFFF;
+
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ bvh_optix->motion_transform_data.free();
+
+ optixDeviceContextGetProperty(context,
+ OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
+ &max_num_instances,
+ sizeof(max_num_instances));
+    /* Do not count the first bit, which is used to distinguish instanced and
+     * non-instanced objects. */
+ max_num_instances >>= 1;
+ if (bvh->objects.size() > max_num_instances) {
+ progress.set_error(
+ "Failed to build OptiX acceleration structure because there are too many instances");
+ return;
+ }
+
+ /* Fill instance descriptions. */
+ device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
+ instances.alloc(bvh->objects.size());
+
+    /* Calculate the total motion transform size and allocate memory for the transforms. */
+ size_t motion_transform_offset = 0;
+ if (motion_blur) {
+ size_t total_motion_transform_size = 0;
+ for (Object *const ob : bvh->objects) {
+ if (ob->is_traceable() && ob->use_motion()) {
+ total_motion_transform_size = align_up(total_motion_transform_size,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ total_motion_transform_size = total_motion_transform_size +
+ sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+ }
+ }
+
+ assert(bvh_optix->motion_transform_data.device == this);
+ bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
+ }
+
+ for (Object *ob : bvh->objects) {
+ /* Skip non-traceable objects. */
+ if (!ob->is_traceable()) {
+ continue;
+ }
+
+ BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
+ OptixTraversableHandle handle = blas->traversable_handle;
+
+ OptixInstance &instance = instances[num_instances++];
+ memset(&instance, 0, sizeof(instance));
+
+ /* Clear transform to identity matrix. */
+ instance.transform[0] = 1.0f;
+ instance.transform[5] = 1.0f;
+ instance.transform[10] = 1.0f;
+
+      /* Set user instance ID to object index (but leave the low bit clear). */
+ instance.instanceId = ob->get_device_index() << 1;
+
+      /* At least one bit must be set in the mask, or else the instance would always be culled. */
+ instance.visibilityMask = 1;
+
+ if (ob->get_geometry()->has_volume) {
+        /* Volumes have a special bit set in the visibility mask so a trace can mask only
+         * volumes. */
+ instance.visibilityMask |= 2;
+ }
+
+ if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
+ /* Same applies to curves (so they can be skipped in local trace calls). */
+ instance.visibilityMask |= 4;
+
+ if (motion_blur && ob->get_geometry()->has_motion_blur() &&
+ static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
+ /* Select between motion blur and non-motion blur built-in intersection module. */
+ instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
+ }
+ }
+
+ /* Insert motion traversable if object has motion. */
+ if (motion_blur && ob->use_motion()) {
+ size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+
+ const CUDAContextScope scope(this);
+
+ motion_transform_offset = align_up(motion_transform_offset,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
+ motion_transform_offset;
+ motion_transform_offset += motion_transform_size;
+
+        /* Allocate host-side memory for the motion transform and fill it with transform data. */
+ OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
+ new uint8_t[motion_transform_size]);
+ motion_transform.child = handle;
+ motion_transform.motionOptions.numKeys = ob->get_motion().size();
+ motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
+ motion_transform.motionOptions.timeBegin = 0.0f;
+ motion_transform.motionOptions.timeEnd = 1.0f;
+
+ OptixSRTData *const srt_data = motion_transform.srtData;
+ array<DecomposedTransform> decomp(ob->get_motion().size());
+ transform_motion_decompose(
+ decomp.data(), ob->get_motion().data(), ob->get_motion().size());
+
+ for (size_t i = 0; i < ob->get_motion().size(); ++i) {
+ /* Scale. */
+ srt_data[i].sx = decomp[i].y.w; /* scale.x.x */
+ srt_data[i].sy = decomp[i].z.w; /* scale.y.y */
+ srt_data[i].sz = decomp[i].w.w; /* scale.z.z */
+
+ /* Shear. */
+ srt_data[i].a = decomp[i].z.x; /* scale.x.y */
+ srt_data[i].b = decomp[i].z.y; /* scale.x.z */
+ srt_data[i].c = decomp[i].w.x; /* scale.y.z */
+ assert(decomp[i].z.z == 0.0f); /* scale.y.x */
+ assert(decomp[i].w.y == 0.0f); /* scale.z.x */
+ assert(decomp[i].w.z == 0.0f); /* scale.z.y */
+
+ /* Pivot point. */
+ srt_data[i].pvx = 0.0f;
+ srt_data[i].pvy = 0.0f;
+ srt_data[i].pvz = 0.0f;
+
+ /* Rotation. */
+ srt_data[i].qx = decomp[i].x.x;
+ srt_data[i].qy = decomp[i].x.y;
+ srt_data[i].qz = decomp[i].x.z;
+ srt_data[i].qw = decomp[i].x.w;
+
+ /* Translation. */
+ srt_data[i].tx = decomp[i].y.x;
+ srt_data[i].ty = decomp[i].y.y;
+ srt_data[i].tz = decomp[i].y.z;
+ }
+
+ /* Upload motion transform to GPU. */
+ cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+ delete[] reinterpret_cast<uint8_t *>(&motion_transform);
+
+        /* Disable the instance transform, since the object already uses a motion transform. */
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+
+ /* Get traversable handle to motion transform. */
+ optixConvertPointerToTraversableHandle(context,
+ motion_transform_gpu,
+ OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
+ &instance.traversableHandle);
+ }
+ else {
+ instance.traversableHandle = handle;
+
+ if (ob->get_geometry()->is_instanced()) {
+ /* Set transform matrix. */
+ memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
+ }
+ else {
+ /* Disable instance transform if geometry already has it applied to vertex data. */
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+ /* Non-instanced objects read ID from 'prim_object', so distinguish
+ * them from instanced objects with the low bit set. */
+ instance.instanceId |= 1;
+ }
+ }
+ }
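+
+    /* A sketch of the resulting instance ID encoding, assuming the kernel decodes it as
+     * the inverse of the packing above:
+     *   instanceId = (object_index << 1) | is_non_instanced;
+     *   object_index = instanceId >> 1;
+     * Non-instanced objects read their object ID from `prim_object` instead. */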
+
+ /* Upload instance descriptions. */
+ instances.resize(num_instances);
+ instances.copy_to_device();
+
+    /* Build the top-level acceleration structure (TLAS). */
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
+ build_input.instanceArray.instances = instances.device_pointer;
+ build_input.instanceArray.numInstances = num_instances;
+
+ if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ tlas_handle = bvh_optix->traversable_handle;
+ }
+}
+
+void OptiXDevice::release_optix_bvh(BVH *bvh)
+{
+ thread_scoped_lock lock(delayed_free_bvh_mutex);
+  /* Do a delayed free of the BVH memory, since the geometry holding the BVH might be
+   * deleted while the GPU is still rendering. */
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
+
+ delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
+ delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
+ bvh_optix->traversable_handle = 0;
+}
+
+void OptiXDevice::free_bvh_memory_delayed()
+{
+ thread_scoped_lock lock(delayed_free_bvh_mutex);
+ delayed_free_bvh_memory.free_memory();
+}
+
+void OptiXDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ /* Set constant memory for CUDA module. */
+ CUDADevice::const_copy_to(name, host, size);
+
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+    /* Update the traversable handle (since it is different for each device in a
+     * multi-device render). */
+ KernelData *const data = (KernelData *)host;
+ *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
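+    /* The cast is needed because `data->bvh.scene` is declared in the shared kernel
+     * headers, which presumably have to compile without the OptiX SDK and so cannot use
+     * the OptixTraversableHandle type directly. */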
+
+ update_launch_params(offsetof(KernelParamsOptiX, data), host, size);
+ return;
+ }
+
+ /* Update data storage pointers in launch parameters. */
+# define KERNEL_TEX(data_type, tex_name) \
+ if (strcmp(name, #tex_name) == 0) { \
+ update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \
+ return; \
+ }
+ KERNEL_TEX(IntegratorStateGPU, __integrator_state)
+# include "kernel/kernel_textures.h"
+# undef KERNEL_TEX
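+
+  /* For illustration (using a hypothetical entry name): an entry such as
+   *   KERNEL_TEX(float4, __example_data)
+   * in kernel_textures.h expands to
+   *   if (strcmp(name, "__example_data") == 0) {
+   *     update_launch_params(offsetof(KernelParamsOptiX, __example_data), host, size);
+   *     return;
+   *   }
+   * so each named data array gets its device pointer patched into the launch parameters. */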
+}
+
+void OptiXDevice::update_launch_params(size_t offset, void *data, size_t data_size)
+{
+ const CUDAContextScope scope(this);
+
+ cuda_assert(cuMemcpyHtoD(launch_params.device_pointer + offset, data, data_size));
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
new file mode 100644
index 00000000000..91ef52e0a5a
--- /dev/null
+++ b/intern/cycles/device/optix/device_impl.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/device_impl.h"
+# include "device/optix/queue.h"
+# include "device/optix/util.h"
+# include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVHOptiX;
+struct KernelParamsOptiX;
+
+/* List of OptiX program groups. */
+enum {
+ PG_RGEN_INTERSECT_CLOSEST,
+ PG_RGEN_INTERSECT_SHADOW,
+ PG_RGEN_INTERSECT_SUBSURFACE,
+ PG_RGEN_INTERSECT_VOLUME_STACK,
+ PG_RGEN_SHADE_SURFACE_RAYTRACE,
+ PG_MISS,
+ PG_HITD, /* Default hit group. */
+ PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
+ PG_HITL, /* __BVH_LOCAL__ hit group (only used for triangles). */
+ PG_HITD_MOTION,
+ PG_HITS_MOTION,
+ PG_CALL_SVM_AO,
+ PG_CALL_SVM_BEVEL,
+ PG_CALL_AO_PASS,
+ NUM_PROGRAM_GROUPS
+};
+
+static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
+static const int NUM_MISS_PROGRAM_GROUPS = 1;
+static const int HIT_PROGRAM_GROUP_OFFSET = PG_HITD;
+static const int NUM_HIT_PROGRAM_GROUPS = 5;
+static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
+static const int NUM_CALLABLE_PROGRAM_GROUPS = 3;
+
+/* List of OptiX pipelines. */
+enum { PIP_SHADE_RAYTRACE, PIP_INTERSECT, NUM_PIPELINES };
+
+/* A single shader binding table entry. */
+struct SbtRecord {
+ char header[OPTIX_SBT_RECORD_HEADER_SIZE];
+};
+
+class OptiXDevice : public CUDADevice {
+ public:
+ OptixDeviceContext context = NULL;
+
+ OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */
+ OptixModule builtin_modules[2] = {};
+ OptixPipeline pipelines[NUM_PIPELINES] = {};
+
+ bool motion_blur = false;
+ device_vector<SbtRecord> sbt_data;
+ device_only_memory<KernelParamsOptiX> launch_params;
+ OptixTraversableHandle tlas_handle = 0;
+
+ vector<device_only_memory<char>> delayed_free_bvh_memory;
+ thread_mutex delayed_free_bvh_mutex;
+
+ class Denoiser {
+ public:
+ explicit Denoiser(OptiXDevice *device);
+ ~Denoiser();
+
+ OptiXDevice *device;
+ OptiXDeviceQueue queue;
+
+ OptixDenoiser optix_denoiser = nullptr;
+
+    /* Configured size, as provided to `optixDenoiserSetup`.
+     * If `optixDenoiserSetup()` was never called on the current `optix_denoiser`,
+     * `is_configured` will be false. */
+ bool is_configured = false;
+ int2 configured_size = make_int2(0, 0);
+
+    /* OptiX denoiser state and scratch buffers, stored in a single memory buffer.
+     * The memory layout is as follows: [denoiser state][scratch buffer]. */
+ device_only_memory<unsigned char> state;
+ size_t scratch_offset = 0;
+ size_t scratch_size = 0;
+
+ bool use_pass_albedo = false;
+ bool use_pass_normal = false;
+ };
+ Denoiser denoiser_;
+
+ public:
+ OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+ ~OptiXDevice();
+
+ private:
+ BVHLayoutMask get_bvh_layout_mask() const override;
+
+ string compile_kernel_get_common_cflags(const uint kernel_features) override;
+
+ bool load_kernels(const uint kernel_features) override;
+
+ bool build_optix_bvh(BVHOptiX *bvh,
+ OptixBuildOperation operation,
+ const OptixBuildInput &build_input,
+ uint16_t num_motion_steps);
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
+
+ void release_optix_bvh(BVH *bvh) override;
+ void free_bvh_memory_delayed();
+
+ void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void update_launch_params(size_t offset, void *data, size_t data_size);
+
+ virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+ /* --------------------------------------------------------------------
+ * Denoising.
+ */
+
+ class DenoiseContext;
+ class DenoisePass;
+
+ virtual bool denoise_buffer(const DeviceDenoiseTask &task) override;
+ virtual DeviceQueue *get_denoise_queue() override;
+
+  /* Read guiding passes from the render buffers, preprocess them in the way which is
+   * expected by OptiX, and store them in the guiding passes memory within the given
+   * context.
+   *
+   * Pre-processing of the guiding passes is to happen only once per context lifetime. Do
+   * not preprocess them for every pass which is being denoised. */
+ bool denoise_filter_guiding_preprocess(DenoiseContext &context);
+
+ /* Set fake albedo pixels in the albedo guiding pass storage.
+ * After this point only passes which do not need albedo for denoising can be processed. */
+ bool denoise_filter_guiding_set_fake_albedo(DenoiseContext &context);
+
+ void denoise_pass(DenoiseContext &context, PassType pass_type);
+
+ /* Read input color pass from the render buffer into the memory which corresponds to the noisy
+ * input within the given context. Pixels are scaled to the number of samples, but are not
+ * preprocessed yet. */
+ void denoise_color_read(DenoiseContext &context, const DenoisePass &pass);
+
+ /* Run corresponding filter kernels, preparing data for the denoiser or copying data from the
+ * denoiser result to the render buffer. */
+ bool denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass);
+ bool denoise_filter_color_postprocess(DenoiseContext &context, const DenoisePass &pass);
+
+ /* Make sure the OptiX denoiser is created and configured. */
+ bool denoise_ensure(DenoiseContext &context);
+
+  /* Create the OptiX denoiser descriptor if needed.
+   * Will do nothing if the current OptiX descriptor is usable for the given parameters.
+   * If the OptiX denoiser descriptor was re-allocated here, it is left unconfigured. */
+ bool denoise_create_if_needed(DenoiseContext &context);
+
+  /* Configure the existing OptiX denoiser descriptor for use with the given task. */
+ bool denoise_configure_if_needed(DenoiseContext &context);
+
+ /* Run configured denoiser. */
+ bool denoise_run(DenoiseContext &context, const DenoisePass &pass);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
new file mode 100644
index 00000000000..458ed70baa8
--- /dev/null
+++ b/intern/cycles/device/optix/queue.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "device/optix/queue.h"
+# include "device/optix/device_impl.h"
+
+# include "util/util_time.h"
+
+# undef __KERNEL_CPU__
+# define __KERNEL_OPTIX__
+# include "kernel/device/optix/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* OptiXDeviceQueue */
+
+OptiXDeviceQueue::OptiXDeviceQueue(OptiXDevice *device) : CUDADeviceQueue(device)
+{
+}
+
+void OptiXDeviceQueue::init_execution()
+{
+ CUDADeviceQueue::init_execution();
+}
+
+static bool is_optix_specific_kernel(DeviceKernel kernel)
+{
+ return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+}
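+
+/* The kernels above trace rays and therefore have to be launched through optixLaunch()
+ * in enqueue() below; every other kernel is plain CUDA and is forwarded to the CUDA
+ * queue implementation. */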
+
+bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+ if (!is_optix_specific_kernel(kernel)) {
+ return CUDADeviceQueue::enqueue(kernel, work_size, args);
+ }
+
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ debug_enqueue(kernel, work_size);
+
+ const CUDAContextScope scope(cuda_device_);
+
+ OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
+
+ const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer;
+ const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer;
+
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, path_index_array),
+ args[0], // &d_path_index
+ sizeof(device_ptr),
+ cuda_stream_));
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
+ args[1], // &d_render_buffer
+ sizeof(device_ptr),
+ cuda_stream_));
+ }
+
+ cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+
+ OptixPipeline pipeline = nullptr;
+ OptixShaderBindingTable sbt_params = {};
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SHADOW * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SUBSURFACE * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord);
+ break;
+
+ default:
+ LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel)
+ << " is attempted to be enqueued.";
+ return false;
+ }
+
+ sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord);
+ sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
+  sbt_params.missRecordCount = NUM_MISS_PROGRAM_GROUPS;
+  sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord);
+ sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
+ sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS;
+ sbt_params.callablesRecordBase = sbt_data_ptr + CALLABLE_PROGRAM_GROUPS_BASE * sizeof(SbtRecord);
+ sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS;
+ sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
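+
+  /* Resulting SBT layout, with one SbtRecord per program group in enum order:
+   *   [raygen groups][miss group][hit groups][callable groups]
+   * Only the raygen record differs per kernel; the miss, hit and callable ranges are
+   * shared by all pipelines. */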
+
+ /* Launch the ray generation program. */
+ optix_device_assert(optix_device,
+ optixLaunch(pipeline,
+ cuda_stream_,
+ launch_params_ptr,
+ optix_device->launch_params.data_elements,
+ &sbt_params,
+ work_size,
+ 1,
+ 1));
+
+ return !(optix_device->have_error());
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/device/optix/queue.h b/intern/cycles/device/optix/queue.h
new file mode 100644
index 00000000000..0de422ccc71
--- /dev/null
+++ b/intern/cycles/device/optix/queue.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/queue.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OptiXDevice;
+
+/* OptiX implementation of the device queue, built on top of the CUDA queue. */
+class OptiXDeviceQueue : public CUDADeviceQueue {
+ public:
+ OptiXDeviceQueue(OptiXDevice *device);
+
+ virtual void init_execution() override;
+
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/device/optix/util.h b/intern/cycles/device/optix/util.h
new file mode 100644
index 00000000000..34ae5bb5609
--- /dev/null
+++ b/intern/cycles/device/optix/util.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/util.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include <cuew.h>
+/* Do not use CUDA SDK headers when using CUEW. */
+# define OPTIX_DONT_INCLUDE_CUDA
+# endif
+
+# include <optix_stubs.h>
+
+/* Utility for checking return values of OptiX function calls. */
+# define optix_device_assert(optix_device, stmt) \
+ { \
+ OptixResult result = stmt; \
+ if (result != OPTIX_SUCCESS) { \
+ const char *name = optixGetErrorName(result); \
+ optix_device->set_error( \
+ string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
+ } \
+ } \
+ (void)0
+
+# define optix_assert(stmt) optix_device_assert(this, stmt)
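+
+/* Example usage (a sketch): wrap an OptiX API call so that a failure sets the device
+ * error state instead of being silently ignored:
+ *
+ *   optix_assert(optixLaunch(pipeline, stream, params_ptr, params_size, &sbt, width, 1, 1));
+ */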
+
+#endif /* WITH_OPTIX */