diff options
Diffstat (limited to 'intern/cycles/device')
65 files changed, 6970 insertions, 15812 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 928249931a3..d18f4360aef 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -36,49 +36,70 @@ endif() set(SRC device.cpp - device_cpu.cpp - device_cuda.cpp - device_denoising.cpp - device_dummy.cpp + device_denoise.cpp + device_graphics_interop.cpp + device_kernel.cpp device_memory.cpp - device_multi.cpp - device_opencl.cpp - device_optix.cpp - device_split_kernel.cpp - device_task.cpp + device_queue.cpp +) + +set(SRC_CPU + cpu/device.cpp + cpu/device.h + cpu/device_impl.cpp + cpu/device_impl.h + cpu/kernel.cpp + cpu/kernel.h + cpu/kernel_function.h + cpu/kernel_thread_globals.cpp + cpu/kernel_thread_globals.h ) set(SRC_CUDA - cuda/device_cuda.h - cuda/device_cuda_impl.cpp + cuda/device.cpp + cuda/device.h + cuda/device_impl.cpp + cuda/device_impl.h + cuda/graphics_interop.cpp + cuda/graphics_interop.h + cuda/kernel.cpp + cuda/kernel.h + cuda/queue.cpp + cuda/queue.h + cuda/util.cpp + cuda/util.h ) -set(SRC_OPENCL - opencl/device_opencl.h - opencl/device_opencl_impl.cpp - opencl/memory_manager.h - opencl/memory_manager.cpp - opencl/opencl_util.cpp +set(SRC_DUMMY + dummy/device.cpp + dummy/device.h ) -if(WITH_CYCLES_NETWORK) - list(APPEND SRC - device_network.cpp - ) -endif() +set(SRC_MULTI + multi/device.cpp + multi/device.h +) + +set(SRC_OPTIX + optix/device.cpp + optix/device.h + optix/device_impl.cpp + optix/device_impl.h + optix/queue.cpp + optix/queue.h + optix/util.h +) set(SRC_HEADERS device.h - device_denoising.h + device_denoise.h + device_graphics_interop.h device_memory.h - device_intern.h - device_network.h - device_split_kernel.h - device_task.h + device_kernel.h + device_queue.h ) set(LIB - cycles_render cycles_kernel cycles_util ${CYCLES_GL_LIBRARIES} @@ -95,15 +116,7 @@ else() endif() add_definitions(${GL_DEFINITIONS}) -if(WITH_CYCLES_NETWORK) - add_definitions(-DWITH_NETWORK) -endif() -if(WITH_CYCLES_DEVICE_OPENCL) 
- list(APPEND LIB - extern_clew - ) - add_definitions(-DWITH_OPENCL) -endif() + if(WITH_CYCLES_DEVICE_CUDA) add_definitions(-DWITH_CUDA) endif() @@ -115,18 +128,27 @@ if(WITH_CYCLES_DEVICE_MULTI) endif() if(WITH_OPENIMAGEDENOISE) - add_definitions(-DWITH_OPENIMAGEDENOISE) - add_definitions(-DOIDN_STATIC_LIB) - list(APPEND INC_SYS - ${OPENIMAGEDENOISE_INCLUDE_DIRS} - ) list(APPEND LIB ${OPENIMAGEDENOISE_LIBRARIES} - ${TBB_LIBRARIES} ) endif() include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CUDA} ${SRC_OPENCL} ${SRC_HEADERS}) +cycles_add_library(cycles_device "${LIB}" + ${SRC} + ${SRC_CPU} + ${SRC_CUDA} + ${SRC_DUMMY} + ${SRC_MULTI} + ${SRC_OPTIX} + ${SRC_HEADERS} +) + +source_group("cpu" FILES ${SRC_CPU}) +source_group("cuda" FILES ${SRC_CUDA}) +source_group("dummy" FILES ${SRC_DUMMY}) +source_group("multi" FILES ${SRC_MULTI}) +source_group("optix" FILES ${SRC_OPTIX}) +source_group("common" FILES ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp new file mode 100644 index 00000000000..68ca8e8bb22 --- /dev/null +++ b/intern/cycles/device/cpu/device.cpp @@ -0,0 +1,64 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/cpu/device.h" +#include "device/cpu/device_impl.h" + +/* Used for `info.denoisers`. 
*/ +/* TODO(sergey): The denoisers are probably to be moved completely out of the device into their + * own class. But until then keep API consistent with how it used to work before. */ +#include "util/util_openimagedenoise.h" + +CCL_NAMESPACE_BEGIN + +Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ + return new CPUDevice(info, stats, profiler); +} + +void device_cpu_info(vector<DeviceInfo> &devices) +{ + DeviceInfo info; + + info.type = DEVICE_CPU; + info.description = system_cpu_brand_string(); + info.id = "CPU"; + info.num = 0; + info.has_osl = true; + info.has_half_images = true; + info.has_nanovdb = true; + info.has_profiling = true; + if (openimagedenoise_supported()) { + info.denoisers |= DENOISER_OPENIMAGEDENOISE; + } + + devices.insert(devices.begin(), info); +} + +string device_cpu_capabilities() +{ + string capabilities = ""; + capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; + capabilities += system_cpu_support_sse3() ? "SSE3 " : ""; + capabilities += system_cpu_support_sse41() ? "SSE41 " : ""; + capabilities += system_cpu_support_avx() ? "AVX " : ""; + capabilities += system_cpu_support_avx2() ? "AVX2" : ""; + if (capabilities[capabilities.size() - 1] == ' ') + capabilities.resize(capabilities.size() - 1); + return capabilities; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/device.h b/intern/cycles/device/cpu/device.h new file mode 100644 index 00000000000..9cb2e80068d --- /dev/null +++ b/intern/cycles/device/cpu/device.h @@ -0,0 +1,35 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_cpu_info(vector<DeviceInfo> &devices); + +string device_cpu_capabilities(); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp new file mode 100644 index 00000000000..3b0db6bdd0e --- /dev/null +++ b/intern/cycles/device/cpu/device_impl.cpp @@ -0,0 +1,481 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/cpu/device_impl.h" + +#include <stdlib.h> +#include <string.h> + +/* So ImathMath is included before our kernel_cpu_compat. 
*/ +#ifdef WITH_OSL +/* So no context pollution happens from indirectly included windows.h */ +# include "util/util_windows.h" +# include <OSL/oslexec.h> +#endif + +#ifdef WITH_EMBREE +# include <embree3/rtcore.h> +#endif + +#include "device/cpu/kernel.h" +#include "device/cpu/kernel_thread_globals.h" + +#include "device/device.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" +#include "kernel/device/cpu/kernel.h" +#include "kernel/kernel_types.h" + +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" +// clang-format on + +#include "bvh/bvh_embree.h" + +#include "render/buffers.h" + +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_opengl.h" +#include "util/util_openimagedenoise.h" +#include "util/util_optimization.h" +#include "util/util_progress.h" +#include "util/util_system.h" +#include "util/util_task.h" +#include "util/util_thread.h" + +CCL_NAMESPACE_BEGIN + +CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) + : Device(info_, stats_, profiler_), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + /* Pick any kernel, all of them are supposed to have same level of microarchitecture + * optimization. 
*/ + VLOG(1) << "Will be using " << kernels.integrator_init_from_camera.get_uarch_name() + << " kernels."; + + if (info.cpu_threads == 0) { + info.cpu_threads = TaskScheduler::num_threads(); + } + +#ifdef WITH_OSL + kernel_globals.osl = &osl_globals; +#endif +#ifdef WITH_EMBREE + embree_device = rtcNewDevice("verbose=0"); +#endif + need_texture_info = false; +} + +CPUDevice::~CPUDevice() +{ +#ifdef WITH_EMBREE + rtcReleaseDevice(embree_device); +#endif + + texture_info.free(); +} + +bool CPUDevice::show_samples() const +{ + return (info.cpu_threads == 1); +} + +BVHLayoutMask CPUDevice::get_bvh_layout_mask() const +{ + BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; +#ifdef WITH_EMBREE + bvh_layout_mask |= BVH_LAYOUT_EMBREE; +#endif /* WITH_EMBREE */ + return bvh_layout_mask; +} + +bool CPUDevice::load_texture_info() +{ + if (!need_texture_info) { + return false; + } + + texture_info.copy_to_device(); + need_texture_info = false; + + return true; +} + +void CPUDevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + if (mem.type == MEM_DEVICE_ONLY) { + assert(!mem.host_pointer); + size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; + void *data = util_aligned_malloc(mem.memory_size(), alignment); + mem.device_pointer = (device_ptr)data; + } + else { + mem.device_pointer = (device_ptr)mem.host_pointer; + } + + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + } +} + +void CPUDevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + /* copy is no-op */ + } +} + +void CPUDevice::mem_copy_from( + device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) +{ + /* no-op */ +} + +void CPUDevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + + if (mem.device_pointer) { + memset((void *)mem.device_pointer, 0, mem.memory_size()); + } +} + +void CPUDevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else if (mem.device_pointer) { + if (mem.type == MEM_DEVICE_ONLY) { + util_aligned_free((void *)mem.device_pointer); + } + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } +} + +device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) +{ + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); +} + +void CPUDevice::const_copy_to(const char *name, void *host, size_t size) +{ +#if WITH_EMBREE + if (strcmp(name, "__data") == 0) { + assert(size <= sizeof(KernelData)); + + // Update scene handle (since it is different for each device on multi devices) + KernelData *const data = (KernelData *)host; + 
data->bvh.scene = embree_scene; + } +#endif + kernel_const_copy(&kernel_globals, name, host, size); +} + +void CPUDevice::global_alloc(device_memory &mem) +{ + VLOG(1) << "Global memory allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size); + + mem.device_pointer = (device_ptr)mem.host_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); +} + +void CPUDevice::global_free(device_memory &mem) +{ + if (mem.device_pointer) { + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } +} + +void CPUDevice::tex_alloc(device_texture &mem) +{ + VLOG(1) << "Texture allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + mem.device_pointer = (device_ptr)mem.host_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount of re-allocations. 
*/ + texture_info.resize(slot + 128); + } + + texture_info[slot] = mem.info; + texture_info[slot].data = (uint64_t)mem.host_pointer; + need_texture_info = true; +} + +void CPUDevice::tex_free(device_texture &mem) +{ + if (mem.device_pointer) { + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + need_texture_info = true; + } +} + +void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) +{ +#ifdef WITH_EMBREE + if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE || + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) { + BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh); + if (refit) { + bvh_embree->refit(progress); + } + else { + bvh_embree->build(progress, &stats, embree_device); + } + + if (bvh->params.top_level) { + embree_scene = bvh_embree->scene; + } + } + else +#endif + Device::build_bvh(bvh, progress, refit); +} + +#if 0 +void CPUDevice::render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) +{ + const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; + + scoped_timer timer(&tile.buffers->render_time); + + Coverage coverage(kg, tile); + if (use_coverage) { + coverage.init_path_trace(); + } + + float *render_buffer = (float *)tile.buffer; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + + /* Needed for Embree. 
*/ + SIMD_SET_FLUSH_TO_ZERO; + + for (int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || TaskPool::canceled()) { + if (task.need_finish_queue == false) + break; + } + + if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) { + tile.stealing_state = RenderTile::WAS_STOLEN; + break; + } + + if (tile.task == RenderTile::PATH_TRACE) { + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + if (use_coverage) { + coverage.init_pixel(x, y); + } + kernels.path_trace(kg, render_buffer, sample, x, y, tile.offset, tile.stride); + } + } + } + else { + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + kernels.bake(kg, render_buffer, sample, x, y, tile.offset, tile.stride); + } + } + } + tile.sample = sample + 1; + + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) { + const bool stop = adaptive_sampling_filter(kg, tile, sample); + if (stop) { + const int num_progress_samples = end_sample - sample; + tile.sample = end_sample; + task.update_progress(&tile, tile.w * tile.h * num_progress_samples); + break; + } + } + + task.update_progress(&tile, tile.w * tile.h); + } + if (use_coverage) { + coverage.finalize(); + } + + if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) { + adaptive_sampling_post(tile, kg); + } +} + +void CPUDevice::thread_render(DeviceTask &task) +{ + if (TaskPool::canceled()) { + if (task.need_finish_queue == false) + return; + } + + /* allocate buffer for kernel globals */ + CPUKernelThreadGlobals kg(kernel_globals, get_cpu_osl_memory()); + + profiler.add_state(&kg.profiler); + + /* NLM denoiser. */ + DenoisingTask *denoising = NULL; + + /* OpenImageDenoise: we can only denoise with one thread at a time, so to + * avoid waiting with mutex locks in the denoiser, we let only a single + * thread acquire denoising tiles. 
*/ + uint tile_types = task.tile_types; + bool hold_denoise_lock = false; + if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + if (!oidn_task_lock.try_lock()) { + tile_types &= ~RenderTile::DENOISE; + hold_denoise_lock = true; + } + } + + RenderTile tile; + while (task.acquire_tile(this, tile, tile_types)) { + if (tile.task == RenderTile::PATH_TRACE) { + render(task, tile, &kg); + } + else if (tile.task == RenderTile::BAKE) { + render(task, tile, &kg); + } + else if (tile.task == RenderTile::DENOISE) { + denoise_openimagedenoise(task, tile); + task.update_progress(&tile, tile.w * tile.h); + } + + task.release_tile(tile); + + if (TaskPool::canceled()) { + if (task.need_finish_queue == false) + break; + } + } + + if (hold_denoise_lock) { + oidn_task_lock.unlock(); + } + + profiler.remove_state(&kg.profiler); + + delete denoising; +} + +void CPUDevice::thread_denoise(DeviceTask &task) +{ + RenderTile tile; + tile.x = task.x; + tile.y = task.y; + tile.w = task.w; + tile.h = task.h; + tile.buffer = task.buffer; + tile.sample = task.sample + task.num_samples; + tile.num_samples = task.num_samples; + tile.start_sample = task.sample; + tile.offset = task.offset; + tile.stride = task.stride; + tile.buffers = task.buffers; + + denoise_openimagedenoise(task, tile); + + task.update_progress(&tile, tile.w * tile.h); +} +#endif + +const CPUKernels *CPUDevice::get_cpu_kernels() const +{ + return &kernels; +} + +void CPUDevice::get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> &kernel_thread_globals) +{ + /* Ensure latest texture info is loaded into kernel globals before returning. 
*/ + load_texture_info(); + + kernel_thread_globals.clear(); + void *osl_memory = get_cpu_osl_memory(); + for (int i = 0; i < info.cpu_threads; i++) { + kernel_thread_globals.emplace_back(kernel_globals, osl_memory, profiler); + } +} + +void *CPUDevice::get_cpu_osl_memory() +{ +#ifdef WITH_OSL + return &osl_globals; +#else + return NULL; +#endif +} + +bool CPUDevice::load_kernels(const uint /*kernel_features*/) +{ + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h new file mode 100644 index 00000000000..7d222808652 --- /dev/null +++ b/intern/cycles/device/cpu/device_impl.h @@ -0,0 +1,99 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +/* So ImathMath is included before our kernel_cpu_compat. 
*/ +#ifdef WITH_OSL +/* So no context pollution happens from indirectly included windows.h */ +# include "util/util_windows.h" +# include <OSL/oslexec.h> +#endif + +#ifdef WITH_EMBREE +# include <embree3/rtcore.h> +#endif + +#include "device/cpu/kernel.h" +#include "device/device.h" +#include "device/device_memory.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/kernel.h" +#include "kernel/device/cpu/globals.h" + +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" +// clang-format on + +CCL_NAMESPACE_BEGIN + +class CPUDevice : public Device { + public: + KernelGlobals kernel_globals; + + device_vector<TextureInfo> texture_info; + bool need_texture_info; + +#ifdef WITH_OSL + OSLGlobals osl_globals; +#endif +#ifdef WITH_EMBREE + RTCScene embree_scene = NULL; + RTCDevice embree_device; +#endif + + CPUKernels kernels; + + CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_); + ~CPUDevice(); + + virtual bool show_samples() const override; + + virtual BVHLayoutMask get_bvh_layout_mask() const override; + + /* Returns true if the texture info was copied to the device (meaning, some more + * re-initialization might be needed). 
*/ + bool load_texture_info(); + + virtual void mem_alloc(device_memory &mem) override; + virtual void mem_copy_to(device_memory &mem) override; + virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; + virtual void mem_zero(device_memory &mem) override; + virtual void mem_free(device_memory &mem) override; + virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + void global_alloc(device_memory &mem); + void global_free(device_memory &mem); + + void tex_alloc(device_texture &mem); + void tex_free(device_texture &mem); + + void build_bvh(BVH *bvh, Progress &progress, bool refit) override; + + virtual const CPUKernels *get_cpu_kernels() const override; + virtual void get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> &kernel_thread_globals) override; + virtual void *get_cpu_osl_memory() override; + + protected: + virtual bool load_kernels(uint /*kernel_features*/) override; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp new file mode 100644 index 00000000000..91282390e27 --- /dev/null +++ b/intern/cycles/device/cpu/kernel.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/cpu/kernel.h" + +#include "kernel/device/cpu/kernel.h" + +CCL_NAMESPACE_BEGIN + +#define KERNEL_FUNCTIONS(name) \ + KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \ + KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \ + KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name) + +#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name)) + +CPUKernels::CPUKernels() + : /* Integrator. */ + REGISTER_KERNEL(integrator_init_from_camera), + REGISTER_KERNEL(integrator_init_from_bake), + REGISTER_KERNEL(integrator_intersect_closest), + REGISTER_KERNEL(integrator_intersect_shadow), + REGISTER_KERNEL(integrator_intersect_subsurface), + REGISTER_KERNEL(integrator_intersect_volume_stack), + REGISTER_KERNEL(integrator_shade_background), + REGISTER_KERNEL(integrator_shade_light), + REGISTER_KERNEL(integrator_shade_shadow), + REGISTER_KERNEL(integrator_shade_surface), + REGISTER_KERNEL(integrator_shade_volume), + REGISTER_KERNEL(integrator_megakernel), + /* Shader evaluation. */ + REGISTER_KERNEL(shader_eval_displace), + REGISTER_KERNEL(shader_eval_background), + /* Adaptive sampling. */ + REGISTER_KERNEL(adaptive_sampling_convergence_check), + REGISTER_KERNEL(adaptive_sampling_filter_x), + REGISTER_KERNEL(adaptive_sampling_filter_y), + /* Cryptomatte. */ + REGISTER_KERNEL(cryptomatte_postprocess), + /* Bake. */ + REGISTER_KERNEL(bake) +{ +} + +#undef REGISTER_KERNEL +#undef KERNEL_FUNCTIONS + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h new file mode 100644 index 00000000000..54b18308544 --- /dev/null +++ b/intern/cycles/device/cpu/kernel.h @@ -0,0 +1,111 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/cpu/kernel_function.h" +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +struct KernelGlobals; +struct IntegratorStateCPU; +struct TileInfo; + +class CPUKernels { + public: + /* Integrator. */ + + using IntegratorFunction = + CPUKernelFunction<void (*)(const KernelGlobals *kg, IntegratorStateCPU *state)>; + using IntegratorShadeFunction = CPUKernelFunction<void (*)( + const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer)>; + using IntegratorInitFunction = CPUKernelFunction<bool (*)(const KernelGlobals *kg, + IntegratorStateCPU *state, + KernelWorkTile *tile, + ccl_global float *render_buffer)>; + + IntegratorInitFunction integrator_init_from_camera; + IntegratorInitFunction integrator_init_from_bake; + IntegratorFunction integrator_intersect_closest; + IntegratorFunction integrator_intersect_shadow; + IntegratorFunction integrator_intersect_subsurface; + IntegratorFunction integrator_intersect_volume_stack; + IntegratorShadeFunction integrator_shade_background; + IntegratorShadeFunction integrator_shade_light; + IntegratorShadeFunction integrator_shade_shadow; + IntegratorShadeFunction integrator_shade_surface; + IntegratorShadeFunction integrator_shade_volume; + IntegratorShadeFunction integrator_megakernel; + + /* Shader evaluation. 
*/ + + using ShaderEvalFunction = CPUKernelFunction<void (*)( + const KernelGlobals *kg, const KernelShaderEvalInput *, float4 *, const int)>; + + ShaderEvalFunction shader_eval_displace; + ShaderEvalFunction shader_eval_background; + + /* Adaptive stopping. */ + + using AdaptiveSamplingConvergenceCheckFunction = + CPUKernelFunction<bool (*)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int y, + float threshold, + bool reset, + int offset, + int stride)>; + + using AdaptiveSamplingFilterXFunction = + CPUKernelFunction<void (*)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride)>; + + using AdaptiveSamplingFilterYFunction = + CPUKernelFunction<void (*)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride)>; + + AdaptiveSamplingConvergenceCheckFunction adaptive_sampling_convergence_check; + + AdaptiveSamplingFilterXFunction adaptive_sampling_filter_x; + AdaptiveSamplingFilterYFunction adaptive_sampling_filter_y; + + /* Cryptomatte. */ + + using CryptomattePostprocessFunction = CPUKernelFunction<void (*)( + const KernelGlobals *kg, ccl_global float *render_buffer, int pixel_index)>; + + CryptomattePostprocessFunction cryptomatte_postprocess; + + /* Bake. */ + + CPUKernelFunction<void (*)(const KernelGlobals *, float *, int, int, int, int, int)> bake; + + CPUKernels(); +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel_function.h b/intern/cycles/device/cpu/kernel_function.h new file mode 100644 index 00000000000..aa18720cc24 --- /dev/null +++ b/intern/cycles/device/cpu/kernel_function.h @@ -0,0 +1,124 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_debug.h" +#include "util/util_system.h" + +CCL_NAMESPACE_BEGIN + +/* A wrapper around per-microarchitecture variant of a kernel function. + * + * Provides a function-call-like API which gets routed to the most suitable implementation. + * + * For example, on a computer which only has SSE4.1 the kernel_sse41 will be used. */ +template<typename FunctionType> class CPUKernelFunction { + public: + CPUKernelFunction(FunctionType kernel_default, + FunctionType kernel_sse2, + FunctionType kernel_sse3, + FunctionType kernel_sse41, + FunctionType kernel_avx, + FunctionType kernel_avx2) + { + kernel_info_ = get_best_kernel_info( + kernel_default, kernel_sse2, kernel_sse3, kernel_sse41, kernel_avx, kernel_avx2); + } + + template<typename... Args> inline auto operator()(Args... args) const + { + assert(kernel_info_.kernel); + + return kernel_info_.kernel(args...); + } + + const char *get_uarch_name() const + { + return kernel_info_.uarch_name; + } + + protected: + /* Helper class which allows to pass human-readable microarchitecture name together with function + * pointer. */ + class KernelInfo { + public: + KernelInfo() : KernelInfo("", nullptr) + { + } + + /* TODO(sergey): Use string view, to have higher-level functionality (i.e. comparison) without + * memory allocation. 
*/ + KernelInfo(const char *uarch_name, FunctionType kernel) + : uarch_name(uarch_name), kernel(kernel) + { + } + + const char *uarch_name; + FunctionType kernel; + }; + + KernelInfo get_best_kernel_info(FunctionType kernel_default, + FunctionType kernel_sse2, + FunctionType kernel_sse3, + FunctionType kernel_sse41, + FunctionType kernel_avx, + FunctionType kernel_avx2) + { + /* Silence warnings about unused variables when compiling without some architectures. */ + (void)kernel_sse2; + (void)kernel_sse3; + (void)kernel_sse41; + (void)kernel_avx; + (void)kernel_avx2; + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { + return KernelInfo("AVX2", kernel_avx2); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) { + return KernelInfo("AVX", kernel_avx); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) { + return KernelInfo("SSE4.1", kernel_sse41); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) { + return KernelInfo("SSE3", kernel_sse3); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { + return KernelInfo("SSE2", kernel_sse2); + } +#endif + + return KernelInfo("default", kernel_default); + } + + KernelInfo kernel_info_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel_thread_globals.cpp b/intern/cycles/device/cpu/kernel_thread_globals.cpp new file mode 100644 index 00000000000..988b00cd1f0 --- /dev/null +++ b/intern/cycles/device/cpu/kernel_thread_globals.cpp @@ -0,0 +1,85 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/cpu/kernel_thread_globals.h" + +// clang-format off +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" +// clang-format on + +#include "util/util_profiling.h" + +CCL_NAMESPACE_BEGIN + +CPUKernelThreadGlobals::CPUKernelThreadGlobals(const KernelGlobals &kernel_globals, + void *osl_globals_memory, + Profiler &cpu_profiler) + : KernelGlobals(kernel_globals), cpu_profiler_(cpu_profiler) +{ + reset_runtime_memory(); + +#ifdef WITH_OSL + OSLShader::thread_init(this, reinterpret_cast<OSLGlobals *>(osl_globals_memory)); +#else + (void)osl_globals_memory; +#endif +} + +CPUKernelThreadGlobals::CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept + : KernelGlobals(std::move(other)), cpu_profiler_(other.cpu_profiler_) +{ + other.reset_runtime_memory(); +} + +CPUKernelThreadGlobals::~CPUKernelThreadGlobals() +{ +#ifdef WITH_OSL + OSLShader::thread_free(this); +#endif +} + +CPUKernelThreadGlobals &CPUKernelThreadGlobals::operator=(CPUKernelThreadGlobals &&other) +{ + if (this == &other) { + return *this; + } + + *static_cast<KernelGlobals *>(this) = *static_cast<KernelGlobals *>(&other); + + other.reset_runtime_memory(); + + return *this; +} + +void CPUKernelThreadGlobals::reset_runtime_memory() +{ +#ifdef WITH_OSL + osl = nullptr; +#endif +} + +void CPUKernelThreadGlobals::start_profiling() +{ + cpu_profiler_.add_state(&profiler); +} + +void CPUKernelThreadGlobals::stop_profiling() +{ + cpu_profiler_.remove_state(&profiler); +} + +CCL_NAMESPACE_END diff --git 
a/intern/cycles/device/cpu/kernel_thread_globals.h b/intern/cycles/device/cpu/kernel_thread_globals.h new file mode 100644 index 00000000000..d005c3bb56c --- /dev/null +++ b/intern/cycles/device/cpu/kernel_thread_globals.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" + +CCL_NAMESPACE_BEGIN + +class Profiler; + +/* A special class which extends memory ownership of the `KernelGlobals` decoupling any resource + * which is not thread-safe for access. Every worker thread which needs to operate on + * `KernelGlobals` needs to initialize its own copy of this object. + * + * NOTE: Only minimal subset of objects are copied: `KernelData` is never copied. This means that + * there is no unnecessary data duplication happening when using this object. */ +class CPUKernelThreadGlobals : public KernelGlobals { + public: + /* TODO(sergey): Would be nice to have properly typed OSLGlobals even in the case when building + * without OSL support. Will avoid need to those unnamed pointers and casts. 
*/ + CPUKernelThreadGlobals(const KernelGlobals &kernel_globals, + void *osl_globals_memory, + Profiler &cpu_profiler); + + ~CPUKernelThreadGlobals(); + + CPUKernelThreadGlobals(const CPUKernelThreadGlobals &other) = delete; + CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept; + + CPUKernelThreadGlobals &operator=(const CPUKernelThreadGlobals &other) = delete; + CPUKernelThreadGlobals &operator=(CPUKernelThreadGlobals &&other); + + void start_profiling(); + void stop_profiling(); + + protected: + void reset_runtime_memory(); + + Profiler &cpu_profiler_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/cuda/device.cpp index 2e225ecfaf8..84becd6d081 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/cuda/device.cpp @@ -14,21 +14,25 @@ * limitations under the License. */ -#ifdef WITH_CUDA +#include "device/cuda/device.h" + +#include "util/util_logging.h" -# include "device/cuda/device_cuda.h" +#ifdef WITH_CUDA +# include "device/cuda/device_impl.h" # include "device/device.h" -# include "device/device_intern.h" -# include "util/util_logging.h" # include "util/util_string.h" # include "util/util_windows.h" +#endif /* WITH_CUDA */ CCL_NAMESPACE_BEGIN bool device_cuda_init() { -# ifdef WITH_CUDA_DYNLOAD +#if !defined(WITH_CUDA) + return false; +#elif defined(WITH_CUDA_DYNLOAD) static bool initialized = false; static bool result = false; @@ -59,16 +63,27 @@ bool device_cuda_init() } return result; -# else /* WITH_CUDA_DYNLOAD */ +#else /* WITH_CUDA_DYNLOAD */ return true; -# endif /* WITH_CUDA_DYNLOAD */ +#endif /* WITH_CUDA_DYNLOAD */ } -Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) +Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) { - return new CUDADevice(info, stats, profiler, background); +#ifdef WITH_CUDA + return new CUDADevice(info, stats, profiler); +#else + (void)info; + (void)stats; + 
(void)profiler; + + LOG(FATAL) << "Request to create CUDA device without compiled-in support. Should never happen."; + + return nullptr; +#endif } +#ifdef WITH_CUDA static CUresult device_cuda_safe_init() { # ifdef _WIN32 @@ -86,9 +101,11 @@ static CUresult device_cuda_safe_init() return cuInit(0); # endif } +#endif /* WITH_CUDA */ void device_cuda_info(vector<DeviceInfo> &devices) { +#ifdef WITH_CUDA CUresult result = device_cuda_safe_init(); if (result != CUDA_SUCCESS) { if (result != CUDA_ERROR_NO_DEVICE) @@ -129,9 +146,9 @@ void device_cuda_info(vector<DeviceInfo> &devices) info.has_half_images = (major >= 3); info.has_nanovdb = true; - info.has_volume_decoupled = false; - info.has_adaptive_stop_per_sample = false; - info.denoisers = DENOISER_NLM; + info.denoisers = 0; + + info.has_gpu_queue = true; /* Check if the device has P2P access to any other device in the system. */ for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) { @@ -182,10 +199,14 @@ void device_cuda_info(vector<DeviceInfo> &devices) if (!display_devices.empty()) devices.insert(devices.end(), display_devices.begin(), display_devices.end()); +#else /* WITH_CUDA */ + (void)devices; +#endif /* WITH_CUDA */ } string device_cuda_capabilities() { +#ifdef WITH_CUDA CUresult result = device_cuda_safe_init(); if (result != CUDA_SUCCESS) { if (result != CUDA_ERROR_NO_DEVICE) { @@ -310,8 +331,10 @@ string device_cuda_capabilities() } return capabilities; + +#else /* WITH_CUDA */ + return ""; +#endif /* WITH_CUDA */ } CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/cuda/device.h b/intern/cycles/device/cuda/device.h new file mode 100644 index 00000000000..b0484904d1a --- /dev/null +++ b/intern/cycles/device/cuda/device.h @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_cuda_init(); + +Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_cuda_info(vector<DeviceInfo> &devices); + +string device_cuda_capabilities(); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h deleted file mode 100644 index c3271c3cfcf..00000000000 --- a/intern/cycles/device/cuda/device_cuda.h +++ /dev/null @@ -1,270 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_CUDA - -# include "device/device.h" -# include "device/device_denoising.h" -# include "device/device_split_kernel.h" - -# include "util/util_map.h" -# include "util/util_task.h" - -# ifdef WITH_CUDA_DYNLOAD -# include "cuew.h" -# else -# include "util/util_opengl.h" -# include <cuda.h> -# include <cudaGL.h> -# endif - -CCL_NAMESPACE_BEGIN - -class CUDASplitKernel; - -class CUDADevice : public Device { - - friend class CUDASplitKernelFunction; - friend class CUDASplitKernel; - friend class CUDAContextScope; - - public: - DedicatedTaskPool task_pool; - CUdevice cuDevice; - CUcontext cuContext; - CUmodule cuModule, cuFilterModule; - size_t device_texture_headroom; - size_t device_working_headroom; - bool move_texture_to_host; - size_t map_host_used; - size_t map_host_limit; - int can_map_host; - int pitch_alignment; - int cuDevId; - int cuDevArchitecture; - bool first_error; - CUDASplitKernel *split_kernel; - - struct CUDAMem { - CUDAMem() : texobject(0), array(0), use_mapped_host(false) - { - } - - CUtexObject texobject; - CUarray array; - - /* If true, a mapped host memory in shared_pointer is being used. 
*/ - bool use_mapped_host; - }; - typedef map<device_memory *, CUDAMem> CUDAMemMap; - CUDAMemMap cuda_mem_map; - thread_mutex cuda_mem_map_mutex; - - struct PixelMem { - GLuint cuPBO; - CUgraphicsResource cuPBOresource; - GLuint cuTexId; - int w, h; - }; - map<device_ptr, PixelMem> pixel_mem_map; - - /* Bindless Textures */ - device_vector<TextureInfo> texture_info; - bool need_texture_info; - - /* Kernels */ - struct { - bool loaded; - - CUfunction adaptive_stopping; - CUfunction adaptive_filter_x; - CUfunction adaptive_filter_y; - CUfunction adaptive_scale_samples; - int adaptive_num_threads_per_block; - } functions; - - static bool have_precompiled_kernels(); - - virtual bool show_samples() const override; - - virtual BVHLayoutMask get_bvh_layout_mask() const override; - - void set_error(const string &error) override; - - CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_); - - virtual ~CUDADevice(); - - bool support_device(const DeviceRequestedFeatures & /*requested_features*/); - - bool check_peer_access(Device *peer_device) override; - - bool use_adaptive_compilation(); - - bool use_split_kernel(); - - virtual string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures &requested_features, bool filter = false, bool split = false); - - string compile_kernel(const DeviceRequestedFeatures &requested_features, - const char *name, - const char *base = "cuda", - bool force_ptx = false); - - virtual bool load_kernels(const DeviceRequestedFeatures &requested_features) override; - - void load_functions(); - - void reserve_local_memory(const DeviceRequestedFeatures &requested_features); - - void init_host_memory(); - - void load_texture_info(); - - void move_textures_to_host(size_t size, bool for_texture); - - CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0); - - void generic_copy_to(device_memory &mem); - - void generic_free(device_memory &mem); - - void mem_alloc(device_memory &mem) override; - - void 
mem_copy_to(device_memory &mem) override; - - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; - - void mem_zero(device_memory &mem) override; - - void mem_free(device_memory &mem) override; - - device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; - - virtual void const_copy_to(const char *name, void *host, size_t size) override; - - void global_alloc(device_memory &mem); - - void global_free(device_memory &mem); - - void tex_alloc(device_texture &mem); - - void tex_free(device_texture &mem); - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task); - - bool denoising_construct_transform(DenoisingTask *task); - - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task); - - bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); - - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task); - - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task); - - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task); - - bool denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task); - - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task); - - void denoise(RenderTile &rtile, DenoisingTask &denoising); - - void adaptive_sampling_filter(uint filter_sample, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream = 0); - void 
adaptive_sampling_post(RenderTile &rtile, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream = 0); - - void render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles); - - void film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half); - - void shader(DeviceTask &task); - - CUdeviceptr map_pixels(device_ptr mem); - - void unmap_pixels(device_ptr mem); - - void pixels_alloc(device_memory &mem); - - void pixels_copy_from(device_memory &mem, int y, int w, int h); - - void pixels_free(device_memory &mem); - - void draw_pixels(device_memory &mem, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) override; - - void thread_run(DeviceTask &task); - - virtual void task_add(DeviceTask &task) override; - - virtual void task_wait() override; - - virtual void task_cancel() override; -}; - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp deleted file mode 100644 index 2d2fcb38705..00000000000 --- a/intern/cycles/device/cuda/device_cuda_impl.cpp +++ /dev/null @@ -1,2714 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_CUDA - -# include <climits> -# include <limits.h> -# include <stdio.h> -# include <stdlib.h> -# include <string.h> - -# include "device/cuda/device_cuda.h" -# include "device/device_intern.h" -# include "device/device_split_kernel.h" - -# include "render/buffers.h" - -# include "kernel/filter/filter_defines.h" - -# include "util/util_debug.h" -# include "util/util_foreach.h" -# include "util/util_logging.h" -# include "util/util_map.h" -# include "util/util_md5.h" -# include "util/util_opengl.h" -# include "util/util_path.h" -# include "util/util_string.h" -# include "util/util_system.h" -# include "util/util_time.h" -# include "util/util_types.h" -# include "util/util_windows.h" - -# include "kernel/split/kernel_split_data_types.h" - -CCL_NAMESPACE_BEGIN - -# ifndef WITH_CUDA_DYNLOAD - -/* Transparently implement some functions, so majority of the file does not need - * to worry about difference between dynamically loaded and linked CUDA at all. - */ - -namespace { - -const char *cuewErrorString(CUresult result) -{ - /* We can only give error code here without major code duplication, that - * should be enough since dynamic loading is only being disabled by folks - * who knows what they're doing anyway. - * - * NOTE: Avoid call from several threads. 
- */ - static string error; - error = string_printf("%d", result); - return error.c_str(); -} - -const char *cuewCompilerPath() -{ - return CYCLES_CUDA_NVCC_EXECUTABLE; -} - -int cuewCompilerVersion() -{ - return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); -} - -} /* namespace */ -# endif /* WITH_CUDA_DYNLOAD */ - -class CUDADevice; - -class CUDASplitKernel : public DeviceSplitKernel { - CUDADevice *device; - - public: - explicit CUDASplitKernel(CUDADevice *device); - - virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data_, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs); - - virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task); -}; - -/* Utility to push/pop CUDA context. */ -class CUDAContextScope { - public: - CUDAContextScope(CUDADevice *device); - ~CUDAContextScope(); - - private: - CUDADevice *device; -}; - -bool CUDADevice::have_precompiled_kernels() -{ - string cubins_path = path_get("lib"); - return path_exists(cubins_path); -} - -bool CUDADevice::show_samples() const -{ - /* The CUDADevice only processes one tile at a time, so showing samples is fine. 
*/ - return true; -} - -BVHLayoutMask CUDADevice::get_bvh_layout_mask() const -{ - return BVH_LAYOUT_BVH2; -} - -void CUDADevice::set_error(const string &error) -{ - Device::set_error(error); - - if (first_error) { - fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); - fprintf(stderr, - "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); - first_error = false; - } -} - -# define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ - if (result != CUDA_SUCCESS) { \ - const char *name = cuewErrorString(result); \ - set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \ - } \ - } \ - (void)0 - -CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) - : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL) -{ - first_error = true; - background = background_; - - cuDevId = info.num; - cuDevice = 0; - cuContext = 0; - - cuModule = 0; - cuFilterModule = 0; - - split_kernel = NULL; - - need_texture_info = false; - - device_texture_headroom = 0; - device_working_headroom = 0; - move_texture_to_host = false; - map_host_limit = 0; - map_host_used = 0; - can_map_host = 0; - pitch_alignment = 0; - - functions.loaded = false; - - /* Initialize CUDA. */ - CUresult result = cuInit(0); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result))); - return; - } - - /* Setup device and context. */ - result = cuDeviceGet(&cuDevice, cuDevId); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)", - cuewErrorString(result))); - return; - } - - /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. - * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, - * so we can predict which memory to map to host. 
*/ - cuda_assert( - cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); - - cuda_assert(cuDeviceGetAttribute( - &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); - - unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; - if (can_map_host) { - ctx_flags |= CU_CTX_MAP_HOST; - init_host_memory(); - } - - /* Create context. */ - if (background) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - } - else { - result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice); - - if (result != CUDA_SUCCESS) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - background = true; - } - } - - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result))); - return; - } - - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - cuDevArchitecture = major * 100 + minor * 10; - - /* Pop context set by cuCtxCreate. 
*/ - cuCtxPopCurrent(NULL); -} - -CUDADevice::~CUDADevice() -{ - task_pool.cancel(); - - delete split_kernel; - - texture_info.free(); - - cuda_assert(cuCtxDestroy(cuContext)); -} - -bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/) -{ - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* We only support sm_30 and above */ - if (major < 3) { - set_error(string_printf( - "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor)); - return false; - } - - return true; -} - -bool CUDADevice::check_peer_access(Device *peer_device) -{ - if (peer_device == this) { - return false; - } - if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) { - return false; - } - - CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device); - - int can_access = 0; - cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice)); - if (can_access == 0) { - return false; - } - - // Ensure array access over the link is possible as well (for 3D textures) - cuda_assert(cuDeviceGetP2PAttribute(&can_access, - CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED, - cuDevice, - peer_device_cuda->cuDevice)); - if (can_access == 0) { - return false; - } - - // Enable peer access in both directions - { - const CUDAContextScope scope(this); - CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to enable peer access on CUDA context (%s)", - cuewErrorString(result))); - return false; - } - } - { - const CUDAContextScope scope(peer_device_cuda); - CUresult result = cuCtxEnablePeerAccess(cuContext, 0); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to enable peer access on CUDA context (%s)", - cuewErrorString(result))); - 
return false; - } - } - - return true; -} - -bool CUDADevice::use_adaptive_compilation() -{ - return DebugFlags().cuda.adaptive_compile; -} - -bool CUDADevice::use_split_kernel() -{ - return DebugFlags().cuda.split_kernel; -} - -/* Common NVCC flags which stays the same regardless of shading model, - * kernel sources md5 and only depends on compiler or compilation settings. - */ -string CUDADevice::compile_kernel_get_common_cflags( - const DeviceRequestedFeatures &requested_features, bool filter, bool split) -{ - const int machine = system_cpu_bits(); - const string source_path = path_get("source"); - const string include_path = source_path; - string cflags = string_printf( - "-m%d " - "--ptxas-options=\"-v\" " - "--use_fast_math " - "-DNVCC " - "-I\"%s\"", - machine, - include_path.c_str()); - if (!filter && use_adaptive_compilation()) { - cflags += " " + requested_features.get_build_options(); - } - const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); - if (extra_cflags) { - cflags += string(" ") + string(extra_cflags); - } - - if (split) { - cflags += " -D__SPLIT__"; - } - -# ifdef WITH_NANOVDB - cflags += " -DWITH_NANOVDB"; -# endif - - return cflags; -} - -string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features, - const char *name, - const char *base, - bool force_ptx) -{ - /* Compute kernel name. */ - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* Attempt to use kernel provided with Blender. 
*/ - if (!use_adaptive_compilation()) { - if (!force_ptx) { - const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); - VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; - if (path_exists(cubin)) { - VLOG(1) << "Using precompiled kernel."; - return cubin; - } - } - - /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */ - int ptx_major = major, ptx_minor = minor; - while (ptx_major >= 3) { - const string ptx = path_get( - string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); - VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; - if (path_exists(ptx)) { - VLOG(1) << "Using precompiled kernel."; - return ptx; - } - - if (ptx_minor > 0) { - ptx_minor--; - } - else { - ptx_major--; - ptx_minor = 9; - } - } - } - - /* Try to use locally compiled kernel. */ - string source_path = path_get("source"); - const string source_md5 = path_files_md5_hash(source_path); - - /* We include cflags into md5 so changing cuda toolkit or changing other - * compiler command line arguments makes sure cubin gets re-built. - */ - string common_cflags = compile_kernel_get_common_cflags( - requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL); - const string kernel_md5 = util_md5_string(source_md5 + common_cflags); - - const char *const kernel_ext = force_ptx ? "ptx" : "cubin"; - const char *const kernel_arch = force_ptx ? 
"compute" : "sm"; - const string cubin_file = string_printf( - "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext); - const string cubin = path_cache_get(path_join("kernels", cubin_file)); - VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; - if (path_exists(cubin)) { - VLOG(1) << "Using locally compiled kernel."; - return cubin; - } - -# ifdef _WIN32 - if (!use_adaptive_compilation() && have_precompiled_kernels()) { - if (major < 3) { - set_error( - string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. " - "Your GPU is not supported.", - major, - minor)); - } - else { - set_error( - string_printf("CUDA binary kernel for this graphics card compute " - "capability (%d.%d) not found.", - major, - minor)); - } - return string(); - } -# endif - - /* Compile. */ - const char *const nvcc = cuewCompilerPath(); - if (nvcc == NULL) { - set_error( - "CUDA nvcc compiler not found. " - "Install CUDA toolkit in default location."); - return string(); - } - - const int nvcc_cuda_version = cuewCompilerVersion(); - VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << "."; - if (nvcc_cuda_version < 101) { - printf( - "Unsupported CUDA version %d.%d detected, " - "you need CUDA 10.1 or newer.\n", - nvcc_cuda_version / 10, - nvcc_cuda_version % 10); - return string(); - } - else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 || - nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) { - printf( - "CUDA version %d.%d detected, build may succeed but only " - "CUDA 10.1 to 11.4 are officially supported.\n", - nvcc_cuda_version / 10, - nvcc_cuda_version % 10); - } - - double starttime = time_dt(); - - path_create_directories(cubin); - - source_path = path_join(path_join(source_path, "kernel"), - path_join("kernels", path_join(base, string_printf("%s.cu", name)))); - - string command = string_printf( - "\"%s\" 
" - "-arch=%s_%d%d " - "--%s \"%s\" " - "-o \"%s\" " - "%s", - nvcc, - kernel_arch, - major, - minor, - kernel_ext, - source_path.c_str(), - cubin.c_str(), - common_cflags.c_str()); - - printf("Compiling CUDA kernel ...\n%s\n", command.c_str()); - -# ifdef _WIN32 - command = "call " + command; -# endif - if (system(command.c_str()) != 0) { - set_error( - "Failed to execute compilation command, " - "see console for details."); - return string(); - } - - /* Verify if compilation succeeded */ - if (!path_exists(cubin)) { - set_error( - "CUDA kernel compilation failed, " - "see console for details."); - return string(); - } - - printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); - - return cubin; -} - -bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features) -{ - /* TODO(sergey): Support kernels re-load for CUDA devices. - * - * Currently re-loading kernel will invalidate memory pointers, - * causing problems in cuCtxSynchronize. - */ - if (cuFilterModule && cuModule) { - VLOG(1) << "Skipping kernel reload, not currently supported."; - return true; - } - - /* check if cuda init succeeded */ - if (cuContext == 0) - return false; - - /* check if GPU is supported */ - if (!support_device(requested_features)) - return false; - - /* get kernel */ - const char *kernel_name = use_split_kernel() ? 
"kernel_split" : "kernel"; - string cubin = compile_kernel(requested_features, kernel_name); - if (cubin.empty()) - return false; - - const char *filter_name = "filter"; - string filter_cubin = compile_kernel(requested_features, filter_name); - if (filter_cubin.empty()) - return false; - - /* open module */ - CUDAContextScope scope(this); - - string cubin_data; - CUresult result; - - if (path_read_text(cubin, cubin_data)) - result = cuModuleLoadData(&cuModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if (result != CUDA_SUCCESS) - set_error(string_printf( - "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result))); - - if (path_read_text(filter_cubin, cubin_data)) - result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if (result != CUDA_SUCCESS) - set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)", - filter_cubin.c_str(), - cuewErrorString(result))); - - if (result == CUDA_SUCCESS) { - reserve_local_memory(requested_features); - } - - load_functions(); - - return (result == CUDA_SUCCESS); -} - -void CUDADevice::load_functions() -{ - /* TODO: load all functions here. 
*/ - if (functions.loaded) { - return; - } - functions.loaded = true; - - cuda_assert(cuModuleGetFunction( - &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping")); - cuda_assert(cuModuleGetFunction( - &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x")); - cuda_assert(cuModuleGetFunction( - &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y")); - cuda_assert(cuModuleGetFunction( - &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples")); - - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1)); - - int unused_min_blocks; - cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks, - &functions.adaptive_num_threads_per_block, - functions.adaptive_scale_samples, - NULL, - 0, - 0)); -} - -void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features) -{ - if (use_split_kernel()) { - /* Split kernel mostly uses global memory and adaptive compilation, - * difficult to predict how much is needed currently. */ - return; - } - - /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory - * needed for kernel launches, so that we can reliably figure out when - * to allocate scene data in mapped host memory. */ - CUDAContextScope scope(this); - - size_t total = 0, free_before = 0, free_after = 0; - cuMemGetInfo(&free_before, &total); - - /* Get kernel function. 
*/ - CUfunction cuRender; - - if (requested_features.use_baking) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); - } - else if (requested_features.use_integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); - } - - cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); - - int min_blocks, num_threads_per_block; - cuda_assert( - cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); - - /* Launch kernel, using just 1 block appears sufficient to reserve - * memory for all multiprocessors. It would be good to do this in - * parallel for the multi GPU case still to make it faster. */ - CUdeviceptr d_work_tiles = 0; - uint total_work_size = 0; - - void *args[] = {&d_work_tiles, &total_work_size}; - - cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - cuMemGetInfo(&free_after, &total); - VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) - << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; - -# if 0 - /* For testing mapped host memory, fill up device memory. */ - const size_t keep_mb = 1024; - - while (free_after > keep_mb * 1024 * 1024LL) { - CUdeviceptr tmp; - cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); - cuMemGetInfo(&free_after, &total); - } -# endif -} - -void CUDADevice::init_host_memory() -{ - /* Limit amount of host mapped memory, because allocating too much can - * cause system instability. Leave at least half or 4 GB of system - * memory free, whichever is smaller. 
*/ - size_t default_limit = 4 * 1024 * 1024 * 1024LL; - size_t system_ram = system_physical_ram(); - - if (system_ram > 0) { - if (system_ram / 2 > default_limit) { - map_host_limit = system_ram - default_limit; - } - else { - map_host_limit = system_ram / 2; - } - } - else { - VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; - map_host_limit = 0; - } - - /* Amount of device memory to keep is free after texture memory - * and working memory allocations respectively. We set the working - * memory limit headroom lower so that some space is left after all - * texture memory allocations. */ - device_working_headroom = 32 * 1024 * 1024LL; // 32MB - device_texture_headroom = 128 * 1024 * 1024LL; // 128MB - - VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) - << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; -} - -void CUDADevice::load_texture_info() -{ - if (need_texture_info) { - /* Unset flag before copying, so this does not loop indefinitely if the copy below calls - * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ - need_texture_info = false; - texture_info.copy_to_device(); - } -} - -void CUDADevice::move_textures_to_host(size_t size, bool for_texture) -{ - /* Break out of recursive call, which can happen when moving memory on a multi device. */ - static bool any_device_moving_textures_to_host = false; - if (any_device_moving_textures_to_host) { - return; - } - - /* Signal to reallocate textures in host memory only. */ - move_texture_to_host = true; - - while (size > 0) { - /* Find suitable memory allocation to move. 
*/ - device_memory *max_mem = NULL; - size_t max_size = 0; - bool max_is_image = false; - - thread_scoped_lock lock(cuda_mem_map_mutex); - foreach (CUDAMemMap::value_type &pair, cuda_mem_map) { - device_memory &mem = *pair.first; - CUDAMem *cmem = &pair.second; - - /* Can only move textures allocated on this device (and not those from peer devices). - * And need to ignore memory that is already on the host. */ - if (!mem.is_resident(this) || cmem->use_mapped_host) { - continue; - } - - bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && - (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - /* Can't move this type of memory. */ - if (!is_texture || cmem->array) { - continue; - } - - /* For other textures, only move image textures. */ - if (for_texture && !is_image) { - continue; - } - - /* Try to move largest allocation, prefer moving images. */ - if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { - max_is_image = is_image; - max_size = mem.device_size; - max_mem = &mem; - } - } - lock.unlock(); - - /* Move to host memory. This part is mutex protected since - * multiple CUDA devices could be moving the memory. The - * first one will do it, and the rest will adopt the pointer. */ - if (max_mem) { - VLOG(1) << "Move memory from device to host: " << max_mem->name; - - static thread_mutex move_mutex; - thread_scoped_lock lock(move_mutex); - - any_device_moving_textures_to_host = true; - - /* Potentially need to call back into multi device, so pointer mapping - * and peer devices are updated. This is also necessary since the device - * pointer may just be a key here, so cannot be accessed and freed directly. 
- * Unfortunately it does mean that memory is reallocated on all other - * devices as well, which is potentially dangerous when still in use (since - * a thread rendering on another devices would only be caught in this mutex - * if it so happens to do an allocation at the same time as well. */ - max_mem->device_copy_to(); - size = (max_size >= size) ? 0 : size - max_size; - - any_device_moving_textures_to_host = false; - } - else { - break; - } - } - - /* Unset flag before texture info is reloaded, since it should stay in device memory. */ - move_texture_to_host = false; - - /* Update texture info array with new pointers. */ - load_texture_info(); -} - -CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding) -{ - CUDAContextScope scope(this); - - CUdeviceptr device_pointer = 0; - size_t size = mem.memory_size() + pitch_padding; - - CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; - const char *status = ""; - - /* First try allocating in device memory, respecting headroom. We make - * an exception for texture info. It is small and frequently accessed, - * so treat it as working memory. - * - * If there is not enough room for working memory, we will try to move - * textures to host memory, assuming the performance impact would have - * been worse for working memory. */ - bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; - - size_t total = 0, free = 0; - cuMemGetInfo(&free, &total); - - /* Move textures to host memory if needed. */ - if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { - move_textures_to_host(size + headroom - free, is_texture); - cuMemGetInfo(&free, &total); - } - - /* Allocate in device memory. 
*/ - if (!move_texture_to_host && (size + headroom) < free) { - mem_alloc_result = cuMemAlloc(&device_pointer, size); - if (mem_alloc_result == CUDA_SUCCESS) { - status = " in device memory"; - } - } - - /* Fall back to mapped host memory if needed and possible. */ - - void *shared_pointer = 0; - - if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) { - if (mem.shared_pointer) { - /* Another device already allocated host memory. */ - mem_alloc_result = CUDA_SUCCESS; - shared_pointer = mem.shared_pointer; - } - else if (map_host_used + size < map_host_limit) { - /* Allocate host memory ourselves. */ - mem_alloc_result = cuMemHostAlloc( - &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); - - assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) || - (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0)); - } - - if (mem_alloc_result == CUDA_SUCCESS) { - cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0)); - map_host_used += size; - status = " in host memory"; - } - } - - if (mem_alloc_result != CUDA_SUCCESS) { - if (mem.type == MEM_DEVICE_ONLY) { - status = " failed, out of device memory"; - set_error("System is out of GPU memory"); - } - else { - status = " failed, out of device and host memory"; - set_error("System is out of GPU and shared host memory"); - } - } - - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")" << status; - } - - mem.device_pointer = (device_ptr)device_pointer; - mem.device_size = size; - stats.mem_alloc(size); - - if (!mem.device_pointer) { - return NULL; - } - - /* Insert into map of allocations. */ - thread_scoped_lock lock(cuda_mem_map_mutex); - CUDAMem *cmem = &cuda_mem_map[&mem]; - if (shared_pointer != 0) { - /* Replace host pointer with our host allocation. 
Only works if - * CUDA memory layout is the same and has no pitch padding. Also - * does not work if we move textures to host during a render, - * since other devices might be using the memory. */ - - if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && - mem.host_pointer != shared_pointer) { - memcpy(shared_pointer, mem.host_pointer, size); - - /* A Call to device_memory::host_free() should be preceded by - * a call to device_memory::device_free() for host memory - * allocated by a device to be handled properly. Two exceptions - * are here and a call in OptiXDevice::generic_alloc(), where - * the current host memory can be assumed to be allocated by - * device_memory::host_alloc(), not by a device */ - - mem.host_free(); - mem.host_pointer = shared_pointer; - } - mem.shared_pointer = shared_pointer; - mem.shared_counter++; - cmem->use_mapped_host = true; - } - else { - cmem->use_mapped_host = false; - } - - return cmem; -} - -void CUDADevice::generic_copy_to(device_memory &mem) -{ - if (!mem.host_pointer || !mem.device_pointer) { - return; - } - - /* If use_mapped_host of mem is false, the current device only uses device memory allocated by - * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from - * mem.host_pointer. */ - thread_scoped_lock lock(cuda_mem_map_mutex); - if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { - const CUDAContextScope scope(this); - cuda_assert( - cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size())); - } -} - -void CUDADevice::generic_free(device_memory &mem) -{ - if (mem.device_pointer) { - CUDAContextScope scope(this); - thread_scoped_lock lock(cuda_mem_map_mutex); - const CUDAMem &cmem = cuda_mem_map[&mem]; - - /* If cmem.use_mapped_host is true, reference counting is used - * to safely free a mapped host memory. 
*/ - - if (cmem.use_mapped_host) { - assert(mem.shared_pointer); - if (mem.shared_pointer) { - assert(mem.shared_counter > 0); - if (--mem.shared_counter == 0) { - if (mem.host_pointer == mem.shared_pointer) { - mem.host_pointer = 0; - } - cuMemFreeHost(mem.shared_pointer); - mem.shared_pointer = 0; - } - } - map_host_used -= mem.device_size; - } - else { - /* Free device memory. */ - cuda_assert(cuMemFree(mem.device_pointer)); - } - - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } -} - -void CUDADevice::mem_alloc(device_memory &mem) -{ - if (mem.type == MEM_PIXELS && !background) { - pixels_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - } - else if (mem.type == MEM_GLOBAL) { - assert(!"mem_alloc not supported for global memory."); - } - else { - generic_alloc(mem); - } -} - -void CUDADevice::mem_copy_to(device_memory &mem) -{ - if (mem.type == MEM_PIXELS) { - assert(!"mem_copy_to not supported for pixels."); - } - else if (mem.type == MEM_GLOBAL) { - global_free(mem); - global_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - tex_alloc((device_texture &)mem); - } - else { - if (!mem.device_pointer) { - generic_alloc(mem); - } - generic_copy_to(mem); - } -} - -void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) -{ - if (mem.type == MEM_PIXELS && !background) { - pixels_copy_from(mem, y, w, h); - } - else if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { - assert(!"mem_copy_from not supported for textures."); - } - else if (mem.host_pointer) { - const size_t size = elem * w * h; - const size_t offset = elem * y * w; - - if (mem.device_pointer) { - const CUDAContextScope scope(this); - cuda_assert(cuMemcpyDtoH( - (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); - } - else { - memset((char *)mem.host_pointer + 
offset, 0, size); - } - } -} - -void CUDADevice::mem_zero(device_memory &mem) -{ - if (!mem.device_pointer) { - mem_alloc(mem); - } - if (!mem.device_pointer) { - return; - } - - /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory - * regardless of mem.host_pointer and mem.shared_pointer. */ - thread_scoped_lock lock(cuda_mem_map_mutex); - if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { - const CUDAContextScope scope(this); - cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size())); - } - else if (mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } -} - -void CUDADevice::mem_free(device_memory &mem) -{ - if (mem.type == MEM_PIXELS && !background) { - pixels_free(mem); - } - else if (mem.type == MEM_GLOBAL) { - global_free(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - } - else { - generic_free(mem); - } -} - -device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) -{ - return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); -} - -void CUDADevice::const_copy_to(const char *name, void *host, size_t size) -{ - CUDAContextScope scope(this); - CUdeviceptr mem; - size_t bytes; - - cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); - // assert(bytes == size); - cuda_assert(cuMemcpyHtoD(mem, host, size)); -} - -void CUDADevice::global_alloc(device_memory &mem) -{ - if (mem.is_resident(this)) { - generic_alloc(mem); - generic_copy_to(mem); - } - - const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer)); -} - -void CUDADevice::global_free(device_memory &mem) -{ - if (mem.is_resident(this) && mem.device_pointer) { - generic_free(mem); - } -} - -void CUDADevice::tex_alloc(device_texture &mem) -{ - CUDAContextScope scope(this); - - /* General variables for both architectures */ - string bind_name = mem.name; - size_t dsize = 
datatype_size(mem.data_type); - size_t size = mem.memory_size(); - - CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; - switch (mem.info.extension) { - case EXTENSION_REPEAT: - address_mode = CU_TR_ADDRESS_MODE_WRAP; - break; - case EXTENSION_EXTEND: - address_mode = CU_TR_ADDRESS_MODE_CLAMP; - break; - case EXTENSION_CLIP: - address_mode = CU_TR_ADDRESS_MODE_BORDER; - break; - default: - assert(0); - break; - } - - CUfilter_mode filter_mode; - if (mem.info.interpolation == INTERPOLATION_CLOSEST) { - filter_mode = CU_TR_FILTER_MODE_POINT; - } - else { - filter_mode = CU_TR_FILTER_MODE_LINEAR; - } - - /* Image Texture Storage */ - CUarray_format_enum format; - switch (mem.data_type) { - case TYPE_UCHAR: - format = CU_AD_FORMAT_UNSIGNED_INT8; - break; - case TYPE_UINT16: - format = CU_AD_FORMAT_UNSIGNED_INT16; - break; - case TYPE_UINT: - format = CU_AD_FORMAT_UNSIGNED_INT32; - break; - case TYPE_INT: - format = CU_AD_FORMAT_SIGNED_INT32; - break; - case TYPE_FLOAT: - format = CU_AD_FORMAT_FLOAT; - break; - case TYPE_HALF: - format = CU_AD_FORMAT_HALF; - break; - default: - assert(0); - return; - } - - CUDAMem *cmem = NULL; - CUarray array_3d = NULL; - size_t src_pitch = mem.data_width * dsize * mem.data_elements; - size_t dst_pitch = src_pitch; - - if (!mem.is_resident(this)) { - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; - cmem->texobject = 0; - - if (mem.data_depth > 1) { - array_3d = (CUarray)mem.device_pointer; - cmem->array = array_3d; - } - else if (mem.data_height > 0) { - dst_pitch = align_up(src_pitch, pitch_alignment); - } - } - else if (mem.data_depth > 1) { - /* 3D texture using array, there is no API for linear memory. 
*/ - CUDA_ARRAY3D_DESCRIPTOR desc; - - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Depth = mem.data_depth; - desc.Format = format; - desc.NumChannels = mem.data_elements; - desc.Flags = 0; - - VLOG(1) << "Array 3D allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - cuda_assert(cuArray3DCreate(&array_3d, &desc)); - - if (!array_3d) { - return; - } - - CUDA_MEMCPY3D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = array_3d; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - param.Depth = mem.data_depth; - - cuda_assert(cuMemcpy3D(¶m)); - - mem.device_pointer = (device_ptr)array_3d; - mem.device_size = size; - stats.mem_alloc(size); - - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; - cmem->texobject = 0; - cmem->array = array_3d; - } - else if (mem.data_height > 0) { - /* 2D texture, using pitch aligned linear memory. */ - dst_pitch = align_up(src_pitch, pitch_alignment); - size_t dst_size = dst_pitch * mem.data_height; - - cmem = generic_alloc(mem, dst_size - mem.memory_size()); - if (!cmem) { - return; - } - - CUDA_MEMCPY2D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_DEVICE; - param.dstDevice = mem.device_pointer; - param.dstPitch = dst_pitch; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - - cuda_assert(cuMemcpy2DUnaligned(¶m)); - } - else { - /* 1D texture, using linear memory. 
*/ - cmem = generic_alloc(mem); - if (!cmem) { - return; - } - - cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); - } - - /* Resize once */ - const uint slot = mem.slot; - if (slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount - * of re-allocations. */ - texture_info.resize(slot + 128); - } - - /* Set Mapping and tag that we need to (re-)upload to device */ - texture_info[slot] = mem.info; - need_texture_info = true; - - if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && - mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { - /* Kepler+, bindless textures. */ - CUDA_RESOURCE_DESC resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - - if (array_3d) { - resDesc.resType = CU_RESOURCE_TYPE_ARRAY; - resDesc.res.array.hArray = array_3d; - resDesc.flags = 0; - } - else if (mem.data_height > 0) { - resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; - resDesc.res.pitch2D.devPtr = mem.device_pointer; - resDesc.res.pitch2D.format = format; - resDesc.res.pitch2D.numChannels = mem.data_elements; - resDesc.res.pitch2D.height = mem.data_height; - resDesc.res.pitch2D.width = mem.data_width; - resDesc.res.pitch2D.pitchInBytes = dst_pitch; - } - else { - resDesc.resType = CU_RESOURCE_TYPE_LINEAR; - resDesc.res.linear.devPtr = mem.device_pointer; - resDesc.res.linear.format = format; - resDesc.res.linear.numChannels = mem.data_elements; - resDesc.res.linear.sizeInBytes = mem.device_size; - } - - CUDA_TEXTURE_DESC texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.addressMode[0] = address_mode; - texDesc.addressMode[1] = address_mode; - texDesc.addressMode[2] = address_mode; - texDesc.filterMode = filter_mode; - texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; - - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; - - cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); - - texture_info[slot].data = (uint64_t)cmem->texobject; - } - else { - texture_info[slot].data = 
(uint64_t)mem.device_pointer; - } -} - -void CUDADevice::tex_free(device_texture &mem) -{ - if (mem.device_pointer) { - CUDAContextScope scope(this); - thread_scoped_lock lock(cuda_mem_map_mutex); - const CUDAMem &cmem = cuda_mem_map[&mem]; - - if (cmem.texobject) { - /* Free bindless texture. */ - cuTexObjectDestroy(cmem.texobject); - } - - if (!mem.is_resident(this)) { - /* Do not free memory here, since it was allocated on a different device. */ - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - else if (cmem.array) { - /* Free array. */ - cuArrayDestroy(cmem.array); - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - else { - lock.unlock(); - generic_free(mem); - } - } -} - -# define CUDA_GET_BLOCKSIZE(func, w, h) \ - int threads_per_block; \ - cuda_assert( \ - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int threads = (int)sqrt((float)threads_per_block); \ - int xblocks = ((w) + threads - 1) / threads; \ - int yblocks = ((h) + threads - 1) / threads; - -# define CUDA_LAUNCH_KERNEL(func, args) \ - cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); - -/* Similar as above, but for 1-dimensional blocks. 
*/ -# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ - int threads_per_block; \ - cuda_assert( \ - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \ - int yblocks = h; - -# define CUDA_LAUNCH_KERNEL_1D(func, args) \ - cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0)); - -bool CUDADevice::denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - int channel_offset = task->nlm_state.is_color ? 
task->buffer.pass_stride : 0; - int frame_offset = 0; - - if (have_error()) - return false; - - CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer; - CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; - CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts; - CUdeviceptr scale_ptr = 0; - - cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride)); - cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride)); - - { - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; - cuda_assert(cuModuleGetFunction( - &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction( - &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction( - &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts); - - void *calc_difference_args[] = {&guide_ptr, - &variance_ptr, - &scale_ptr, - &difference, - &w, - &h, - &stride, - &pass_stride, - &r, - &channel_offset, - &frame_offset, - &a, - &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = { - &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *update_output_args[] = {&blurDifference, - &image_ptr, - &out_ptr, - &weightAccum, - &w, - &h, - &stride, - &pass_stride, - &channel_offset, - &r, - &f}; - - 
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); - } - - { - CUfunction cuNLMNormalize; - cuda_assert( - cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); - cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); - void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; - CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); - CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); - cuda_assert(cuCtxSynchronize()); - } - - return !have_error(); -} - -bool CUDADevice::denoising_construct_transform(DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterConstructTransform; - cuda_assert(cuModuleGetFunction( - &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); - cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); - CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h); - - void *args[] = {&task->buffer.mem.device_pointer, - &task->tile_info_mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->filter_area, - &task->rect, - &task->radius, - &task->pca_threshold, - &task->buffer.pass_stride, - &task->buffer.frame_stride, - &task->buffer.use_time}; - CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - int r = task->radius; - int f = 4; - float a = 1.0f; - float k_2 = 
task->nlm_k_2; - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - - if (have_error()) - return false; - - CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer; - CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; - - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; - cuda_assert(cuModuleGetFunction( - &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert( - cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction( - &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, - task->reconstruction_state.source_w * task->reconstruction_state.source_h, - num_shifts); - - void *calc_difference_args[] = {&color_ptr, - &color_variance_ptr, - &scale_ptr, - &difference, - &w, - &h, - &stride, - &pass_stride, - &r, - &pass_stride, - &frame_offset, - &a, - &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *construct_gramian_args[] = {&t, - &blurDifference, - 
&task->buffer.mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->reconstruction_state.filter_window, - &w, - &h, - &stride, - &pass_stride, - &r, - &f, - &frame_offset, - &task->buffer.use_time}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task) -{ - CUfunction cuFinalize; - cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); - cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); - void *finalize_args[] = {&output_ptr, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->filter_area, - &task->reconstruction_state.buffer_params.x, - &task->render_buffer.samples}; - CUDA_GET_BLOCKSIZE( - cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h); - CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterCombineHalves; - cuda_assert(cuModuleGetFunction( - &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); - cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterCombineHalves, task->rect.z 
- task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r}; - CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDivideShadow; - cuda_assert(cuModuleGetFunction( - &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &a_ptr, - &b_ptr, - &sample_variance_ptr, - &sv_variance_ptr, - &buffer_variance_ptr, - &task->rect, - &task->render_buffer.pass_stride, - &task->render_buffer.offset}; - CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterGetFeature; - cuda_assert( - cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &mean_offset, - &variance_offset, - &mean_ptr, - &variance_ptr, - &scale, - &task->rect, - 
&task->render_buffer.pass_stride, - &task->render_buffer.offset}; - CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterWriteFeature; - cuda_assert(cuModuleGetFunction( - &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w); - - void *args[] = {&task->render_buffer.samples, - &task->reconstruction_state.buffer_params, - &task->filter_area, - &from_ptr, - &buffer_ptr, - &out_offset, - &task->rect}; - CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDetectOutliers; - cuda_assert(cuModuleGetFunction( - &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = { - &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride}; - - CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising) -{ - denoising.functions.construct_transform = function_bind( - &CUDADevice::denoising_construct_transform, 
this, &denoising); - denoising.functions.accumulate = function_bind( - &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind( - &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind( - &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind( - &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind( - &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind( - &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind( - &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(rtile); -} - -void CUDADevice::adaptive_sampling_filter(uint filter_sample, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream) -{ - const int num_threads_per_block = functions.adaptive_num_threads_per_block; - - /* These are a series of tiny kernels because there is no grid synchronization - * from within a kernel, so multiple kernel launches it is. 
*/ - uint total_work_size = wtile->h * wtile->w; - void *args2[] = {&d_wtile, &filter_sample, &total_work_size}; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_stopping, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args2, - 0)); - total_work_size = wtile->h; - num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_filter_x, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args2, - 0)); - total_work_size = wtile->w; - num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_filter_y, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args2, - 0)); -} - -void CUDADevice::adaptive_sampling_post(RenderTile &rtile, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream) -{ - const int num_threads_per_block = functions.adaptive_num_threads_per_block; - uint total_work_size = wtile->h * wtile->w; - - void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size}; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args, - 0)); -} - -void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles) -{ - scoped_timer timer(&rtile.buffers->render_time); - - if (have_error()) - return; - - CUDAContextScope scope(this); - CUfunction cuRender; - - /* Get kernel function. 
*/ - if (rtile.task == RenderTile::BAKE) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); - } - else if (task.integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); - } - - if (have_error()) { - return; - } - - cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); - - /* Allocate work tile. */ - work_tiles.alloc(1); - - WorkTile *wtile = work_tiles.data(); - wtile->x = rtile.x; - wtile->y = rtile.y; - wtile->w = rtile.w; - wtile->h = rtile.h; - wtile->offset = rtile.offset; - wtile->stride = rtile.stride; - wtile->buffer = (float *)(CUdeviceptr)rtile.buffer; - - /* Prepare work size. More step samples render faster, but for now we - * remain conservative for GPUs connected to a display to avoid driver - * timeouts and display freezing. */ - int min_blocks, num_threads_per_block; - cuda_assert( - cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); - if (!info.display_device) { - min_blocks *= 8; - } - - uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); - - /* Render all samples. */ - int start_sample = rtile.start_sample; - int end_sample = rtile.start_sample + rtile.num_samples; - - for (int sample = start_sample; sample < end_sample;) { - /* Setup and copy work tile to device. 
*/ - wtile->start_sample = sample; - wtile->num_samples = step_samples; - if (task.adaptive_sampling.use) { - wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); - } - wtile->num_samples = min(wtile->num_samples, end_sample - sample); - work_tiles.copy_to_device(); - - CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; - uint total_work_size = wtile->w * wtile->h * wtile->num_samples; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - - /* Launch kernel. */ - void *args[] = {&d_work_tiles, &total_work_size}; - - cuda_assert( - cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); - - /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ - uint filter_sample = sample + wtile->num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); - } - - cuda_assert(cuCtxSynchronize()); - - /* Update progress. */ - sample += wtile->num_samples; - rtile.sample = sample; - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - - /* Finalize adaptive sampling. */ - if (task.adaptive_sampling.use) { - CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; - adaptive_sampling_post(rtile, wtile, d_work_tiles); - cuda_assert(cuCtxSynchronize()); - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - } -} - -void CUDADevice::film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half) -{ - if (have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuFilmConvert; - CUdeviceptr d_rgba = map_pixels((rgba_byte) ? 
rgba_byte : rgba_half); - CUdeviceptr d_buffer = (CUdeviceptr)buffer; - - /* get kernel function */ - if (rgba_half) { - cuda_assert( - cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float")); - } - else { - cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte")); - } - - float sample_scale = 1.0f / (task.sample + 1); - - /* pass in parameters */ - void *args[] = {&d_rgba, - &d_buffer, - &sample_scale, - &task.x, - &task.y, - &task.w, - &task.h, - &task.offset, - &task.stride}; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute( - &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert)); - - int xthreads = (int)sqrt(threads_per_block); - int ythreads = (int)sqrt(threads_per_block); - int xblocks = (task.w + xthreads - 1) / xthreads; - int yblocks = (task.h + ythreads - 1) / ythreads; - - cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(cuFilmConvert, - xblocks, - yblocks, - 1, /* blocks */ - xthreads, - ythreads, - 1, /* threads */ - 0, - 0, - args, - 0)); - - unmap_pixels((rgba_byte) ? 
rgba_byte : rgba_half); - - cuda_assert(cuCtxSynchronize()); -} - -void CUDADevice::shader(DeviceTask &task) -{ - if (have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuShader; - CUdeviceptr d_input = (CUdeviceptr)task.shader_input; - CUdeviceptr d_output = (CUdeviceptr)task.shader_output; - - /* get kernel function */ - if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); - } - - /* do tasks in smaller chunks, so we can cancel it */ - const int shader_chunk_size = 65536; - const int start = task.shader_x; - const int end = task.shader_x + task.shader_w; - int offset = task.offset; - - bool canceled = false; - for (int sample = 0; sample < task.num_samples && !canceled; sample++) { - for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) { - int shader_w = min(shader_chunk_size, end - shader_x); - - /* pass in parameters */ - void *args[8]; - int arg = 0; - args[arg++] = &d_input; - args[arg++] = &d_output; - args[arg++] = &task.shader_eval_type; - if (task.shader_eval_type >= SHADER_EVAL_BAKE) { - args[arg++] = &task.shader_filter; - } - args[arg++] = &shader_x; - args[arg++] = &shader_w; - args[arg++] = &offset; - args[arg++] = &sample; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute( - &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); - - int xblocks = (shader_w + threads_per_block - 1) / threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuLaunchKernel(cuShader, - xblocks, - 1, - 1, /* blocks */ - threads_per_block, - 1, - 1, /* threads */ - 0, - 0, - args, - 0)); - - cuda_assert(cuCtxSynchronize()); - - if (task.get_cancel()) { - canceled = true; - break; - } - } - - task.update_progress(NULL); - } -} - -CUdeviceptr 
CUDADevice::map_pixels(device_ptr mem) -{ - if (!background) { - PixelMem pmem = pixel_mem_map[mem]; - CUdeviceptr buffer; - - size_t bytes; - cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0)); - cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource)); - - return buffer; - } - - return (CUdeviceptr)mem; -} - -void CUDADevice::unmap_pixels(device_ptr mem) -{ - if (!background) { - PixelMem pmem = pixel_mem_map[mem]; - - cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0)); - } -} - -void CUDADevice::pixels_alloc(device_memory &mem) -{ - PixelMem pmem; - - pmem.w = mem.data_width; - pmem.h = mem.data_height; - - CUDAContextScope scope(this); - - glGenBuffers(1, &pmem.cuPBO); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - if (mem.data_type == TYPE_HALF) - glBufferData( - GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW); - else - glBufferData( - GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - glActiveTexture(GL_TEXTURE0); - glGenTextures(1, &pmem.cuTexId); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if (mem.data_type == TYPE_HALF) - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); - else - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_2D, 0); - - CUresult result = cuGraphicsGLRegisterBuffer( - &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); - - if (result == CUDA_SUCCESS) { - mem.device_pointer = pmem.cuTexId; - pixel_mem_map[mem.device_pointer] = pmem; - - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - - return; - } - else { - /* failed to register buffer, fallback to no interop 
*/ - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - background = true; - } -} - -void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h) -{ - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); - size_t offset = sizeof(uchar) * 4 * y * w; - memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h); - glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); -} - -void CUDADevice::pixels_free(device_memory &mem) -{ - if (mem.device_pointer) { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } -} - -void CUDADevice::draw_pixels(device_memory &mem, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) -{ - assert(mem.type == MEM_PIXELS); - - if (!background) { - const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - float *vpointer; - - CUDAContextScope scope(this); - - /* for multi devices, this assumes the inefficient method that we allocate - * all pixels on the device even though we only render to a subset */ - size_t offset = 4 * y * w; - - if (mem.data_type == TYPE_HALF) - offset *= sizeof(GLhalf); - else - offset *= sizeof(uint8_t); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if (mem.data_type == TYPE_HALF) 
{ - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset); - } - else { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset); - } - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - if (transparent) { - glEnable(GL_BLEND); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } - - GLint shader_program; - if (use_fallback_shader) { - if (!bind_fallback_display_space_shader(dw, dh)) { - return; - } - shader_program = fallback_shader_program; - } - else { - draw_params.bind_display_space_shader_cb(); - glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); - } - - if (!vertex_buffer) { - glGenBuffers(1, &vertex_buffer); - } - - glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); - /* invalidate old contents - - * avoids stalling if buffer is still waiting in queue to be rendered */ - glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); - - vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - - if (vpointer) { - /* texture coordinate - vertex pair */ - vpointer[0] = 0.0f; - vpointer[1] = 0.0f; - vpointer[2] = dx; - vpointer[3] = dy; - - vpointer[4] = (float)w / (float)pmem.w; - vpointer[5] = 0.0f; - vpointer[6] = (float)width + dx; - vpointer[7] = dy; - - vpointer[8] = (float)w / (float)pmem.w; - vpointer[9] = (float)h / (float)pmem.h; - vpointer[10] = (float)width + dx; - vpointer[11] = (float)height + dy; - - vpointer[12] = 0.0f; - vpointer[13] = (float)h / (float)pmem.h; - vpointer[14] = dx; - vpointer[15] = (float)height + dy; - - glUnmapBuffer(GL_ARRAY_BUFFER); - } - - GLuint vertex_array_object; - GLuint position_attribute, texcoord_attribute; - - glGenVertexArrays(1, &vertex_array_object); - glBindVertexArray(vertex_array_object); - - texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); - position_attribute = glGetAttribLocation(shader_program, "pos"); - - glEnableVertexAttribArray(texcoord_attribute); - glEnableVertexAttribArray(position_attribute); - - 
glVertexAttribPointer( - texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); - glVertexAttribPointer(position_attribute, - 2, - GL_FLOAT, - GL_FALSE, - 4 * sizeof(float), - (const GLvoid *)(sizeof(float) * 2)); - - glDrawArrays(GL_TRIANGLE_FAN, 0, 4); - - if (use_fallback_shader) { - glUseProgram(0); - } - else { - draw_params.unbind_display_space_shader_cb(); - } - - if (transparent) { - glDisable(GL_BLEND); - } - - glBindTexture(GL_TEXTURE_2D, 0); - - return; - } - - Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params); -} - -void CUDADevice::thread_run(DeviceTask &task) -{ - CUDAContextScope scope(this); - - if (task.type == DeviceTask::RENDER) { - DeviceRequestedFeatures requested_features; - if (use_split_kernel()) { - if (split_kernel == NULL) { - split_kernel = new CUDASplitKernel(this); - split_kernel->load_kernels(requested_features); - } - } - - device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - - /* keep rendering tiles until done */ - RenderTile tile; - DenoisingTask denoising(this, task); - - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - if (use_split_kernel()) { - device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(task, tile, void_buffer, void_buffer); - } - else { - render(task, tile, work_tiles); - } - } - else if (tile.task == RenderTile::BAKE) { - render(task, tile, work_tiles); - } - else if (tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - - denoise(tile, denoising); - - task.update_progress(&tile, tile.w * tile.h); - } - - task.release_tile(tile); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - - work_tiles.free(); - } - else if (task.type == DeviceTask::SHADER) { - shader(task); - - cuda_assert(cuCtxSynchronize()); - } - else if (task.type == DeviceTask::DENOISE_BUFFER) { - 
RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.sample = task.sample + task.num_samples; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - DenoisingTask denoising(this, task); - denoise(tile, denoising); - task.update_progress(&tile, tile.w * tile.h); - } -} - -void CUDADevice::task_add(DeviceTask &task) -{ - CUDAContextScope scope(this); - - /* Load texture info. */ - load_texture_info(); - - /* Synchronize all memory copies before executing task. */ - cuda_assert(cuCtxSynchronize()); - - if (task.type == DeviceTask::FILM_CONVERT) { - /* must be done in main thread due to opengl access */ - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - } - else { - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy); - }); - } -} - -void CUDADevice::task_wait() -{ - task_pool.wait(); -} - -void CUDADevice::task_cancel() -{ - task_pool.cancel(); -} - -/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class - * now that the definition of that class is complete - */ -# undef cuda_assert -# define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ - if (result != CUDA_SUCCESS) { \ - const char *name = cuewErrorString(result); \ - device->set_error( \ - string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \ - } \ - } \ - (void)0 - -/* CUDA context scope. 
*/ - -CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device) -{ - cuda_assert(cuCtxPushCurrent(device->cuContext)); -} - -CUDAContextScope::~CUDAContextScope() -{ - cuda_assert(cuCtxPopCurrent(NULL)); -} - -/* split kernel */ - -class CUDASplitKernelFunction : public SplitKernelFunction { - CUDADevice *device; - CUfunction func; - - public: - CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) - { - } - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/) - { - return enqueue(dim, NULL); - } - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, void *args[]) - { - if (device->have_error()) - return false; - - CUDAContextScope scope(device); - - /* we ignore dim.local_size for now, as this is faster */ - int threads_per_block; - cuda_assert( - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); - - int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) / - threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(func, - xblocks, - 1, - 1, /* blocks */ - threads_per_block, - 1, - 1, /* threads */ - 0, - 0, - args, - 0)); - - return !device->have_error(); - } -}; - -CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device) -{ -} - -uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/, - device_memory & /*data*/, - size_t num_threads) -{ - CUDAContextScope scope(device); - - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); - - uint threads = num_threads; - CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer; - - struct args_t { - uint *num_threads; - CUdeviceptr *size; - }; - - args_t args = 
{&threads, &d_size}; - - CUfunction state_buffer_size; - cuda_assert( - cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); - - cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0)); - - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); - - return size; -} - -bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory & /*kernel_globals*/, - device_memory & /*kernel_data*/, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs) -{ - CUDAContextScope scope(device); - - CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer; - CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer; - CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer; - CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer; - CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer; - - CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer; - - int end_sample = rtile.start_sample + rtile.num_samples; - int queue_size = dim.global_size[0] * dim.global_size[1]; - - struct args_t { - CUdeviceptr *split_data_buffer; - int *num_elements; - CUdeviceptr *ray_state; - int *start_sample; - int *end_sample; - int *sx; - int *sy; - int *sw; - int *sh; - int *offset; - int *stride; - CUdeviceptr *queue_index; - int *queuesize; - CUdeviceptr *use_queues_flag; - CUdeviceptr *work_pool_wgs; - int *num_samples; - CUdeviceptr *buffer; - }; - - args_t args = {&d_split_data, - &num_global_elements, - &d_ray_state, - &rtile.start_sample, - &end_sample, - &rtile.x, - &rtile.y, - &rtile.w, - &rtile.h, - &rtile.offset, - &rtile.stride, - &d_queue_index, - &queue_size, - &d_use_queues_flag, - &d_work_pool_wgs, - &rtile.num_samples, - &d_buffer}; - - 
CUfunction data_init; - cuda_assert( - cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); - if (device->have_error()) { - return false; - } - - CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args); - - return !device->have_error(); -} - -SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &) -{ - const CUDAContextScope scope(device); - - CUfunction func; - const CUresult result = cuModuleGetFunction( - &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()); - if (result != CUDA_SUCCESS) { - device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)", - kernel_name.data(), - cuewErrorString(result))); - return NULL; - } - - return new CUDASplitKernelFunction(device, func); -} - -int2 CUDASplitKernel::split_kernel_local_size() -{ - return make_int2(32, 1); -} - -int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg, - device_memory &data, - DeviceTask & /*task*/) -{ - CUDAContextScope scope(device); - size_t free; - size_t total; - - cuda_assert(cuMemGetInfo(&free, &total)); - - VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free) - << " bytes. 
(" << string_human_readable_size(free) << ")."; - - size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); - size_t side = round_down((int)sqrt(num_elements), 32); - int2 global_size = make_int2(side, round_down(num_elements / side, 16)); - VLOG(1) << "Global size: " << global_size << "."; - return global_size; -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp new file mode 100644 index 00000000000..37fab8f8293 --- /dev/null +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -0,0 +1,1370 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include <climits> +# include <limits.h> +# include <stdio.h> +# include <stdlib.h> +# include <string.h> + +# include "device/cuda/device_impl.h" + +# include "render/buffers.h" + +# include "util/util_debug.h" +# include "util/util_foreach.h" +# include "util/util_logging.h" +# include "util/util_map.h" +# include "util/util_md5.h" +# include "util/util_opengl.h" +# include "util/util_path.h" +# include "util/util_string.h" +# include "util/util_system.h" +# include "util/util_time.h" +# include "util/util_types.h" +# include "util/util_windows.h" + +CCL_NAMESPACE_BEGIN + +class CUDADevice; + +bool CUDADevice::have_precompiled_kernels() +{ + string cubins_path = path_get("lib"); + return path_exists(cubins_path); +} + +bool CUDADevice::show_samples() const +{ + /* The CUDADevice only processes one tile at a time, so showing samples is fine. */ + return true; +} + +BVHLayoutMask CUDADevice::get_bvh_layout_mask() const +{ + return BVH_LAYOUT_BVH2; +} + +void CUDADevice::set_error(const string &error) +{ + Device::set_error(error); + + if (first_error) { + fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); + fprintf(stderr, + "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); + first_error = false; + } +} + +CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + first_error = true; + + cuDevId = info.num; + cuDevice = 0; + cuContext = 0; + + cuModule = 0; + + need_texture_info = false; + + device_texture_headroom = 0; + device_working_headroom = 0; + move_texture_to_host = false; + map_host_limit = 0; + map_host_used = 0; + can_map_host = 0; + pitch_alignment = 0; + + /* Initialize CUDA. 
*/ + CUresult result = cuInit(0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result))); + return; + } + + /* Setup device and context. */ + result = cuDeviceGet(&cuDevice, cuDevId); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)", + cuewErrorString(result))); + return; + } + + /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. + * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, + * so we can predict which memory to map to host. */ + cuda_assert( + cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); + + cuda_assert(cuDeviceGetAttribute( + &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); + + unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; + if (can_map_host) { + ctx_flags |= CU_CTX_MAP_HOST; + init_host_memory(); + } + + /* Create context. */ + result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); + + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result))); + return; + } + + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + cuDevArchitecture = major * 100 + minor * 10; + + /* Pop context set by cuCtxCreate. 
*/ + cuCtxPopCurrent(NULL); +} + +CUDADevice::~CUDADevice() +{ + texture_info.free(); + + cuda_assert(cuCtxDestroy(cuContext)); +} + +bool CUDADevice::support_device(const uint /*kernel_features*/) +{ + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* We only support sm_30 and above */ + if (major < 3) { + set_error(string_printf( + "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor)); + return false; + } + + return true; +} + +bool CUDADevice::check_peer_access(Device *peer_device) +{ + if (peer_device == this) { + return false; + } + if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) { + return false; + } + + CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device); + + int can_access = 0; + cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice)); + if (can_access == 0) { + return false; + } + + // Ensure array access over the link is possible as well (for 3D textures) + cuda_assert(cuDeviceGetP2PAttribute(&can_access, + CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED, + cuDevice, + peer_device_cuda->cuDevice)); + if (can_access == 0) { + return false; + } + + // Enable peer access in both directions + { + const CUDAContextScope scope(this); + CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to enable peer access on CUDA context (%s)", + cuewErrorString(result))); + return false; + } + } + { + const CUDAContextScope scope(peer_device_cuda); + CUresult result = cuCtxEnablePeerAccess(cuContext, 0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to enable peer access on CUDA context (%s)", + cuewErrorString(result))); + return false; + } + } + + return true; +} + +bool 
CUDADevice::use_adaptive_compilation() +{ + return DebugFlags().cuda.adaptive_compile; +} + +/* Common NVCC flags which stays the same regardless of shading model, + * kernel sources md5 and only depends on compiler or compilation settings. + */ +string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features) +{ + const int machine = system_cpu_bits(); + const string source_path = path_get("source"); + const string include_path = source_path; + string cflags = string_printf( + "-m%d " + "--ptxas-options=\"-v\" " + "--use_fast_math " + "-DNVCC " + "-I\"%s\"", + machine, + include_path.c_str()); + if (use_adaptive_compilation()) { + cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + } + const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); + if (extra_cflags) { + cflags += string(" ") + string(extra_cflags); + } + +# ifdef WITH_NANOVDB + cflags += " -DWITH_NANOVDB"; +# endif + + return cflags; +} + +string CUDADevice::compile_kernel(const uint kernel_features, + const char *name, + const char *base, + bool force_ptx) +{ + /* Compute kernel name. */ + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* Attempt to use kernel provided with Blender. */ + if (!use_adaptive_compilation()) { + if (!force_ptx) { + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using precompiled kernel."; + return cubin; + } + } + + /* The driver can JIT-compile PTX generated for older generations, so find the closest one. 
*/ + int ptx_major = major, ptx_minor = minor; + while (ptx_major >= 3) { + const string ptx = path_get( + string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); + VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + if (path_exists(ptx)) { + VLOG(1) << "Using precompiled kernel."; + return ptx; + } + + if (ptx_minor > 0) { + ptx_minor--; + } + else { + ptx_major--; + ptx_minor = 9; + } + } + } + + /* Try to use locally compiled kernel. */ + string source_path = path_get("source"); + const string source_md5 = path_files_md5_hash(source_path); + + /* We include cflags into md5 so changing cuda toolkit or changing other + * compiler command line arguments makes sure cubin gets re-built. + */ + string common_cflags = compile_kernel_get_common_cflags(kernel_features); + const string kernel_md5 = util_md5_string(source_md5 + common_cflags); + + const char *const kernel_ext = force_ptx ? "ptx" : "cubin"; + const char *const kernel_arch = force_ptx ? "compute" : "sm"; + const string cubin_file = string_printf( + "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext); + const string cubin = path_cache_get(path_join("kernels", cubin_file)); + VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using locally compiled kernel."; + return cubin; + } + +# ifdef _WIN32 + if (!use_adaptive_compilation() && have_precompiled_kernels()) { + if (major < 3) { + set_error( + string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. " + "Your GPU is not supported.", + major, + minor)); + } + else { + set_error( + string_printf("CUDA binary kernel for this graphics card compute " + "capability (%d.%d) not found.", + major, + minor)); + } + return string(); + } +# endif + + /* Compile. */ + const char *const nvcc = cuewCompilerPath(); + if (nvcc == NULL) { + set_error( + "CUDA nvcc compiler not found. 
" + "Install CUDA toolkit in default location."); + return string(); + } + + const int nvcc_cuda_version = cuewCompilerVersion(); + VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << "."; + if (nvcc_cuda_version < 101) { + printf( + "Unsupported CUDA version %d.%d detected, " + "you need CUDA 10.1 or newer.\n", + nvcc_cuda_version / 10, + nvcc_cuda_version % 10); + return string(); + } + else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 || + nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) { + printf( + "CUDA version %d.%d detected, build may succeed but only " + "CUDA 10.1 to 11.4 are officially supported.\n", + nvcc_cuda_version / 10, + nvcc_cuda_version % 10); + } + + double starttime = time_dt(); + + path_create_directories(cubin); + + source_path = path_join(path_join(source_path, "kernel"), + path_join("device", path_join(base, string_printf("%s.cu", name)))); + + string command = string_printf( + "\"%s\" " + "-arch=%s_%d%d " + "--%s \"%s\" " + "-o \"%s\" " + "%s", + nvcc, + kernel_arch, + major, + minor, + kernel_ext, + source_path.c_str(), + cubin.c_str(), + common_cflags.c_str()); + + printf("Compiling CUDA kernel ...\n%s\n", command.c_str()); + +# ifdef _WIN32 + command = "call " + command; +# endif + if (system(command.c_str()) != 0) { + set_error( + "Failed to execute compilation command, " + "see console for details."); + return string(); + } + + /* Verify if compilation succeeded */ + if (!path_exists(cubin)) { + set_error( + "CUDA kernel compilation failed, " + "see console for details."); + return string(); + } + + printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); + + return cubin; +} + +bool CUDADevice::load_kernels(const uint kernel_features) +{ + /* TODO(sergey): Support kernels re-load for CUDA devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in cuCtxSynchronize. 
+ */ + if (cuModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + + /* check if cuda init succeeded */ + if (cuContext == 0) + return false; + + /* check if GPU is supported */ + if (!support_device(kernel_features)) + return false; + + /* get kernel */ + const char *kernel_name = "kernel"; + string cubin = compile_kernel(kernel_features, kernel_name); + if (cubin.empty()) + return false; + + /* open module */ + CUDAContextScope scope(this); + + string cubin_data; + CUresult result; + + if (path_read_text(cubin, cubin_data)) + result = cuModuleLoadData(&cuModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if (result != CUDA_SUCCESS) + set_error(string_printf( + "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result))); + + if (result == CUDA_SUCCESS) { + kernels.load(this); + reserve_local_memory(kernel_features); + } + + return (result == CUDA_SUCCESS); +} + +void CUDADevice::reserve_local_memory(const uint /* kernel_features */) +{ + /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory + * needed for kernel launches, so that we can reliably figure out when + * to allocate scene data in mapped host memory. */ + size_t total = 0, free_before = 0, free_after = 0; + + { + CUDAContextScope scope(this); + cuMemGetInfo(&free_before, &total); + } + + { + /* Use the biggest kernel for estimation. */ + const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE; + + /* Launch kernel, using just 1 block appears sufficient to reserve memory for all + * multiprocessors. It would be good to do this in parallel for the multi GPU case + * still to make it faster. 
*/ + CUDADeviceQueue queue(this); + + void *d_path_index = nullptr; + void *d_render_buffer = nullptr; + int d_work_size = 0; + void *args[] = {&d_path_index, &d_render_buffer, &d_work_size}; + + queue.init_execution(); + queue.enqueue(test_kernel, 1, args); + queue.synchronize(); + } + + { + CUDAContextScope scope(this); + cuMemGetInfo(&free_after, &total); + } + + VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) + << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; + +# if 0 + /* For testing mapped host memory, fill up device memory. */ + const size_t keep_mb = 1024; + + while (free_after > keep_mb * 1024 * 1024LL) { + CUdeviceptr tmp; + cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); + cuMemGetInfo(&free_after, &total); + } +# endif +} + +void CUDADevice::init_host_memory() +{ + /* Limit amount of host mapped memory, because allocating too much can + * cause system instability. Leave at least half or 4 GB of system + * memory free, whichever is smaller. */ + size_t default_limit = 4 * 1024 * 1024 * 1024LL; + size_t system_ram = system_physical_ram(); + + if (system_ram > 0) { + if (system_ram / 2 > default_limit) { + map_host_limit = system_ram - default_limit; + } + else { + map_host_limit = system_ram / 2; + } + } + else { + VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + map_host_limit = 0; + } + + /* Amount of device memory to keep is free after texture memory + * and working memory allocations respectively. We set the working + * memory limit headroom lower so that some space is left after all + * texture memory allocations. */ + device_working_headroom = 32 * 1024 * 1024LL; // 32MB + device_texture_headroom = 128 * 1024 * 1024LL; // 128MB + + VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) + << " bytes. 
(" << string_human_readable_size(map_host_limit) << ")"; +} + +void CUDADevice::load_texture_info() +{ + if (need_texture_info) { + /* Unset flag before copying, so this does not loop indefinitely if the copy below calls + * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ + need_texture_info = false; + texture_info.copy_to_device(); + } +} + +void CUDADevice::move_textures_to_host(size_t size, bool for_texture) +{ + /* Break out of recursive call, which can happen when moving memory on a multi device. */ + static bool any_device_moving_textures_to_host = false; + if (any_device_moving_textures_to_host) { + return; + } + + /* Signal to reallocate textures in host memory only. */ + move_texture_to_host = true; + + while (size > 0) { + /* Find suitable memory allocation to move. */ + device_memory *max_mem = NULL; + size_t max_size = 0; + bool max_is_image = false; + + thread_scoped_lock lock(cuda_mem_map_mutex); + foreach (CUDAMemMap::value_type &pair, cuda_mem_map) { + device_memory &mem = *pair.first; + CUDAMem *cmem = &pair.second; + + /* Can only move textures allocated on this device (and not those from peer devices). + * And need to ignore memory that is already on the host. */ + if (!mem.is_resident(this) || cmem->use_mapped_host) { + continue; + } + + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && + (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + /* Can't move this type of memory. */ + if (!is_texture || cmem->array) { + continue; + } + + /* For other textures, only move image textures. */ + if (for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + lock.unlock(); + + /* Move to host memory. 
This part is mutex protected since + * multiple CUDA devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. */ + if (max_mem) { + VLOG(1) << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + any_device_moving_textures_to_host = true; + + /* Potentially need to call back into multi device, so pointer mapping + * and peer devices are updated. This is also necessary since the device + * pointer may just be a key here, so cannot be accessed and freed directly. + * Unfortunately it does mean that memory is reallocated on all other + * devices as well, which is potentially dangerous when still in use (since + * a thread rendering on another devices would only be caught in this mutex + * if it so happens to do an allocation at the same time as well. */ + max_mem->device_copy_to(); + size = (max_size >= size) ? 0 : size - max_size; + + any_device_moving_textures_to_host = false; + } + else { + break; + } + } + + /* Unset flag before texture info is reloaded, since it should stay in device memory. */ + move_texture_to_host = false; + + /* Update texture info array with new pointers. */ + load_texture_info(); +} + +CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding) +{ + CUDAContextScope scope(this); + + CUdeviceptr device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; + + CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; + const char *status = ""; + + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. 
*/ + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; + + size_t total = 0, free = 0; + cuMemGetInfo(&free, &total); + + /* Move textures to host memory if needed. */ + if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { + move_textures_to_host(size + headroom - free, is_texture); + cuMemGetInfo(&free, &total); + } + + /* Allocate in device memory. */ + if (!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = cuMemAlloc(&device_pointer, size); + if (mem_alloc_result == CUDA_SUCCESS) { + status = " in device memory"; + } + } + + /* Fall back to mapped host memory if needed and possible. */ + + void *shared_pointer = 0; + + if (mem_alloc_result != CUDA_SUCCESS && can_map_host) { + if (mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = CUDA_SUCCESS; + shared_pointer = mem.shared_pointer; + } + else if (map_host_used + size < map_host_limit) { + /* Allocate host memory ourselves. */ + mem_alloc_result = cuMemHostAlloc( + &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); + + assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) || + (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0)); + } + + if (mem_alloc_result == CUDA_SUCCESS) { + cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0)); + map_host_used += size; + status = " in host memory"; + } + } + + if (mem_alloc_result != CUDA_SUCCESS) { + status = " failed, out of device and host memory"; + set_error("System is out of GPU and shared host memory"); + } + + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")" << status; + } + + mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; + stats.mem_alloc(size); + + if (!mem.device_pointer) { + return NULL; + } + + /* Insert into map of allocations. */ + thread_scoped_lock lock(cuda_mem_map_mutex); + CUDAMem *cmem = &cuda_mem_map[&mem]; + if (shared_pointer != 0) { + /* Replace host pointer with our host allocation. Only works if + * CUDA memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. */ + + if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && + mem.host_pointer != shared_pointer) { + memcpy(shared_pointer, mem.host_pointer, size); + + /* A Call to device_memory::host_free() should be preceded by + * a call to device_memory::device_free() for host memory + * allocated by a device to be handled properly. Two exceptions + * are here and a call in OptiXDevice::generic_alloc(), where + * the current host memory can be assumed to be allocated by + * device_memory::host_alloc(), not by a device */ + + mem.host_free(); + mem.host_pointer = shared_pointer; + } + mem.shared_pointer = shared_pointer; + mem.shared_counter++; + cmem->use_mapped_host = true; + } + else { + cmem->use_mapped_host = false; + } + + return cmem; +} + +void CUDADevice::generic_copy_to(device_memory &mem) +{ + if (!mem.host_pointer || !mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, the current device only uses device memory allocated by + * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from + * mem.host_pointer. 
*/ + thread_scoped_lock lock(cuda_mem_map_mutex); + if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const CUDAContextScope scope(this); + cuda_assert( + cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size())); + } +} + +void CUDADevice::generic_free(device_memory &mem) +{ + if (mem.device_pointer) { + CUDAContextScope scope(this); + thread_scoped_lock lock(cuda_mem_map_mutex); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + /* If cmem.use_mapped_host is true, reference counting is used + * to safely free a mapped host memory. */ + + if (cmem.use_mapped_host) { + assert(mem.shared_pointer); + if (mem.shared_pointer) { + assert(mem.shared_counter > 0); + if (--mem.shared_counter == 0) { + if (mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + cuMemFreeHost(mem.shared_pointer); + mem.shared_pointer = 0; + } + } + map_host_used -= mem.device_size; + } + else { + /* Free device memory. */ + cuda_assert(cuMemFree(mem.device_pointer)); + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } +} + +void CUDADevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + generic_alloc(mem); + } +} + +void CUDADevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) { + generic_alloc(mem); + } + generic_copy_to(mem); + } +} + +void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) +{ + if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { + assert(!"mem_copy_from not supported for textures."); + 
} + else if (mem.host_pointer) { + const size_t size = elem * w * h; + const size_t offset = elem * y * w; + + if (mem.device_pointer) { + const CUDAContextScope scope(this); + cuda_assert(cuMemcpyDtoH( + (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); + } + else { + memset((char *)mem.host_pointer + offset, 0, size); + } + } +} + +void CUDADevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + if (!mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory + * regardless of mem.host_pointer and mem.shared_pointer. */ + thread_scoped_lock lock(cuda_mem_map_mutex); + if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const CUDAContextScope scope(this); + cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size())); + } + else if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } +} + +void CUDADevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else { + generic_free(mem); + } +} + +device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) +{ + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); +} + +void CUDADevice::const_copy_to(const char *name, void *host, size_t size) +{ + CUDAContextScope scope(this); + CUdeviceptr mem; + size_t bytes; + + cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); + // assert(bytes == size); + cuda_assert(cuMemcpyHtoD(mem, host, size)); +} + +void CUDADevice::global_alloc(device_memory &mem) +{ + if (mem.is_resident(this)) { + generic_alloc(mem); + generic_copy_to(mem); + } + + const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer)); +} + +void CUDADevice::global_free(device_memory &mem) +{ + if 
(mem.is_resident(this) && mem.device_pointer) { + generic_free(mem); + } +} + +void CUDADevice::tex_alloc(device_texture &mem) +{ + CUDAContextScope scope(this); + + /* General variables for both architectures */ + string bind_name = mem.name; + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; + switch (mem.info.extension) { + case EXTENSION_REPEAT: + address_mode = CU_TR_ADDRESS_MODE_WRAP; + break; + case EXTENSION_EXTEND: + address_mode = CU_TR_ADDRESS_MODE_CLAMP; + break; + case EXTENSION_CLIP: + address_mode = CU_TR_ADDRESS_MODE_BORDER; + break; + default: + assert(0); + break; + } + + CUfilter_mode filter_mode; + if (mem.info.interpolation == INTERPOLATION_CLOSEST) { + filter_mode = CU_TR_FILTER_MODE_POINT; + } + else { + filter_mode = CU_TR_FILTER_MODE_LINEAR; + } + + /* Image Texture Storage */ + CUarray_format_enum format; + switch (mem.data_type) { + case TYPE_UCHAR: + format = CU_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = CU_AD_FORMAT_UNSIGNED_INT16; + break; + case TYPE_UINT: + format = CU_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = CU_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = CU_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = CU_AD_FORMAT_HALF; + break; + default: + assert(0); + return; + } + + CUDAMem *cmem = NULL; + CUarray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; + + if (!mem.is_resident(this)) { + thread_scoped_lock lock(cuda_mem_map_mutex); + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + + if (mem.data_depth > 1) { + array_3d = (CUarray)mem.device_pointer; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + dst_pitch = align_up(src_pitch, pitch_alignment); + } + } + else if (mem.data_depth > 1) { + /* 3D texture using array, there is no API for linear memory. 
*/ + CUDA_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + VLOG(1) << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + cuda_assert(cuArray3DCreate(&array_3d, &desc)); + + if (!array_3d) { + return; + } + + CUDA_MEMCPY3D param; + memset(&param, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = array_3d; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + cuda_assert(cuMemcpy3D(&param)); + + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; + stats.mem_alloc(size); + + thread_scoped_lock lock(cuda_mem_map_mutex); + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + /* 2D texture, using pitch aligned linear memory. */ + dst_pitch = align_up(src_pitch, pitch_alignment); + size_t dst_size = dst_pitch * mem.data_height; + + cmem = generic_alloc(mem, dst_size - mem.memory_size()); + if (!cmem) { + return; + } + + CUDA_MEMCPY2D param; + memset(&param, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_DEVICE; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + cuda_assert(cuMemcpy2DUnaligned(&param)); + } + else { + /* 1D texture, using linear memory. 
*/ + cmem = generic_alloc(mem); + if (!cmem) { + return; + } + + cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); + } + + /* Resize once */ + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(slot + 128); + } + + /* Set Mapping and tag that we need to (re-)upload to device */ + texture_info[slot] = mem.info; + need_texture_info = true; + + if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && + mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + /* Kepler+, bindless textures. */ + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + if (array_3d) { + resDesc.resType = CU_RESOURCE_TYPE_ARRAY; + resDesc.res.array.hArray = array_3d; + resDesc.flags = 0; + } + else if (mem.data_height > 0) { + resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = CU_RESOURCE_TYPE_LINEAR; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; + } + + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + + thread_scoped_lock lock(cuda_mem_map_mutex); + cmem = &cuda_mem_map[&mem]; + + cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); + + texture_info[slot].data = (uint64_t)cmem->texobject; + } + else { + texture_info[slot].data = 
(uint64_t)mem.device_pointer; + } +} + +void CUDADevice::tex_free(device_texture &mem) +{ + if (mem.device_pointer) { + CUDAContextScope scope(this); + thread_scoped_lock lock(cuda_mem_map_mutex); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + if (cmem.texobject) { + /* Free bindless texture. */ + cuTexObjectDestroy(cmem.texobject); + } + + if (!mem.is_resident(this)) { + /* Do not free memory here, since it was allocated on a different device. */ + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + else if (cmem.array) { + /* Free array. */ + cuArrayDestroy(cmem.array); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + else { + lock.unlock(); + generic_free(mem); + } + } +} + +# if 0 +void CUDADevice::render(DeviceTask &task, + RenderTile &rtile, + device_vector<KernelWorkTile> &work_tiles) +{ + scoped_timer timer(&rtile.buffers->render_time); + + if (have_error()) + return; + + CUDAContextScope scope(this); + CUfunction cuRender; + + /* Get kernel function. */ + if (rtile.task == RenderTile::BAKE) { + cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); + } + else { + cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); + } + + if (have_error()) { + return; + } + + cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); + + /* Allocate work tile. */ + work_tiles.alloc(1); + + KernelWorkTile *wtile = work_tiles.data(); + wtile->x = rtile.x; + wtile->y = rtile.y; + wtile->w = rtile.w; + wtile->h = rtile.h; + wtile->offset = rtile.offset; + wtile->stride = rtile.stride; + wtile->buffer = (float *)(CUdeviceptr)rtile.buffer; + + /* Prepare work size. More step samples render faster, but for now we + * remain conservative for GPUs connected to a display to avoid driver + * timeouts and display freezing. 
*/ + int min_blocks, num_threads_per_block; + cuda_assert( + cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); + if (!info.display_device) { + min_blocks *= 8; + } + + uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); + + /* Render all samples. */ + uint start_sample = rtile.start_sample; + uint end_sample = rtile.start_sample + rtile.num_samples; + + for (int sample = start_sample; sample < end_sample;) { + /* Setup and copy work tile to device. */ + wtile->start_sample = sample; + wtile->num_samples = step_samples; + if (task.adaptive_sampling.use) { + wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); + } + wtile->num_samples = min(wtile->num_samples, end_sample - sample); + work_tiles.copy_to_device(); + + CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; + uint total_work_size = wtile->w * wtile->h * wtile->num_samples; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + + /* Launch kernel. */ + void *args[] = {&d_work_tiles, &total_work_size}; + + cuda_assert( + cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + + /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ + uint filter_sample = sample + wtile->num_samples - 1; + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { + adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); + } + + cuda_assert(cuCtxSynchronize()); + + /* Update progress. */ + sample += wtile->num_samples; + rtile.sample = sample; + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + /* Finalize adaptive sampling. 
*/ + if (task.adaptive_sampling.use) { + CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; + adaptive_sampling_post(rtile, wtile, d_work_tiles); + cuda_assert(cuCtxSynchronize()); + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + } +} + +void CUDADevice::thread_run(DeviceTask &task) +{ + CUDAContextScope scope(this); + + if (task.type == DeviceTask::RENDER) { + device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); + + /* keep rendering tiles until done */ + RenderTile tile; + DenoisingTask denoising(this, task); + + while (task.acquire_tile(this, tile, task.tile_types)) { + if (tile.task == RenderTile::PATH_TRACE) { + render(task, tile, work_tiles); + } + else if (tile.task == RenderTile::BAKE) { + render(task, tile, work_tiles); + } + + task.release_tile(tile); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + work_tiles.free(); + } +} +# endif + +unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create() +{ + return make_unique<CUDADeviceQueue>(this); +} + +bool CUDADevice::should_use_graphics_interop() +{ + /* Check whether this device is part of OpenGL context. + * + * Using CUDA device for graphics interoperability which is not part of the OpenGL context is + * possible, but from the empiric measurements it can be considerably slower than using naive + * pixels copy. 
*/ + + CUDAContextScope scope(this); + + int num_all_devices = 0; + cuda_assert(cuDeviceGetCount(&num_all_devices)); + + if (num_all_devices == 0) { + return false; + } + + vector<CUdevice> gl_devices(num_all_devices); + uint num_gl_devices; + cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL); + + for (CUdevice gl_device : gl_devices) { + if (gl_device == cuDevice) { + return true; + } + } + + return false; +} + +int CUDADevice::get_num_multiprocessors() +{ + return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0); +} + +int CUDADevice::get_max_num_threads_per_multiprocessor() +{ + return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0); +} + +bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value) +{ + CUDAContextScope scope(this); + + return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS; +} + +int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int default_value) +{ + int value = 0; + if (!get_device_attribute(attribute, &value)) { + return default_value; + } + return value; +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h new file mode 100644 index 00000000000..6b27db54ab4 --- /dev/null +++ b/intern/cycles/device/cuda/device_impl.h @@ -0,0 +1,155 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_CUDA + +# include "device/cuda/kernel.h" +# include "device/cuda/queue.h" +# include "device/cuda/util.h" +# include "device/device.h" + +# include "util/util_map.h" + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include "util/util_opengl.h" +# include <cuda.h> +# include <cudaGL.h> +# endif + +CCL_NAMESPACE_BEGIN + +class DeviceQueue; + +class CUDADevice : public Device { + + friend class CUDAContextScope; + + public: + CUdevice cuDevice; + CUcontext cuContext; + CUmodule cuModule; + size_t device_texture_headroom; + size_t device_working_headroom; + bool move_texture_to_host; + size_t map_host_used; + size_t map_host_limit; + int can_map_host; + int pitch_alignment; + int cuDevId; + int cuDevArchitecture; + bool first_error; + + struct CUDAMem { + CUDAMem() : texobject(0), array(0), use_mapped_host(false) + { + } + + CUtexObject texobject; + CUarray array; + + /* If true, a mapped host memory in shared_pointer is being used. 
*/ + bool use_mapped_host; + }; + typedef map<device_memory *, CUDAMem> CUDAMemMap; + CUDAMemMap cuda_mem_map; + thread_mutex cuda_mem_map_mutex; + + /* Bindless Textures */ + device_vector<TextureInfo> texture_info; + bool need_texture_info; + + CUDADeviceKernels kernels; + + static bool have_precompiled_kernels(); + + virtual bool show_samples() const override; + + virtual BVHLayoutMask get_bvh_layout_mask() const override; + + void set_error(const string &error) override; + + CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); + + virtual ~CUDADevice(); + + bool support_device(const uint /*kernel_features*/); + + bool check_peer_access(Device *peer_device) override; + + bool use_adaptive_compilation(); + + virtual string compile_kernel_get_common_cflags(const uint kernel_features); + + string compile_kernel(const uint kernel_features, + const char *name, + const char *base = "cuda", + bool force_ptx = false); + + virtual bool load_kernels(const uint kernel_features) override; + + void reserve_local_memory(const uint kernel_features); + + void init_host_memory(); + + void load_texture_info(); + + void move_textures_to_host(size_t size, bool for_texture); + + CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0); + + void generic_copy_to(device_memory &mem); + + void generic_free(device_memory &mem); + + void mem_alloc(device_memory &mem) override; + + void mem_copy_to(device_memory &mem) override; + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; + + void mem_zero(device_memory &mem) override; + + void mem_free(device_memory &mem) override; + + device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + void global_alloc(device_memory &mem); + + void global_free(device_memory &mem); + + void tex_alloc(device_texture &mem); + + void tex_free(device_texture &mem); + + virtual bool 
should_use_graphics_interop() override; + + virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + + int get_num_multiprocessors(); + int get_max_num_threads_per_multiprocessor(); + + protected: + bool get_device_attribute(CUdevice_attribute attribute, int *value); + int get_device_default_attribute(CUdevice_attribute attribute, int default_value); +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/graphics_interop.cpp b/intern/cycles/device/cuda/graphics_interop.cpp new file mode 100644 index 00000000000..e8ca8b90eae --- /dev/null +++ b/intern/cycles/device/cuda/graphics_interop.cpp @@ -0,0 +1,102 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include "device/cuda/graphics_interop.h" + +# include "device/cuda/device_impl.h" +# include "device/cuda/util.h" + +CCL_NAMESPACE_BEGIN + +CUDADeviceGraphicsInterop::CUDADeviceGraphicsInterop(CUDADeviceQueue *queue) + : queue_(queue), device_(static_cast<CUDADevice *>(queue->device)) +{ +} + +CUDADeviceGraphicsInterop::~CUDADeviceGraphicsInterop() +{ + CUDAContextScope scope(device_); + + if (cu_graphics_resource_) { + cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_)); + } +} + +void CUDADeviceGraphicsInterop::set_destination( + const DeviceGraphicsInteropDestination &destination) +{ + const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height; + + need_clear_ = destination.need_clear; + + if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) { + return; + } + + CUDAContextScope scope(device_); + + if (cu_graphics_resource_) { + cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_)); + } + + const CUresult result = cuGraphicsGLRegisterBuffer( + &cu_graphics_resource_, destination.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); + if (result != CUDA_SUCCESS) { + LOG(ERROR) << "Error registering OpenGL buffer: " << cuewErrorString(result); + } + + opengl_pbo_id_ = destination.opengl_pbo_id; + buffer_area_ = new_buffer_area; +} + +device_ptr CUDADeviceGraphicsInterop::map() +{ + if (!cu_graphics_resource_) { + return 0; + } + + CUDAContextScope scope(device_); + + CUdeviceptr cu_buffer; + size_t bytes; + + cuda_device_assert(device_, cuGraphicsMapResources(1, &cu_graphics_resource_, queue_->stream())); + cuda_device_assert( + device_, cuGraphicsResourceGetMappedPointer(&cu_buffer, &bytes, cu_graphics_resource_)); + + if (need_clear_) { + cuda_device_assert( + device_, cuMemsetD8Async(static_cast<CUdeviceptr>(cu_buffer), 0, bytes, queue_->stream())); + + need_clear_ = false; + } + + return 
static_cast<device_ptr>(cu_buffer); +} + +void CUDADeviceGraphicsInterop::unmap() +{ + CUDAContextScope scope(device_); + + cuda_device_assert(device_, + cuGraphicsUnmapResources(1, &cu_graphics_resource_, queue_->stream())); +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/graphics_interop.h b/intern/cycles/device/cuda/graphics_interop.h new file mode 100644 index 00000000000..8a70c8aa71d --- /dev/null +++ b/intern/cycles/device/cuda/graphics_interop.h @@ -0,0 +1,66 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include "device/device_graphics_interop.h" + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include <cuda.h> +# endif + +CCL_NAMESPACE_BEGIN + +class CUDADevice; +class CUDADeviceQueue; + +class CUDADeviceGraphicsInterop : public DeviceGraphicsInterop { + public: + explicit CUDADeviceGraphicsInterop(CUDADeviceQueue *queue); + + CUDADeviceGraphicsInterop(const CUDADeviceGraphicsInterop &other) = delete; + CUDADeviceGraphicsInterop(CUDADeviceGraphicsInterop &&other) noexcept = delete; + + ~CUDADeviceGraphicsInterop(); + + CUDADeviceGraphicsInterop &operator=(const CUDADeviceGraphicsInterop &other) = delete; + CUDADeviceGraphicsInterop &operator=(CUDADeviceGraphicsInterop &&other) = delete; + + virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override; + + virtual device_ptr map() override; + virtual void unmap() override; + + protected: + CUDADeviceQueue *queue_ = nullptr; + CUDADevice *device_ = nullptr; + + /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */ + uint opengl_pbo_id_ = 0; + /* Buffer area in pixels of the corresponding PBO. */ + int64_t buffer_area_ = 0; + + /* The destination was requested to be cleared. */ + bool need_clear_ = false; + + CUgraphicsResource cu_graphics_resource_ = nullptr; +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/kernel.cpp b/intern/cycles/device/cuda/kernel.cpp new file mode 100644 index 00000000000..a4a7bfabce0 --- /dev/null +++ b/intern/cycles/device/cuda/kernel.cpp @@ -0,0 +1,69 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_CUDA + +# include "device/cuda/kernel.h" +# include "device/cuda/device_impl.h" + +CCL_NAMESPACE_BEGIN + +void CUDADeviceKernels::load(CUDADevice *device) +{ + CUmodule cuModule = device->cuModule; + + for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) { + CUDADeviceKernel &kernel = kernels_[i]; + + /* No mega-kernel used for GPU. */ + if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + continue; + } + + const std::string function_name = std::string("kernel_gpu_") + + device_kernel_as_string((DeviceKernel)i); + cuda_device_assert(device, + cuModuleGetFunction(&kernel.function, cuModule, function_name.c_str())); + + if (kernel.function) { + cuda_device_assert(device, cuFuncSetCacheConfig(kernel.function, CU_FUNC_CACHE_PREFER_L1)); + + cuda_device_assert( + device, + cuOccupancyMaxPotentialBlockSize( + &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, NULL, 0, 0)); + } + else { + LOG(ERROR) << "Unable to load kernel " << function_name; + } + } + + loaded = true; +} + +const CUDADeviceKernel &CUDADeviceKernels::get(DeviceKernel kernel) const +{ + return kernels_[(int)kernel]; +} + +bool CUDADeviceKernels::available(DeviceKernel kernel) const +{ + return kernels_[(int)kernel].function != nullptr; +} + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA*/ diff --git a/intern/cycles/device/cuda/kernel.h b/intern/cycles/device/cuda/kernel.h new file mode 100644 index 00000000000..b489547a350 --- /dev/null +++ b/intern/cycles/device/cuda/kernel.h @@ -0,0 +1,56 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under 
the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_CUDA + +# include "device/device_kernel.h" + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include <cuda.h> +# endif + +CCL_NAMESPACE_BEGIN + +class CUDADevice; + +/* CUDA kernel and associate occupancy information. */ +class CUDADeviceKernel { + public: + CUfunction function = nullptr; + + int num_threads_per_block = 0; + int min_blocks = 0; +}; + +/* Cache of CUDA kernels for each DeviceKernel. */ +class CUDADeviceKernels { + public: + void load(CUDADevice *device); + const CUDADeviceKernel &get(DeviceKernel kernel) const; + bool available(DeviceKernel kernel) const; + + protected: + CUDADeviceKernel kernels_[DEVICE_KERNEL_NUM]; + bool loaded = false; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp new file mode 100644 index 00000000000..b7f86c10553 --- /dev/null +++ b/intern/cycles/device/cuda/queue.cpp @@ -0,0 +1,220 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_CUDA + +# include "device/cuda/queue.h" + +# include "device/cuda/device_impl.h" +# include "device/cuda/graphics_interop.h" +# include "device/cuda/kernel.h" + +CCL_NAMESPACE_BEGIN + +/* CUDADeviceQueue */ + +CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device) + : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr) +{ + const CUDAContextScope scope(cuda_device_); + cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING)); +} + +CUDADeviceQueue::~CUDADeviceQueue() +{ + const CUDAContextScope scope(cuda_device_); + cuStreamDestroy(cuda_stream_); +} + +int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const +{ + int num_states = max(cuda_device_->get_num_multiprocessors() * + cuda_device_->get_max_num_threads_per_multiprocessor() * 16, + 1048576); + + const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR"); + if (factor_str) { + num_states = max((int)(num_states * atof(factor_str)), 1024); + } + + VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to " + << string_human_readable_size(num_states * state_size); + + return num_states; +} + +int CUDADeviceQueue::num_concurrent_busy_states() const +{ + const int max_num_threads = cuda_device_->get_num_multiprocessors() * + cuda_device_->get_max_num_threads_per_multiprocessor(); + + if (max_num_threads == 0) { + return 65536; + } + + return 4 * max_num_threads; +} + +void CUDADeviceQueue::init_execution() +{ + /* Synchronize all textures and memory copies before executing task. 
*/ + CUDAContextScope scope(cuda_device_); + cuda_device_->load_texture_info(); + cuda_device_assert(cuda_device_, cuCtxSynchronize()); + + debug_init_execution(); +} + +bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const +{ + return cuda_device_->kernels.available(kernel); +} + +bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[]) +{ + if (cuda_device_->have_error()) { + return false; + } + + debug_enqueue(kernel, work_size); + + const CUDAContextScope scope(cuda_device_); + const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel); + + /* Compute kernel launch parameters. */ + const int num_threads_per_block = cuda_kernel.num_threads_per_block; + const int num_blocks = divide_up(work_size, num_threads_per_block); + + int shared_mem_bytes = 0; + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + /* See parall_active_index.h for why this amount of shared memory is needed. */ + shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int); + break; + + default: + break; + } + + /* Launch kernel. 
*/ + cuda_device_assert(cuda_device_, + cuLaunchKernel(cuda_kernel.function, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + shared_mem_bytes, + cuda_stream_, + args, + 0)); + + return !(cuda_device_->have_error()); +} + +bool CUDADeviceQueue::synchronize() +{ + if (cuda_device_->have_error()) { + return false; + } + + const CUDAContextScope scope(cuda_device_); + cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_)); + debug_synchronize(); + + return !(cuda_device_->have_error()); +} + +void CUDADeviceQueue::zero_to_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + /* Allocate on demand. */ + if (mem.device_pointer == 0) { + cuda_device_->mem_alloc(mem); + } + + /* Zero memory on device. */ + assert(mem.device_pointer != 0); + + const CUDAContextScope scope(cuda_device_); + cuda_device_assert( + cuda_device_, + cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_)); +} + +void CUDADeviceQueue::copy_to_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + /* Allocate on demand. */ + if (mem.device_pointer == 0) { + cuda_device_->mem_alloc(mem); + } + + assert(mem.device_pointer != 0); + assert(mem.host_pointer != nullptr); + + /* Copy memory to device. */ + const CUDAContextScope scope(cuda_device_); + cuda_device_assert( + cuda_device_, + cuMemcpyHtoDAsync( + (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_)); +} + +void CUDADeviceQueue::copy_from_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + assert(mem.device_pointer != 0); + assert(mem.host_pointer != nullptr); + + /* Copy memory from device. 
*/ + const CUDAContextScope scope(cuda_device_); + cuda_device_assert( + cuda_device_, + cuMemcpyDtoHAsync( + mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_)); +} + +unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create() +{ + return make_unique<CUDADeviceGraphicsInterop>(this); +} + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h new file mode 100644 index 00000000000..62e3aa3d6c2 --- /dev/null +++ b/intern/cycles/device/cuda/queue.h @@ -0,0 +1,67 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_CUDA + +# include "device/device_kernel.h" +# include "device/device_memory.h" +# include "device/device_queue.h" + +# include "device/cuda/util.h" + +CCL_NAMESPACE_BEGIN + +class CUDADevice; +class device_memory; + +/* Base class for CUDA queues. 
*/ +class CUDADeviceQueue : public DeviceQueue { + public: + CUDADeviceQueue(CUDADevice *device); + ~CUDADeviceQueue(); + + virtual int num_concurrent_states(const size_t state_size) const override; + virtual int num_concurrent_busy_states() const override; + + virtual void init_execution() override; + + virtual bool kernel_available(DeviceKernel kernel) const override; + + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override; + + virtual bool synchronize() override; + + virtual void zero_to_device(device_memory &mem) override; + virtual void copy_to_device(device_memory &mem) override; + virtual void copy_from_device(device_memory &mem) override; + + virtual CUstream stream() + { + return cuda_stream_; + } + + virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override; + + protected: + CUDADevice *cuda_device_; + CUstream cuda_stream_; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/util.cpp b/intern/cycles/device/cuda/util.cpp new file mode 100644 index 00000000000..8f657cc10fe --- /dev/null +++ b/intern/cycles/device/cuda/util.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include "device/cuda/util.h" +# include "device/cuda/device_impl.h" + +CCL_NAMESPACE_BEGIN + +CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device) +{ + cuda_device_assert(device, cuCtxPushCurrent(device->cuContext)); +} + +CUDAContextScope::~CUDAContextScope() +{ + cuda_device_assert(device, cuCtxPopCurrent(NULL)); +} + +# ifndef WITH_CUDA_DYNLOAD +const char *cuewErrorString(CUresult result) +{ + /* We can only give error code here without major code duplication, that + * should be enough since dynamic loading is only being disabled by folks + * who knows what they're doing anyway. + * + * NOTE: Avoid call from several threads. + */ + static string error; + error = string_printf("%d", result); + return error.c_str(); +} + +const char *cuewCompilerPath() +{ + return CYCLES_CUDA_NVCC_EXECUTABLE; +} + +int cuewCompilerVersion() +{ + return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); +} +# endif + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/util.h b/intern/cycles/device/cuda/util.h new file mode 100644 index 00000000000..a0898094c08 --- /dev/null +++ b/intern/cycles/device/cuda/util.h @@ -0,0 +1,65 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifdef WITH_CUDA + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include <cuda.h> +# endif + +CCL_NAMESPACE_BEGIN + +class CUDADevice; + +/* Utility to push/pop CUDA context. */ +class CUDAContextScope { + public: + CUDAContextScope(CUDADevice *device); + ~CUDAContextScope(); + + private: + CUDADevice *device; +}; + +/* Utility for checking return values of CUDA function calls. */ +# define cuda_device_assert(cuda_device, stmt) \ + { \ + CUresult result = stmt; \ + if (result != CUDA_SUCCESS) { \ + const char *name = cuewErrorString(result); \ + cuda_device->set_error( \ + string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \ + } \ + } \ + (void)0 + +# define cuda_assert(stmt) cuda_device_assert(this, stmt) + +# ifndef WITH_CUDA_DYNLOAD +/* Transparently implement some functions, so majority of the file does not need + * to worry about difference between dynamically loaded and linked CUDA at all. */ +const char *cuewErrorString(CUresult result); +const char *cuewCompilerPath(); +int cuewCompilerVersion(); +# endif /* WITH_CUDA_DYNLOAD */ + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index ed53fbb54ae..6ccedcf54ef 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -20,7 +20,13 @@ #include "bvh/bvh2.h" #include "device/device.h" -#include "device/device_intern.h" +#include "device/device_queue.h" + +#include "device/cpu/device.h" +#include "device/cuda/device.h" +#include "device/dummy/device.h" +#include "device/multi/device.h" +#include "device/optix/device.h" #include "util/util_foreach.h" #include "util/util_half.h" @@ -38,332 +44,15 @@ CCL_NAMESPACE_BEGIN bool Device::need_types_update = true; bool Device::need_devices_update = true; thread_mutex Device::device_mutex; -vector<DeviceInfo> Device::opencl_devices; vector<DeviceInfo> Device::cuda_devices; vector<DeviceInfo> Device::optix_devices; 
vector<DeviceInfo> Device::cpu_devices; -vector<DeviceInfo> Device::network_devices; uint Device::devices_initialized_mask = 0; -/* Device Requested Features */ - -std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features) -{ - os << "Experimental features: " << (requested_features.experimental ? "On" : "Off") << std::endl; - os << "Max nodes group: " << requested_features.max_nodes_group << std::endl; - /* TODO(sergey): Decode bitflag into list of names. */ - os << "Nodes features: " << requested_features.nodes_features << std::endl; - os << "Use Hair: " << string_from_bool(requested_features.use_hair) << std::endl; - os << "Use Object Motion: " << string_from_bool(requested_features.use_object_motion) - << std::endl; - os << "Use Camera Motion: " << string_from_bool(requested_features.use_camera_motion) - << std::endl; - os << "Use Baking: " << string_from_bool(requested_features.use_baking) << std::endl; - os << "Use Subsurface: " << string_from_bool(requested_features.use_subsurface) << std::endl; - os << "Use Volume: " << string_from_bool(requested_features.use_volume) << std::endl; - os << "Use Branched Integrator: " << string_from_bool(requested_features.use_integrator_branched) - << std::endl; - os << "Use Patch Evaluation: " << string_from_bool(requested_features.use_patch_evaluation) - << std::endl; - os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent) - << std::endl; - os << "Use Principled BSDF: " << string_from_bool(requested_features.use_principled) - << std::endl; - os << "Use Denoising: " << string_from_bool(requested_features.use_denoising) << std::endl; - os << "Use Displacement: " << string_from_bool(requested_features.use_true_displacement) - << std::endl; - os << "Use Background Light: " << string_from_bool(requested_features.use_background_light) - << std::endl; - return os; -} - /* Device */ Device::~Device() noexcept(false) { - if (!background) { - if (vertex_buffer != 
0) { - glDeleteBuffers(1, &vertex_buffer); - } - if (fallback_shader_program != 0) { - glDeleteProgram(fallback_shader_program); - } - } -} - -/* TODO move shaders to standalone .glsl file. */ -const char *FALLBACK_VERTEX_SHADER = - "#version 330\n" - "uniform vec2 fullscreen;\n" - "in vec2 texCoord;\n" - "in vec2 pos;\n" - "out vec2 texCoord_interp;\n" - "\n" - "vec2 normalize_coordinates()\n" - "{\n" - " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n" - "}\n" - "\n" - "void main()\n" - "{\n" - " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n" - " texCoord_interp = texCoord;\n" - "}\n\0"; - -const char *FALLBACK_FRAGMENT_SHADER = - "#version 330\n" - "uniform sampler2D image_texture;\n" - "in vec2 texCoord_interp;\n" - "out vec4 fragColor;\n" - "\n" - "void main()\n" - "{\n" - " fragColor = texture(image_texture, texCoord_interp);\n" - "}\n\0"; - -static void shader_print_errors(const char *task, const char *log, const char *code) -{ - LOG(ERROR) << "Shader: " << task << " error:"; - LOG(ERROR) << "===== shader string ===="; - - stringstream stream(code); - string partial; - - int line = 1; - while (getline(stream, partial, '\n')) { - if (line < 10) { - LOG(ERROR) << " " << line << " " << partial; - } - else { - LOG(ERROR) << line << " " << partial; - } - line++; - } - LOG(ERROR) << log; -} - -static int bind_fallback_shader(void) -{ - GLint status; - GLchar log[5000]; - GLsizei length = 0; - GLuint program = 0; - - struct Shader { - const char *source; - GLenum type; - } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER}, - {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}}; - - program = glCreateProgram(); - - for (int i = 0; i < 2; i++) { - GLuint shader = glCreateShader(shaders[i].type); - - string source_str = shaders[i].source; - const char *c_str = source_str.c_str(); - - glShaderSource(shader, 1, &c_str, NULL); - glCompileShader(shader); - - glGetShaderiv(shader, GL_COMPILE_STATUS, &status); - - if (!status) { - 
glGetShaderInfoLog(shader, sizeof(log), &length, log); - shader_print_errors("compile", log, c_str); - return 0; - } - - glAttachShader(program, shader); - } - - /* Link output. */ - glBindFragDataLocation(program, 0, "fragColor"); - - /* Link and error check. */ - glLinkProgram(program); - - glGetProgramiv(program, GL_LINK_STATUS, &status); - if (!status) { - glGetShaderInfoLog(program, sizeof(log), &length, log); - shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER); - shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER); - return 0; - } - - return program; -} - -bool Device::bind_fallback_display_space_shader(const float width, const float height) -{ - if (fallback_status == FALLBACK_SHADER_STATUS_ERROR) { - return false; - } - - if (fallback_status == FALLBACK_SHADER_STATUS_NONE) { - fallback_shader_program = bind_fallback_shader(); - fallback_status = FALLBACK_SHADER_STATUS_ERROR; - - if (fallback_shader_program == 0) { - return false; - } - - glUseProgram(fallback_shader_program); - image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture"); - if (image_texture_location < 0) { - LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform."; - return false; - } - - fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen"); - if (fullscreen_location < 0) { - LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform."; - return false; - } - - fallback_status = FALLBACK_SHADER_STATUS_SUCCESS; - } - - /* Run this every time. 
*/ - glUseProgram(fallback_shader_program); - glUniform1i(image_texture_location, 0); - glUniform2f(fullscreen_location, width, height); - return true; -} - -void Device::draw_pixels(device_memory &rgba, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) -{ - const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); - - assert(rgba.type == MEM_PIXELS); - mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1)); - - GLuint texid; - glActiveTexture(GL_TEXTURE0); - glGenTextures(1, &texid); - glBindTexture(GL_TEXTURE_2D, texid); - - if (rgba.data_type == TYPE_HALF) { - GLhalf *data_pointer = (GLhalf *)rgba.host_pointer; - data_pointer += 4 * y * w; - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer); - } - else { - uint8_t *data_pointer = (uint8_t *)rgba.host_pointer; - data_pointer += 4 * y * w; - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer); - } - - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - - if (transparent) { - glEnable(GL_BLEND); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } - - GLint shader_program; - if (use_fallback_shader) { - if (!bind_fallback_display_space_shader(dw, dh)) { - return; - } - shader_program = fallback_shader_program; - } - else { - draw_params.bind_display_space_shader_cb(); - glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); - } - - if (!vertex_buffer) { - glGenBuffers(1, &vertex_buffer); - } - - glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); - /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered - */ - glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); - - float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - - if (vpointer) { - /* texture 
coordinate - vertex pair */ - vpointer[0] = 0.0f; - vpointer[1] = 0.0f; - vpointer[2] = dx; - vpointer[3] = dy; - - vpointer[4] = 1.0f; - vpointer[5] = 0.0f; - vpointer[6] = (float)width + dx; - vpointer[7] = dy; - - vpointer[8] = 1.0f; - vpointer[9] = 1.0f; - vpointer[10] = (float)width + dx; - vpointer[11] = (float)height + dy; - - vpointer[12] = 0.0f; - vpointer[13] = 1.0f; - vpointer[14] = dx; - vpointer[15] = (float)height + dy; - - if (vertex_buffer) { - glUnmapBuffer(GL_ARRAY_BUFFER); - } - } - - GLuint vertex_array_object; - GLuint position_attribute, texcoord_attribute; - - glGenVertexArrays(1, &vertex_array_object); - glBindVertexArray(vertex_array_object); - - texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); - position_attribute = glGetAttribLocation(shader_program, "pos"); - - glEnableVertexAttribArray(texcoord_attribute); - glEnableVertexAttribArray(position_attribute); - - glVertexAttribPointer( - texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); - glVertexAttribPointer(position_attribute, - 2, - GL_FLOAT, - GL_FALSE, - 4 * sizeof(float), - (const GLvoid *)(sizeof(float) * 2)); - - glDrawArrays(GL_TRIANGLE_FAN, 0, 4); - - if (vertex_buffer) { - glBindBuffer(GL_ARRAY_BUFFER, 0); - } - - if (use_fallback_shader) { - glUseProgram(0); - } - else { - draw_params.unbind_display_space_shader_cb(); - } - - glDeleteVertexArrays(1, &vertex_array_object); - glBindTexture(GL_TEXTURE_2D, 0); - glDeleteTextures(1, &texid); - - if (transparent) { - glDisable(GL_BLEND); - } } void Device::build_bvh(BVH *bvh, Progress &progress, bool refit) @@ -379,14 +68,14 @@ void Device::build_bvh(BVH *bvh, Progress &progress, bool refit) } } -Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) +Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler) { #ifdef WITH_MULTI if (!info.multi_devices.empty()) { /* Always create a multi device when info contains multiple 
devices. * This is done so that the type can still be e.g. DEVICE_CPU to indicate * that it is a homogeneous collection of devices, which simplifies checks. */ - return device_multi_create(info, stats, profiler, background); + return device_multi_create(info, stats, profiler); } #endif @@ -394,29 +83,18 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool switch (info.type) { case DEVICE_CPU: - device = device_cpu_create(info, stats, profiler, background); + device = device_cpu_create(info, stats, profiler); break; #ifdef WITH_CUDA case DEVICE_CUDA: if (device_cuda_init()) - device = device_cuda_create(info, stats, profiler, background); + device = device_cuda_create(info, stats, profiler); break; #endif #ifdef WITH_OPTIX case DEVICE_OPTIX: if (device_optix_init()) - device = device_optix_create(info, stats, profiler, background); - break; -#endif -#ifdef WITH_NETWORK - case DEVICE_NETWORK: - device = device_network_create(info, stats, profiler, "127.0.0.1"); - break; -#endif -#ifdef WITH_OPENCL - case DEVICE_OPENCL: - if (device_opencl_init()) - device = device_opencl_create(info, stats, profiler, background); + device = device_optix_create(info, stats, profiler); break; #endif default: @@ -424,7 +102,7 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool } if (device == NULL) { - device = device_dummy_create(info, stats, profiler, background); + device = device_dummy_create(info, stats, profiler); } return device; @@ -438,10 +116,6 @@ DeviceType Device::type_from_string(const char *name) return DEVICE_CUDA; else if (strcmp(name, "OPTIX") == 0) return DEVICE_OPTIX; - else if (strcmp(name, "OPENCL") == 0) - return DEVICE_OPENCL; - else if (strcmp(name, "NETWORK") == 0) - return DEVICE_NETWORK; else if (strcmp(name, "MULTI") == 0) return DEVICE_MULTI; @@ -456,10 +130,6 @@ string Device::string_from_type(DeviceType type) return "CUDA"; else if (type == DEVICE_OPTIX) return "OPTIX"; - else if (type == 
DEVICE_OPENCL) - return "OPENCL"; - else if (type == DEVICE_NETWORK) - return "NETWORK"; else if (type == DEVICE_MULTI) return "MULTI"; @@ -476,12 +146,6 @@ vector<DeviceType> Device::available_types() #ifdef WITH_OPTIX types.push_back(DEVICE_OPTIX); #endif -#ifdef WITH_OPENCL - types.push_back(DEVICE_OPENCL); -#endif -#ifdef WITH_NETWORK - types.push_back(DEVICE_NETWORK); -#endif return types; } @@ -493,20 +157,6 @@ vector<DeviceInfo> Device::available_devices(uint mask) thread_scoped_lock lock(device_mutex); vector<DeviceInfo> devices; -#ifdef WITH_OPENCL - if (mask & DEVICE_MASK_OPENCL) { - if (!(devices_initialized_mask & DEVICE_MASK_OPENCL)) { - if (device_opencl_init()) { - device_opencl_info(opencl_devices); - } - devices_initialized_mask |= DEVICE_MASK_OPENCL; - } - foreach (DeviceInfo &info, opencl_devices) { - devices.push_back(info); - } - } -#endif - #if defined(WITH_CUDA) || defined(WITH_OPTIX) if (mask & (DEVICE_MASK_CUDA | DEVICE_MASK_OPTIX)) { if (!(devices_initialized_mask & DEVICE_MASK_CUDA)) { @@ -547,18 +197,6 @@ vector<DeviceInfo> Device::available_devices(uint mask) } } -#ifdef WITH_NETWORK - if (mask & DEVICE_MASK_NETWORK) { - if (!(devices_initialized_mask & DEVICE_MASK_NETWORK)) { - device_network_info(network_devices); - devices_initialized_mask |= DEVICE_MASK_NETWORK; - } - foreach (DeviceInfo &info, network_devices) { - devices.push_back(info); - } - } -#endif - return devices; } @@ -580,15 +218,6 @@ string Device::device_capabilities(uint mask) capabilities += device_cpu_capabilities() + "\n"; } -#ifdef WITH_OPENCL - if (mask & DEVICE_MASK_OPENCL) { - if (device_opencl_init()) { - capabilities += "\nOpenCL device capabilities:\n"; - capabilities += device_opencl_capabilities(); - } - } -#endif - #ifdef WITH_CUDA if (mask & DEVICE_MASK_CUDA) { if (device_cuda_init()) { @@ -613,16 +242,13 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, } DeviceInfo info; - info.type = subdevices.front().type; + info.type = 
DEVICE_NONE; info.id = "MULTI"; info.description = "Multi Device"; info.num = 0; info.has_half_images = true; info.has_nanovdb = true; - info.has_volume_decoupled = true; - info.has_branched_path = true; - info.has_adaptive_stop_per_sample = true; info.has_osl = true; info.has_profiling = true; info.has_peer_memory = false; @@ -660,16 +286,16 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, info.id += device.id; /* Set device type to MULTI if subdevices are not of a common type. */ - if (device.type != info.type) { + if (info.type == DEVICE_NONE) { + info.type = device.type; + } + else if (device.type != info.type) { info.type = DEVICE_MULTI; } /* Accumulate device info. */ info.has_half_images &= device.has_half_images; info.has_nanovdb &= device.has_nanovdb; - info.has_volume_decoupled &= device.has_volume_decoupled; - info.has_branched_path &= device.has_branched_path; - info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample; info.has_osl &= device.has_osl; info.has_profiling &= device.has_profiling; info.has_peer_memory |= device.has_peer_memory; @@ -689,60 +315,32 @@ void Device::free_memory() devices_initialized_mask = 0; cuda_devices.free_memory(); optix_devices.free_memory(); - opencl_devices.free_memory(); cpu_devices.free_memory(); - network_devices.free_memory(); } -/* DeviceInfo */ - -void DeviceInfo::add_denoising_devices(DenoiserType denoiser_type) +unique_ptr<DeviceQueue> Device::gpu_queue_create() { - assert(denoising_devices.empty()); - - if (denoiser_type == DENOISER_OPTIX && type != DEVICE_OPTIX) { - vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX); - if (!optix_devices.empty()) { - /* Convert to a special multi device with separate denoising devices. */ - if (multi_devices.empty()) { - multi_devices.push_back(*this); - } - - /* Try to use the same physical devices for denoising. 
*/ - for (const DeviceInfo &cuda_device : multi_devices) { - if (cuda_device.type == DEVICE_CUDA) { - for (const DeviceInfo &optix_device : optix_devices) { - if (cuda_device.num == optix_device.num) { - id += optix_device.id; - denoising_devices.push_back(optix_device); - break; - } - } - } - } - - if (denoising_devices.empty()) { - /* Simply use the first available OptiX device. */ - const DeviceInfo optix_device = optix_devices.front(); - id += optix_device.id; /* Uniquely identify this special multi device. */ - denoising_devices.push_back(optix_device); - } + LOG(FATAL) << "Device does not support queues."; + return nullptr; +} - denoisers = denoiser_type; - } - } - else if (denoiser_type == DENOISER_OPENIMAGEDENOISE && type != DEVICE_CPU) { - /* Convert to a special multi device with separate denoising devices. */ - if (multi_devices.empty()) { - multi_devices.push_back(*this); - } +const CPUKernels *Device::get_cpu_kernels() const +{ + LOG(FATAL) << "Device does not support CPU kernels."; + return nullptr; +} - /* Add CPU denoising devices. 
*/ - DeviceInfo cpu_device = Device::available_devices(DEVICE_MASK_CPU).front(); - denoising_devices.push_back(cpu_device); +void Device::get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/) +{ + LOG(FATAL) << "Device does not support CPU kernels."; +} - denoisers = denoiser_type; - } +void *Device::get_cpu_osl_memory() +{ + return nullptr; } +/* DeviceInfo */ + CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index ecf79bcdfa6..399d5eb91df 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -21,31 +21,34 @@ #include "bvh/bvh_params.h" +#include "device/device_denoise.h" #include "device/device_memory.h" -#include "device/device_task.h" +#include "util/util_function.h" #include "util/util_list.h" +#include "util/util_logging.h" #include "util/util_stats.h" #include "util/util_string.h" #include "util/util_texture.h" #include "util/util_thread.h" #include "util/util_types.h" +#include "util/util_unique_ptr.h" #include "util/util_vector.h" CCL_NAMESPACE_BEGIN class BVH; +class DeviceQueue; class Progress; -class RenderTile; +class CPUKernels; +class CPUKernelThreadGlobals; /* Device Types */ enum DeviceType { DEVICE_NONE = 0, DEVICE_CPU, - DEVICE_OPENCL, DEVICE_CUDA, - DEVICE_NETWORK, DEVICE_MULTI, DEVICE_OPTIX, DEVICE_DUMMY, @@ -53,20 +56,11 @@ enum DeviceType { enum DeviceTypeMask { DEVICE_MASK_CPU = (1 << DEVICE_CPU), - DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL), DEVICE_MASK_CUDA = (1 << DEVICE_CUDA), DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX), - DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK), DEVICE_MASK_ALL = ~0 }; -enum DeviceKernelStatus { - DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE, - DEVICE_KERNEL_USING_FEATURE_KERNEL, - DEVICE_KERNEL_FEATURE_KERNEL_INVALID, - DEVICE_KERNEL_UNKNOWN, -}; - #define DEVICE_MASK(type) (DeviceTypeMask)(1 << type) class DeviceInfo { @@ -75,20 +69,16 @@ class DeviceInfo { string description; string id; /* used for user 
preferences, should stay fixed with changing hardware config */ int num; - bool display_device; /* GPU is used as a display device. */ - bool has_half_images; /* Support half-float textures. */ - bool has_nanovdb; /* Support NanoVDB volumes. */ - bool has_volume_decoupled; /* Decoupled volume shading. */ - bool has_branched_path; /* Supports branched path tracing. */ - bool has_adaptive_stop_per_sample; /* Per-sample adaptive sampling stopping. */ - bool has_osl; /* Support Open Shading Language. */ - bool use_split_kernel; /* Use split or mega kernel. */ - bool has_profiling; /* Supports runtime collection of profiling info. */ - bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ - DenoiserTypeMask denoisers; /* Supported denoiser types. */ + bool display_device; /* GPU is used as a display device. */ + bool has_nanovdb; /* Support NanoVDB volumes. */ + bool has_half_images; /* Support half-float textures. */ + bool has_osl; /* Support Open Shading Language. */ + bool has_profiling; /* Supports runtime collection of profiling info. */ + bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ + bool has_gpu_queue; /* Device supports GPU queue. */ + DenoiserTypeMask denoisers; /* Supported denoiser types. */ int cpu_threads; vector<DeviceInfo> multi_devices; - vector<DeviceInfo> denoising_devices; string error_msg; DeviceInfo() @@ -100,227 +90,35 @@ class DeviceInfo { display_device = false; has_half_images = false; has_nanovdb = false; - has_volume_decoupled = false; - has_branched_path = true; - has_adaptive_stop_per_sample = false; has_osl = false; - use_split_kernel = false; has_profiling = false; has_peer_memory = false; + has_gpu_queue = false; denoisers = DENOISER_NONE; } - bool operator==(const DeviceInfo &info) + bool operator==(const DeviceInfo &info) const { /* Multiple Devices with the same ID would be very bad. 
*/ assert(id != info.id || (type == info.type && num == info.num && description == info.description)); return id == info.id; } - - /* Add additional devices needed for the specified denoiser. */ - void add_denoising_devices(DenoiserType denoiser_type); -}; - -class DeviceRequestedFeatures { - public: - /* Use experimental feature set. */ - bool experimental; - - /* Selective nodes compilation. */ - - /* Identifier of a node group up to which all the nodes needs to be - * compiled in. Nodes from higher group indices will be ignores. - */ - int max_nodes_group; - - /* Features bitfield indicating which features from the requested group - * will be compiled in. Nodes which corresponds to features which are not - * in this bitfield will be ignored even if they're in the requested group. - */ - int nodes_features; - - /* BVH/sampling kernel features. */ - bool use_hair; - bool use_hair_thick; - bool use_object_motion; - bool use_camera_motion; - - /* Denotes whether baking functionality is needed. */ - bool use_baking; - - /* Use subsurface scattering materials. */ - bool use_subsurface; - - /* Use volume materials. */ - bool use_volume; - - /* Use branched integrator. */ - bool use_integrator_branched; - - /* Use OpenSubdiv patch evaluation */ - bool use_patch_evaluation; - - /* Use Transparent shadows */ - bool use_transparent; - - /* Use various shadow tricks, such as shadow catcher. */ - bool use_shadow_tricks; - - /* Per-uber shader usage flags. */ - bool use_principled; - - /* Denoising features. */ - bool use_denoising; - - /* Use raytracing in shaders. */ - bool use_shader_raytrace; - - /* Use true displacement */ - bool use_true_displacement; - - /* Use background lights */ - bool use_background_light; - - DeviceRequestedFeatures() - { - /* TODO(sergey): Find more meaningful defaults. 
*/ - max_nodes_group = 0; - nodes_features = 0; - use_hair = false; - use_hair_thick = false; - use_object_motion = false; - use_camera_motion = false; - use_baking = false; - use_subsurface = false; - use_volume = false; - use_integrator_branched = false; - use_patch_evaluation = false; - use_transparent = false; - use_shadow_tricks = false; - use_principled = false; - use_denoising = false; - use_shader_raytrace = false; - use_true_displacement = false; - use_background_light = false; - } - - bool modified(const DeviceRequestedFeatures &requested_features) - { - return !(max_nodes_group == requested_features.max_nodes_group && - nodes_features == requested_features.nodes_features && - use_hair == requested_features.use_hair && - use_hair_thick == requested_features.use_hair_thick && - use_object_motion == requested_features.use_object_motion && - use_camera_motion == requested_features.use_camera_motion && - use_baking == requested_features.use_baking && - use_subsurface == requested_features.use_subsurface && - use_volume == requested_features.use_volume && - use_integrator_branched == requested_features.use_integrator_branched && - use_patch_evaluation == requested_features.use_patch_evaluation && - use_transparent == requested_features.use_transparent && - use_shadow_tricks == requested_features.use_shadow_tricks && - use_principled == requested_features.use_principled && - use_denoising == requested_features.use_denoising && - use_shader_raytrace == requested_features.use_shader_raytrace && - use_true_displacement == requested_features.use_true_displacement && - use_background_light == requested_features.use_background_light); - } - - /* Convert the requested features structure to a build options, - * which could then be passed to compilers. 
- */ - string get_build_options() const - { - string build_options = ""; - if (experimental) { - build_options += "-D__KERNEL_EXPERIMENTAL__ "; - } - build_options += "-D__NODES_MAX_GROUP__=" + string_printf("%d", max_nodes_group); - build_options += " -D__NODES_FEATURES__=" + string_printf("%d", nodes_features); - if (!use_hair) { - build_options += " -D__NO_HAIR__"; - } - if (!use_object_motion) { - build_options += " -D__NO_OBJECT_MOTION__"; - } - if (!use_camera_motion) { - build_options += " -D__NO_CAMERA_MOTION__"; - } - if (!use_baking) { - build_options += " -D__NO_BAKING__"; - } - if (!use_volume) { - build_options += " -D__NO_VOLUME__"; - } - if (!use_subsurface) { - build_options += " -D__NO_SUBSURFACE__"; - } - if (!use_integrator_branched) { - build_options += " -D__NO_BRANCHED_PATH__"; - } - if (!use_patch_evaluation) { - build_options += " -D__NO_PATCH_EVAL__"; - } - if (!use_transparent && !use_volume) { - build_options += " -D__NO_TRANSPARENT__"; - } - if (!use_shadow_tricks) { - build_options += " -D__NO_SHADOW_TRICKS__"; - } - if (!use_principled) { - build_options += " -D__NO_PRINCIPLED__"; - } - if (!use_denoising) { - build_options += " -D__NO_DENOISING__"; - } - if (!use_shader_raytrace) { - build_options += " -D__NO_SHADER_RAYTRACE__"; - } - return build_options; - } }; -std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features); - /* Device */ -struct DeviceDrawParams { - function<void()> bind_display_space_shader_cb; - function<void()> unbind_display_space_shader_cb; -}; - class Device { friend class device_sub_ptr; protected: - enum { - FALLBACK_SHADER_STATUS_NONE = 0, - FALLBACK_SHADER_STATUS_ERROR, - FALLBACK_SHADER_STATUS_SUCCESS, - }; - - Device(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background) - : background(background), - vertex_buffer(0), - fallback_status(FALLBACK_SHADER_STATUS_NONE), - fallback_shader_program(0), - info(info_), - stats(stats_), - profiler(profiler_) + 
Device(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) + : info(info_), stats(stats_), profiler(profiler_) { } - bool background; string error_msg; - /* used for real time display */ - unsigned int vertex_buffer; - int fallback_status, fallback_shader_program; - int image_texture_location, fullscreen_location; - - bool bind_fallback_display_space_shader(const float width, const float height); - virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/) { /* Only required for devices that implement denoising. */ @@ -361,67 +159,31 @@ class Device { Stats &stats; Profiler &profiler; - /* memory alignment */ - virtual int mem_sub_ptr_alignment() - { - return MIN_ALIGNMENT_CPU_DATA_TYPES; - } - /* constant memory */ virtual void const_copy_to(const char *name, void *host, size_t size) = 0; - /* open shading language, only for CPU device */ - virtual void *osl_memory() - { - return NULL; - } - /* load/compile kernels, must be called before adding tasks */ - virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/) + virtual bool load_kernels(uint /*kernel_features*/) { return true; } - /* Wait for device to become available to upload data and receive tasks - * This method is used by the OpenCL device to load the - * optimized kernels or when not (yet) available load the - * generic kernels (only during foreground rendering) */ - virtual bool wait_for_availability(const DeviceRequestedFeatures & /*requested_features*/) - { - return true; - } - /* Check if there are 'better' kernels available to be used - * We can switch over to these kernels - * This method is used to determine if we can switch the preview kernels - * to regular kernels */ - virtual DeviceKernelStatus get_active_kernel_switch_state() - { - return DEVICE_KERNEL_USING_FEATURE_KERNEL; - } + /* GPU device only functions. + * These may not be used on CPU or multi-devices. 
*/ - /* tasks */ - virtual int get_split_task_count(DeviceTask &) - { - return 1; - } + /* Create new queue for executing kernels in. */ + virtual unique_ptr<DeviceQueue> gpu_queue_create(); + + /* CPU device only functions. + * These may not be used on GPU or multi-devices. */ - virtual void task_add(DeviceTask &task) = 0; - virtual void task_wait() = 0; - virtual void task_cancel() = 0; - - /* opengl drawing */ - virtual void draw_pixels(device_memory &mem, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params); + /* Get CPU kernel functions for native instruction set. */ + virtual const CPUKernels *get_cpu_kernels() const; + /* Get kernel globals to pass to kernels. */ + virtual void get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/); + /* Get OpenShadingLanguage memory buffer. */ + virtual void *get_cpu_osl_memory(); /* acceleration structure building */ virtual void build_bvh(BVH *bvh, Progress &progress, bool refit); @@ -429,25 +191,11 @@ class Device { /* OptiX specific destructor. */ virtual void release_optix_bvh(BVH * /*bvh*/){}; -#ifdef WITH_NETWORK - /* networking */ - void server_run(); -#endif - /* multi device */ - virtual void map_tile(Device * /*sub_device*/, RenderTile & /*tile*/) - { - } virtual int device_number(Device * /*sub_device*/) { return 0; } - virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/) - { - } - virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/) - { - } virtual bool is_resident(device_ptr /*key*/, Device *sub_device) { @@ -460,11 +208,47 @@ class Device { return false; } + /* Graphics resources interoperability. + * + * The interoperability comes here by the meaning that the device is capable of computing result + * directly into an OpenGL (or other graphics library) buffer. 
*/ + + /* Check display is to be updated using graphics interoperability. + * The interoperability can not be used is it is not supported by the device. But the device + * might also force disable the interoperability if it detects that it will be slower than + * copying pixels from the render buffer. */ + virtual bool should_use_graphics_interop() + { + return false; + } + + /* Buffer denoising. */ + + /* Returns true if task is fully handled. */ + virtual bool denoise_buffer(const DeviceDenoiseTask & /*task*/) + { + LOG(ERROR) << "Request buffer denoising from a device which does not support it."; + return false; + } + + virtual DeviceQueue *get_denoise_queue() + { + LOG(ERROR) << "Request denoising queue from a device which does not support it."; + return nullptr; + } + + /* Sub-devices */ + + /* Run given callback for every individual device which will be handling rendering. + * For the single device the callback is called for the device itself. For the multi-device the + * callback is only called for the sub-devices. 
*/ + virtual void foreach_device(const function<void(Device *)> &callback) + { + callback(this); + } + /* static */ - static Device *create(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - bool background = true); + static Device *create(const DeviceInfo &info, Stats &stats, Profiler &profiler); static DeviceType type_from_string(const char *name); static string string_from_type(DeviceType type); @@ -499,9 +283,7 @@ class Device { static thread_mutex device_mutex; static vector<DeviceInfo> cuda_devices; static vector<DeviceInfo> optix_devices; - static vector<DeviceInfo> opencl_devices; static vector<DeviceInfo> cpu_devices; - static vector<DeviceInfo> network_devices; static uint devices_initialized_mask; }; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp deleted file mode 100644 index 4a6e77d6eaa..00000000000 --- a/intern/cycles/device/device_cpu.cpp +++ /dev/null @@ -1,1680 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <stdlib.h> -#include <string.h> - -/* So ImathMath is included before our kernel_cpu_compat. 
*/ -#ifdef WITH_OSL -/* So no context pollution happens from indirectly included windows.h */ -# include "util/util_windows.h" -# include <OSL/oslexec.h> -#endif - -#ifdef WITH_EMBREE -# include <embree3/rtcore.h> -#endif - -#include "device/device.h" -#include "device/device_denoising.h" -#include "device/device_intern.h" -#include "device/device_split_kernel.h" - -// clang-format off -#include "kernel/kernel.h" -#include "kernel/kernel_compat_cpu.h" -#include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_adaptive_sampling.h" - -#include "kernel/filter/filter.h" - -#include "kernel/osl/osl_shader.h" -#include "kernel/osl/osl_globals.h" -// clang-format on - -#include "bvh/bvh_embree.h" - -#include "render/buffers.h" -#include "render/coverage.h" - -#include "util/util_debug.h" -#include "util/util_foreach.h" -#include "util/util_function.h" -#include "util/util_logging.h" -#include "util/util_map.h" -#include "util/util_opengl.h" -#include "util/util_openimagedenoise.h" -#include "util/util_optimization.h" -#include "util/util_progress.h" -#include "util/util_system.h" -#include "util/util_task.h" -#include "util/util_thread.h" - -CCL_NAMESPACE_BEGIN - -class CPUDevice; - -/* Has to be outside of the class to be shared across template instantiations. */ -static const char *logged_architecture = ""; - -template<typename F> class KernelFunctions { - public: - KernelFunctions() - { - kernel = (F)NULL; - } - - KernelFunctions( - F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2) - { - const char *architecture_name = "default"; - kernel = kernel_default; - - /* Silence potential warnings about unused variables - * when compiling without some architectures. 
*/ - (void)kernel_sse2; - (void)kernel_sse3; - (void)kernel_sse41; - (void)kernel_avx; - (void)kernel_avx2; -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { - architecture_name = "AVX2"; - kernel = kernel_avx2; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) { - architecture_name = "AVX"; - kernel = kernel_avx; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) { - architecture_name = "SSE4.1"; - kernel = kernel_sse41; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) { - architecture_name = "SSE3"; - kernel = kernel_sse3; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { - architecture_name = "SSE2"; - kernel = kernel_sse2; - } -#else - { - /* Dummy to prevent the architecture if below become - * conditional when WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - * is not defined. 
*/ - } -#endif - - if (strcmp(architecture_name, logged_architecture) != 0) { - VLOG(1) << "Will be using " << architecture_name << " kernels."; - logged_architecture = architecture_name; - } - } - - inline F operator()() const - { - assert(kernel); - return kernel; - } - - protected: - F kernel; -}; - -class CPUSplitKernel : public DeviceSplitKernel { - CPUDevice *device; - - public: - explicit CPUSplitKernel(CPUDevice *device); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data_, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs); - - virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task); - virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); -}; - -class CPUDevice : public Device { - public: - TaskPool task_pool; - KernelGlobals kernel_globals; - - device_vector<TextureInfo> texture_info; - bool need_texture_info; - -#ifdef WITH_OSL - OSLGlobals osl_globals; -#endif -#ifdef WITH_OPENIMAGEDENOISE - oidn::DeviceRef oidn_device; - oidn::FilterRef oidn_filter; -#endif - thread_spin_lock oidn_task_lock; -#ifdef WITH_EMBREE - RTCScene embree_scene = NULL; - RTCDevice embree_device; -#endif - - bool use_split_kernel; - - DeviceRequestedFeatures requested_features; - - KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel; - KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> - convert_to_half_float_kernel; - KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> - 
convert_to_byte_kernel; - KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> - shader_kernel; - KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> bake_kernel; - - KernelFunctions<void (*)( - int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)> - filter_divide_shadow_kernel; - KernelFunctions<void (*)( - int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)> - filter_get_feature_kernel; - KernelFunctions<void (*)(int, int, int, int *, float *, float *, int, int *)> - filter_write_feature_kernel; - KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)> - filter_detect_outliers_kernel; - KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)> - filter_combine_halves_kernel; - - KernelFunctions<void (*)( - int, int, float *, float *, float *, float *, int *, int, int, int, float, float)> - filter_nlm_calc_difference_kernel; - KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel; - KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel; - KernelFunctions<void (*)( - int, int, float *, float *, float *, float *, float *, int *, int, int, int)> - filter_nlm_update_output_kernel; - KernelFunctions<void (*)(float *, float *, int *, int)> filter_nlm_normalize_kernel; - - KernelFunctions<void (*)( - float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)> - filter_construct_transform_kernel; - KernelFunctions<void (*)(int, - int, - int, - float *, - float *, - float *, - int *, - float *, - float3 *, - int *, - int *, - int, - int, - int, - int, - bool)> - filter_nlm_construct_gramian_kernel; - KernelFunctions<void (*)(int, int, int, float *, int *, float *, float3 *, int *, int)> - filter_finalize_kernel; - - KernelFunctions<void (*)(KernelGlobals *, - ccl_constant KernelData *, - ccl_global void *, 
- int, - ccl_global char *, - int, - int, - int, - int, - int, - int, - int, - int, - ccl_global int *, - int, - ccl_global char *, - ccl_global unsigned int *, - unsigned int, - ccl_global float *)> - data_init_kernel; - unordered_map<string, KernelFunctions<void (*)(KernelGlobals *, KernelData *)>> split_kernels; - -#define KERNEL_FUNCTIONS(name) \ - KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \ - KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \ - KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name) - - CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) - : Device(info_, stats_, profiler_, background_), - texture_info(this, "__texture_info", MEM_GLOBAL), -#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name)) - REGISTER_KERNEL(path_trace), - REGISTER_KERNEL(convert_to_half_float), - REGISTER_KERNEL(convert_to_byte), - REGISTER_KERNEL(shader), - REGISTER_KERNEL(bake), - REGISTER_KERNEL(filter_divide_shadow), - REGISTER_KERNEL(filter_get_feature), - REGISTER_KERNEL(filter_write_feature), - REGISTER_KERNEL(filter_detect_outliers), - REGISTER_KERNEL(filter_combine_halves), - REGISTER_KERNEL(filter_nlm_calc_difference), - REGISTER_KERNEL(filter_nlm_blur), - REGISTER_KERNEL(filter_nlm_calc_weight), - REGISTER_KERNEL(filter_nlm_update_output), - REGISTER_KERNEL(filter_nlm_normalize), - REGISTER_KERNEL(filter_construct_transform), - REGISTER_KERNEL(filter_nlm_construct_gramian), - REGISTER_KERNEL(filter_finalize), - REGISTER_KERNEL(data_init) -#undef REGISTER_KERNEL - { - if (info.cpu_threads == 0) { - info.cpu_threads = TaskScheduler::num_threads(); - } - -#ifdef WITH_OSL - kernel_globals.osl = &osl_globals; -#endif -#ifdef WITH_EMBREE - embree_device = rtcNewDevice("verbose=0"); -#endif - use_split_kernel = DebugFlags().cpu.split_kernel; - if (use_split_kernel) { - VLOG(1) << "Will be using split kernel."; - } - need_texture_info = false; - -#define 
REGISTER_SPLIT_KERNEL(name) \ - split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \ - KERNEL_FUNCTIONS(name)) - REGISTER_SPLIT_KERNEL(path_init); - REGISTER_SPLIT_KERNEL(scene_intersect); - REGISTER_SPLIT_KERNEL(lamp_emission); - REGISTER_SPLIT_KERNEL(do_volume); - REGISTER_SPLIT_KERNEL(queue_enqueue); - REGISTER_SPLIT_KERNEL(indirect_background); - REGISTER_SPLIT_KERNEL(shader_setup); - REGISTER_SPLIT_KERNEL(shader_sort); - REGISTER_SPLIT_KERNEL(shader_eval); - REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao); - REGISTER_SPLIT_KERNEL(subsurface_scatter); - REGISTER_SPLIT_KERNEL(direct_lighting); - REGISTER_SPLIT_KERNEL(shadow_blocked_ao); - REGISTER_SPLIT_KERNEL(shadow_blocked_dl); - REGISTER_SPLIT_KERNEL(enqueue_inactive); - REGISTER_SPLIT_KERNEL(next_iteration_setup); - REGISTER_SPLIT_KERNEL(indirect_subsurface); - REGISTER_SPLIT_KERNEL(buffer_update); - REGISTER_SPLIT_KERNEL(adaptive_stopping); - REGISTER_SPLIT_KERNEL(adaptive_filter_x); - REGISTER_SPLIT_KERNEL(adaptive_filter_y); - REGISTER_SPLIT_KERNEL(adaptive_adjust_samples); -#undef REGISTER_SPLIT_KERNEL -#undef KERNEL_FUNCTIONS - } - - ~CPUDevice() - { -#ifdef WITH_EMBREE - rtcReleaseDevice(embree_device); -#endif - task_pool.cancel(); - texture_info.free(); - } - - virtual bool show_samples() const override - { - return (info.cpu_threads == 1); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const override - { - BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; -#ifdef WITH_EMBREE - bvh_layout_mask |= BVH_LAYOUT_EMBREE; -#endif /* WITH_EMBREE */ - return bvh_layout_mask; - } - - void load_texture_info() - { - if (need_texture_info) { - texture_info.copy_to_device(); - need_texture_info = false; - } - } - - virtual void mem_alloc(device_memory &mem) override - { - if (mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - } - else if (mem.type == MEM_GLOBAL) { - assert(!"mem_alloc not supported for global memory."); - } - else 
{ - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) { - size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; - void *data = util_aligned_malloc(mem.memory_size(), alignment); - mem.device_pointer = (device_ptr)data; - } - else { - mem.device_pointer = (device_ptr)mem.host_pointer; - } - - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - } - } - - virtual void mem_copy_to(device_memory &mem) override - { - if (mem.type == MEM_GLOBAL) { - global_free(mem); - global_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - tex_alloc((device_texture &)mem); - } - else if (mem.type == MEM_PIXELS) { - assert(!"mem_copy_to not supported for pixels."); - } - else { - if (!mem.device_pointer) { - mem_alloc(mem); - } - - /* copy is no-op */ - } - } - - virtual void mem_copy_from( - device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) override - { - /* no-op */ - } - - virtual void mem_zero(device_memory &mem) override - { - if (!mem.device_pointer) { - mem_alloc(mem); - } - - if (mem.device_pointer) { - memset((void *)mem.device_pointer, 0, mem.memory_size()); - } - } - - virtual void mem_free(device_memory &mem) override - { - if (mem.type == MEM_GLOBAL) { - global_free(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - } - else if (mem.device_pointer) { - if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) { - util_aligned_free((void *)mem.device_pointer); - } - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } - - virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override - { - return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); - } - - virtual void 
const_copy_to(const char *name, void *host, size_t size) override - { -#if WITH_EMBREE - if (strcmp(name, "__data") == 0) { - assert(size <= sizeof(KernelData)); - - // Update scene handle (since it is different for each device on multi devices) - KernelData *const data = (KernelData *)host; - data->bvh.scene = embree_scene; - } -#endif - kernel_const_copy(&kernel_globals, name, host, size); - } - - void global_alloc(device_memory &mem) - { - VLOG(1) << "Global memory allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size); - - mem.device_pointer = (device_ptr)mem.host_pointer; - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - } - - void global_free(device_memory &mem) - { - if (mem.device_pointer) { - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } - - void tex_alloc(device_texture &mem) - { - VLOG(1) << "Texture allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - mem.device_pointer = (device_ptr)mem.host_pointer; - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - - const uint slot = mem.slot; - if (slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount of re-allocations. 
*/ - texture_info.resize(slot + 128); - } - - texture_info[slot] = mem.info; - texture_info[slot].data = (uint64_t)mem.host_pointer; - need_texture_info = true; - } - - void tex_free(device_texture &mem) - { - if (mem.device_pointer) { - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - need_texture_info = true; - } - } - - virtual void *osl_memory() override - { -#ifdef WITH_OSL - return &osl_globals; -#else - return NULL; -#endif - } - - void build_bvh(BVH *bvh, Progress &progress, bool refit) override - { -#ifdef WITH_EMBREE - if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE || - bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) { - BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh); - if (refit) { - bvh_embree->refit(progress); - } - else { - bvh_embree->build(progress, &stats, embree_device); - } - - if (bvh->params.top_level) { - embree_scene = bvh_embree->scene; - } - } - else -#endif - Device::build_bvh(bvh, progress, refit); - } - - void thread_run(DeviceTask &task) - { - if (task.type == DeviceTask::RENDER) - thread_render(task); - else if (task.type == DeviceTask::SHADER) - thread_shader(task); - else if (task.type == DeviceTask::FILM_CONVERT) - thread_film_convert(task); - else if (task.type == DeviceTask::DENOISE_BUFFER) - thread_denoise(task); - } - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS); - - int4 rect = task->rect; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int w = align_up(rect.z - rect.x, 4); - int h = rect.w - rect.y; - int stride = task->buffer.stride; - int channel_offset = task->nlm_state.is_color ? 
task->buffer.pass_stride : 0; - - float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer; - float *blurDifference = temporary_mem; - float *difference = temporary_mem + task->buffer.pass_stride; - float *weightAccum = temporary_mem + 2 * task->buffer.pass_stride; - - memset(weightAccum, 0, sizeof(float) * w * h); - memset((float *)out_ptr, 0, sizeof(float) * w * h); - - for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) { - int dy = i / (2 * r + 1) - r; - int dx = i % (2 * r + 1) - r; - - int local_rect[4] = { - max(0, -dx), max(0, -dy), rect.z - rect.x - max(0, dx), rect.w - rect.y - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, - dy, - (float *)guide_ptr, - (float *)variance_ptr, - NULL, - difference, - local_rect, - w, - channel_offset, - 0, - a, - k_2); - - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - - filter_nlm_update_output_kernel()(dx, - dy, - blurDifference, - (float *)image_ptr, - difference, - (float *)out_ptr, - weightAccum, - local_rect, - channel_offset, - stride, - f); - } - - int local_rect[4] = {0, 0, rect.z - rect.x, rect.w - rect.y}; - filter_nlm_normalize_kernel()((float *)out_ptr, weightAccum, local_rect, w); - - return true; - } - - bool denoising_construct_transform(DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM); - - for (int y = 0; y < task->filter_area.w; y++) { - for (int x = 0; x < task->filter_area.z; x++) { - filter_construct_transform_kernel()((float *)task->buffer.mem.device_pointer, - task->tile_info, - x + task->filter_area.x, - y + task->filter_area.y, - y * task->filter_area.z + x, - (float *)task->storage.transform.device_pointer, - (int *)task->storage.rank.device_pointer, - &task->rect.x, - task->buffer.pass_stride, - task->buffer.frame_stride, - 
task->buffer.use_time, - task->radius, - task->pca_threshold); - } - } - return true; - } - - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT); - - float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer; - float *difference = temporary_mem; - float *blurDifference = temporary_mem + task->buffer.pass_stride; - - int r = task->radius; - int frame_offset = frame * task->buffer.frame_stride; - for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) { - int dy = i / (2 * r + 1) - r; - int dx = i % (2 * r + 1) - r; - - int local_rect[4] = {max(0, -dx), - max(0, -dy), - task->reconstruction_state.source_w - max(0, dx), - task->reconstruction_state.source_h - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, - dy, - (float *)color_ptr, - (float *)color_variance_ptr, - (float *)scale_ptr, - difference, - local_rect, - task->buffer.stride, - task->buffer.pass_stride, - frame_offset, - 1.0f, - task->nlm_k_2); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); - filter_nlm_calc_weight_kernel()( - blurDifference, difference, local_rect, task->buffer.stride, 4); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); - filter_nlm_construct_gramian_kernel()(dx, - dy, - task->tile_info->frames[frame], - blurDifference, - (float *)task->buffer.mem.device_pointer, - (float *)task->storage.transform.device_pointer, - (int *)task->storage.rank.device_pointer, - (float *)task->storage.XtWX.device_pointer, - (float3 *)task->storage.XtWY.device_pointer, - local_rect, - &task->reconstruction_state.filter_window.x, - task->buffer.stride, - 4, - task->buffer.pass_stride, - frame_offset, - task->buffer.use_time); - } - - return true; - } - - bool denoising_solve(device_ptr output_ptr, DenoisingTask *task) - { - for (int y = 0; 
y < task->filter_area.w; y++) { - for (int x = 0; x < task->filter_area.z; x++) { - filter_finalize_kernel()(x, - y, - y * task->filter_area.z + x, - (float *)output_ptr, - (int *)task->storage.rank.device_pointer, - (float *)task->storage.XtWX.device_pointer, - (float3 *)task->storage.XtWY.device_pointer, - &task->reconstruction_state.buffer_params.x, - task->render_buffer.samples); - } - } - return true; - } - - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES); - - for (int y = rect.y; y < rect.w; y++) { - for (int x = rect.x; x < rect.z; x++) { - filter_combine_halves_kernel()(x, - y, - (float *)mean_ptr, - (float *)variance_ptr, - (float *)a_ptr, - (float *)b_ptr, - &rect.x, - r); - } - } - return true; - } - - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW); - - for (int y = task->rect.y; y < task->rect.w; y++) { - for (int x = task->rect.x; x < task->rect.z; x++) { - filter_divide_shadow_kernel()(task->render_buffer.samples, - task->tile_info, - x, - y, - (float *)a_ptr, - (float *)b_ptr, - (float *)sample_variance_ptr, - (float *)sv_variance_ptr, - (float *)buffer_variance_ptr, - &task->rect.x, - task->render_buffer.pass_stride, - task->render_buffer.offset); - } - } - return true; - } - - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE); - - for (int y = task->rect.y; y < task->rect.w; y++) { - for (int x = task->rect.x; x < task->rect.z; x++) { - 
filter_get_feature_kernel()(task->render_buffer.samples, - task->tile_info, - mean_offset, - variance_offset, - x, - y, - (float *)mean_ptr, - (float *)variance_ptr, - scale, - &task->rect.x, - task->render_buffer.pass_stride, - task->render_buffer.offset); - } - } - return true; - } - - bool denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) - { - for (int y = 0; y < task->filter_area.w; y++) { - for (int x = 0; x < task->filter_area.z; x++) { - filter_write_feature_kernel()(task->render_buffer.samples, - x + task->filter_area.x, - y + task->filter_area.y, - &task->reconstruction_state.buffer_params.x, - (float *)from_ptr, - (float *)buffer_ptr, - out_offset, - &task->rect.x); - } - } - return true; - } - - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS); - - for (int y = task->rect.y; y < task->rect.w; y++) { - for (int x = task->rect.x; x < task->rect.z; x++) { - filter_detect_outliers_kernel()(x, - y, - (float *)image_ptr, - (float *)variance_ptr, - (float *)depth_ptr, - (float *)output_ptr, - &task->rect.x, - task->buffer.pass_stride); - } - } - return true; - } - - bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample) - { - WorkTile wtile; - wtile.x = tile.x; - wtile.y = tile.y; - wtile.w = tile.w; - wtile.h = tile.h; - wtile.offset = tile.offset; - wtile.stride = tile.stride; - wtile.buffer = (float *)tile.buffer; - - /* For CPU we do adaptive stopping per sample so we can stop earlier, but - * for combined CPU + GPU rendering we match the GPU and do it per tile - * after a given number of sample steps. 
*/ - if (!kernel_data.integrator.adaptive_stop_per_sample) { - for (int y = wtile.y; y < wtile.y + wtile.h; ++y) { - for (int x = wtile.x; x < wtile.x + wtile.w; ++x) { - const int index = wtile.offset + x + y * wtile.stride; - float *buffer = wtile.buffer + index * kernel_data.film.pass_stride; - kernel_do_adaptive_stopping(kg, buffer, sample); - } - } - } - - bool any = false; - for (int y = wtile.y; y < wtile.y + wtile.h; ++y) { - any |= kernel_do_adaptive_filter_x(kg, y, &wtile); - } - for (int x = wtile.x; x < wtile.x + wtile.w; ++x) { - any |= kernel_do_adaptive_filter_y(kg, x, &wtile); - } - return (!any); - } - - void adaptive_sampling_post(const RenderTile &tile, KernelGlobals *kg) - { - float *render_buffer = (float *)tile.buffer; - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - int index = tile.offset + x + y * tile.stride; - ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; - if (buffer[kernel_data.film.pass_sample_count] < 0.0f) { - buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; - float sample_multiplier = tile.sample / buffer[kernel_data.film.pass_sample_count]; - if (sample_multiplier != 1.0f) { - kernel_adaptive_post_adjust(kg, buffer, sample_multiplier); - } - } - else { - kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f)); - } - } - } - } - - void render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) - { - const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; - - scoped_timer timer(&tile.buffers->render_time); - - Coverage coverage(kg, tile); - if (use_coverage) { - coverage.init_path_trace(); - } - - float *render_buffer = (float *)tile.buffer; - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - /* Needed for Embree. 
*/ - SIMD_SET_FLUSH_TO_ZERO; - - for (int sample = start_sample; sample < end_sample; sample++) { - if (task.get_cancel() || TaskPool::canceled()) { - if (task.need_finish_queue == false) - break; - } - - if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) { - tile.stealing_state = RenderTile::WAS_STOLEN; - break; - } - - if (tile.task == RenderTile::PATH_TRACE) { - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - if (use_coverage) { - coverage.init_pixel(x, y); - } - path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); - } - } - } - else { - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - bake_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); - } - } - } - tile.sample = sample + 1; - - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) { - const bool stop = adaptive_sampling_filter(kg, tile, sample); - if (stop) { - const int num_progress_samples = end_sample - sample; - tile.sample = end_sample; - task.update_progress(&tile, tile.w * tile.h * num_progress_samples); - break; - } - } - - task.update_progress(&tile, tile.w * tile.h); - } - if (use_coverage) { - coverage.finalize(); - } - - if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) { - adaptive_sampling_post(tile, kg); - } - } - - void denoise_openimagedenoise_buffer(DeviceTask &task, - float *buffer, - const size_t offset, - const size_t stride, - const size_t x, - const size_t y, - const size_t w, - const size_t h, - const float scale) - { -#ifdef WITH_OPENIMAGEDENOISE - assert(openimagedenoise_supported()); - - /* Only one at a time, since OpenImageDenoise itself is multithreaded for full - * buffers, and for tiled rendering because creating multiple devices and filters - * is slow and memory hungry as well. 
- * - * TODO: optimize tiled rendering case, by batching together denoising of many - * tiles somehow? */ - static thread_mutex mutex; - thread_scoped_lock lock(mutex); - - /* Create device and filter, cached for reuse. */ - if (!oidn_device) { - oidn_device = oidn::newDevice(); - oidn_device.commit(); - } - if (!oidn_filter) { - oidn_filter = oidn_device.newFilter("RT"); - oidn_filter.set("hdr", true); - oidn_filter.set("srgb", false); - } - - /* Set images with appropriate stride for our interleaved pass storage. */ - struct { - const char *name; - const int offset; - const bool scale; - const bool use; - array<float> scaled_buffer; - } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true}, - {"albedo", - task.pass_denoising_data + DENOISING_PASS_ALBEDO, - true, - task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO}, - {"normal", - task.pass_denoising_data + DENOISING_PASS_NORMAL, - true, - task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL}, - {"output", 0, false, true}, - { NULL, - 0 }}; - - for (int i = 0; passes[i].name; i++) { - if (!passes[i].use) { - continue; - } - - const int64_t pixel_offset = offset + x + y * stride; - const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset); - const int64_t pixel_stride = task.pass_stride; - const int64_t row_stride = stride * pixel_stride; - - if (passes[i].scale && scale != 1.0f) { - /* Normalize albedo and normal passes as they are scaled by the number of samples. - * For the color passes OIDN will perform auto-exposure making it unnecessary. 
*/ - array<float> &scaled_buffer = passes[i].scaled_buffer; - scaled_buffer.resize(w * h * 3); - - for (int y = 0; y < h; y++) { - const float *pass_row = buffer + buffer_offset + y * row_stride; - float *scaled_row = scaled_buffer.data() + y * w * 3; - - for (int x = 0; x < w; x++) { - scaled_row[x * 3 + 0] = pass_row[x * pixel_stride + 0] * scale; - scaled_row[x * 3 + 1] = pass_row[x * pixel_stride + 1] * scale; - scaled_row[x * 3 + 2] = pass_row[x * pixel_stride + 2] * scale; - } - } - - oidn_filter.setImage( - passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0); - } - else { - oidn_filter.setImage(passes[i].name, - buffer + buffer_offset, - oidn::Format::Float3, - w, - h, - 0, - pixel_stride * sizeof(float), - row_stride * sizeof(float)); - } - } - - /* Execute filter. */ - oidn_filter.commit(); - oidn_filter.execute(); -#else - (void)task; - (void)buffer; - (void)offset; - (void)stride; - (void)x; - (void)y; - (void)w; - (void)h; - (void)scale; -#endif - } - - void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile) - { - if (task.type == DeviceTask::DENOISE_BUFFER) { - /* Copy pixels from compute device to CPU (no-op for CPU device). */ - rtile.buffers->buffer.copy_from_device(); - - denoise_openimagedenoise_buffer(task, - (float *)rtile.buffer, - rtile.offset, - rtile.stride, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - 1.0f / rtile.sample); - - /* todo: it may be possible to avoid this copy, but we have to ensure that - * when other code copies data from the device it doesn't overwrite the - * denoiser buffers. */ - rtile.buffers->buffer.copy_to_device(); - } - else { - /* Per-tile denoising. */ - rtile.sample = rtile.start_sample + rtile.num_samples; - const float scale = 1.0f / rtile.sample; - const float invscale = rtile.sample; - const size_t pass_stride = task.pass_stride; - - /* Map neighboring tiles into one buffer for denoising. 
*/ - RenderTileNeighbors neighbors(rtile); - task.map_neighbor_tiles(neighbors, this); - RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; - rtile = center_tile; - - /* Calculate size of the tile to denoise (including overlap). The overlap - * size was chosen empirically. OpenImageDenoise specifies an overlap size - * of 128 but this is significantly bigger than typical tile size. */ - const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds()); - const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); - - /* Adjacent tiles are in separate memory regions, copy into single buffer. */ - array<float> merged(rect_size.x * rect_size.y * task.pass_stride); - - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &ntile = neighbors.tiles[i]; - if (!ntile.buffer) { - continue; - } - - const int xmin = max(ntile.x, rect.x); - const int ymin = max(ntile.y, rect.y); - const int xmax = min(ntile.x + ntile.w, rect.z); - const int ymax = min(ntile.y + ntile.h, rect.w); - - const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride; - const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride; - - const size_t merged_stride = rect_size.x; - const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride; - float *merged_buffer = merged.data() + merged_offset * pass_stride; - - for (int y = ymin; y < ymax; y++) { - for (int x = 0; x < pass_stride * (xmax - xmin); x++) { - merged_buffer[x] = tile_buffer[x] * scale; - } - tile_buffer += ntile.stride * pass_stride; - merged_buffer += merged_stride * pass_stride; - } - } - - /* Denoise */ - denoise_openimagedenoise_buffer( - task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y, 1.0f); - - /* Copy back result from merged buffer. 
*/ - RenderTile &ntile = neighbors.target; - if (ntile.buffer) { - const int xmin = max(ntile.x, rect.x); - const int ymin = max(ntile.y, rect.y); - const int xmax = min(ntile.x + ntile.w, rect.z); - const int ymax = min(ntile.y + ntile.h, rect.w); - - const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride; - float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride; - - const size_t merged_stride = rect_size.x; - const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride; - const float *merged_buffer = merged.data() + merged_offset * pass_stride; - - for (int y = ymin; y < ymax; y++) { - for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) { - tile_buffer[x + 0] = merged_buffer[x + 0] * invscale; - tile_buffer[x + 1] = merged_buffer[x + 1] * invscale; - tile_buffer[x + 2] = merged_buffer[x + 2] * invscale; - } - tile_buffer += ntile.stride * pass_stride; - merged_buffer += merged_stride * pass_stride; - } - } - - task.unmap_neighbor_tiles(neighbors, this); - } - } - - void denoise_nlm(DenoisingTask &denoising, RenderTile &tile) - { - ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); - - tile.sample = tile.start_sample + tile.num_samples; - - denoising.functions.construct_transform = function_bind( - &CPUDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind( - &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind( - &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind( - &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind( - &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - 
denoising.functions.get_feature = function_bind( - &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind( - &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind( - &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); - denoising.render_buffer.samples = tile.sample; - denoising.buffer.gpu_temporary_mem = false; - - denoising.run_denoising(tile); - } - - void thread_render(DeviceTask &task) - { - if (TaskPool::canceled()) { - if (task.need_finish_queue == false) - return; - } - - /* allocate buffer for kernel globals */ - device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals"); - kgbuffer.alloc_to_device(1); - - KernelGlobals *kg = new ((void *)kgbuffer.device_pointer) - KernelGlobals(thread_kernel_globals_init()); - - profiler.add_state(&kg->profiler); - - CPUSplitKernel *split_kernel = NULL; - if (use_split_kernel) { - split_kernel = new CPUSplitKernel(this); - if (!split_kernel->load_kernels(requested_features)) { - thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); - kgbuffer.free(); - delete split_kernel; - return; - } - } - - /* NLM denoiser. */ - DenoisingTask *denoising = NULL; - - /* OpenImageDenoise: we can only denoise with one thread at a time, so to - * avoid waiting with mutex locks in the denoiser, we let only a single - * thread acquire denoising tiles. 
*/ - uint tile_types = task.tile_types; - bool hold_denoise_lock = false; - if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - if (!oidn_task_lock.try_lock()) { - tile_types &= ~RenderTile::DENOISE; - hold_denoise_lock = true; - } - } - - RenderTile tile; - while (task.acquire_tile(this, tile, tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - if (use_split_kernel) { - device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(task, tile, kgbuffer, void_buffer); - } - else { - render(task, tile, kg); - } - } - else if (tile.task == RenderTile::BAKE) { - render(task, tile, kg); - } - else if (tile.task == RenderTile::DENOISE) { - if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - denoise_openimagedenoise(task, tile); - } - else if (task.denoising.type == DENOISER_NLM) { - if (denoising == NULL) { - denoising = new DenoisingTask(this, task); - denoising->profiler = &kg->profiler; - } - denoise_nlm(*denoising, tile); - } - task.update_progress(&tile, tile.w * tile.h); - } - - task.release_tile(tile); - - if (TaskPool::canceled()) { - if (task.need_finish_queue == false) - break; - } - } - - if (hold_denoise_lock) { - oidn_task_lock.unlock(); - } - - profiler.remove_state(&kg->profiler); - - thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); - kg->~KernelGlobals(); - kgbuffer.free(); - delete split_kernel; - delete denoising; - } - - void thread_denoise(DeviceTask &task) - { - RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.sample = task.sample + task.num_samples; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - denoise_openimagedenoise(task, tile); - } - else { - DenoisingTask denoising(this, task); - - 
ProfilingState denoising_profiler_state; - profiler.add_state(&denoising_profiler_state); - denoising.profiler = &denoising_profiler_state; - - denoise_nlm(denoising, tile); - - profiler.remove_state(&denoising_profiler_state); - } - - task.update_progress(&tile, tile.w * tile.h); - } - - void thread_film_convert(DeviceTask &task) - { - float sample_scale = 1.0f / (task.sample + 1); - - if (task.rgba_half) { - for (int y = task.y; y < task.y + task.h; y++) - for (int x = task.x; x < task.x + task.w; x++) - convert_to_half_float_kernel()(&kernel_globals, - (uchar4 *)task.rgba_half, - (float *)task.buffer, - sample_scale, - x, - y, - task.offset, - task.stride); - } - else { - for (int y = task.y; y < task.y + task.h; y++) - for (int x = task.x; x < task.x + task.w; x++) - convert_to_byte_kernel()(&kernel_globals, - (uchar4 *)task.rgba_byte, - (float *)task.buffer, - sample_scale, - x, - y, - task.offset, - task.stride); - } - } - - void thread_shader(DeviceTask &task) - { - KernelGlobals *kg = new KernelGlobals(thread_kernel_globals_init()); - - for (int sample = 0; sample < task.num_samples; sample++) { - for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++) - shader_kernel()(kg, - (uint4 *)task.shader_input, - (float4 *)task.shader_output, - task.shader_eval_type, - task.shader_filter, - x, - task.offset, - sample); - - if (task.get_cancel() || TaskPool::canceled()) - break; - - task.update_progress(NULL); - } - - thread_kernel_globals_free(kg); - delete kg; - } - - virtual int get_split_task_count(DeviceTask &task) override - { - if (task.type == DeviceTask::SHADER) - return task.get_subtask_count(info.cpu_threads, 256); - else - return task.get_subtask_count(info.cpu_threads); - } - - virtual void task_add(DeviceTask &task) override - { - /* Load texture info. 
*/ - load_texture_info(); - - /* split task into smaller ones */ - list<DeviceTask> tasks; - - if (task.type == DeviceTask::DENOISE_BUFFER && - task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - /* Denoise entire buffer at once with OIDN, it has own threading. */ - tasks.push_back(task); - } - else if (task.type == DeviceTask::SHADER) { - task.split(tasks, info.cpu_threads, 256); - } - else { - task.split(tasks, info.cpu_threads); - } - - foreach (DeviceTask &task, tasks) { - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy); - }); - } - } - - virtual void task_wait() override - { - task_pool.wait_work(); - } - - virtual void task_cancel() override - { - task_pool.cancel(); - } - - protected: - inline KernelGlobals thread_kernel_globals_init() - { - KernelGlobals kg = kernel_globals; - kg.transparent_shadow_intersections = NULL; - const int decoupled_count = sizeof(kg.decoupled_volume_steps) / - sizeof(*kg.decoupled_volume_steps); - for (int i = 0; i < decoupled_count; ++i) { - kg.decoupled_volume_steps[i] = NULL; - } - kg.decoupled_volume_steps_index = 0; - kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL; -#ifdef WITH_OSL - OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); -#endif - return kg; - } - - inline void thread_kernel_globals_free(KernelGlobals *kg) - { - if (kg == NULL) { - return; - } - - if (kg->transparent_shadow_intersections != NULL) { - free(kg->transparent_shadow_intersections); - } - const int decoupled_count = sizeof(kg->decoupled_volume_steps) / - sizeof(*kg->decoupled_volume_steps); - for (int i = 0; i < decoupled_count; ++i) { - if (kg->decoupled_volume_steps[i] != NULL) { - free(kg->decoupled_volume_steps[i]); - } - } -#ifdef WITH_OSL - OSLShader::thread_free(kg); -#endif - } - - virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_) override - { - requested_features = requested_features_; - - return true; - } -}; - -/* split kernel */ - -class 
CPUSplitKernelFunction : public SplitKernelFunction { - public: - CPUDevice *device; - void (*func)(KernelGlobals *kg, KernelData *data); - - CPUSplitKernelFunction(CPUDevice *device) : device(device), func(NULL) - { - } - ~CPUSplitKernelFunction() - { - } - - virtual bool enqueue(const KernelDimensions &dim, - device_memory &kernel_globals, - device_memory &data) - { - if (!func) { - return false; - } - - KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; - kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); - - for (int y = 0; y < dim.global_size[1]; y++) { - for (int x = 0; x < dim.global_size[0]; x++) { - kg->global_id = make_int2(x, y); - - func(kg, (KernelData *)data.device_pointer); - } - } - - return true; - } -}; - -CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device) -{ -} - -bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &data, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flags, - device_memory &work_pool_wgs) -{ - KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; - kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); - - for (int y = 0; y < dim.global_size[1]; y++) { - for (int x = 0; x < dim.global_size[0]; x++) { - kg->global_id = make_int2(x, y); - - device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer, - (KernelData *)data.device_pointer, - (void *)split_data.device_pointer, - num_global_elements, - (char *)ray_state.device_pointer, - rtile.start_sample, - rtile.start_sample + rtile.num_samples, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - (int *)queue_index.device_pointer, - dim.global_size[0] * dim.global_size[1], - (char *)use_queues_flags.device_pointer, - (uint 
*)work_pool_wgs.device_pointer, - rtile.num_samples, - (float *)rtile.buffer); - } - } - - return true; -} - -SplitKernelFunction *CPUSplitKernel::get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &) -{ - CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); - - kernel->func = device->split_kernels[kernel_name](); - if (!kernel->func) { - delete kernel; - return NULL; - } - - return kernel; -} - -int2 CPUSplitKernel::split_kernel_local_size() -{ - return make_int2(1, 1); -} - -int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/, - device_memory & /*data*/, - DeviceTask & /*task*/) -{ - return make_int2(1, 1); -} - -uint64_t CPUSplitKernel::state_buffer_size(device_memory &kernel_globals, - device_memory & /*data*/, - size_t num_threads) -{ - KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; - - return split_data_buffer_size(kg, num_threads); -} - -Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return new CPUDevice(info, stats, profiler, background); -} - -void device_cpu_info(vector<DeviceInfo> &devices) -{ - DeviceInfo info; - - info.type = DEVICE_CPU; - info.description = system_cpu_brand_string(); - info.id = "CPU"; - info.num = 0; - info.has_volume_decoupled = true; - info.has_adaptive_stop_per_sample = true; - info.has_osl = true; - info.has_half_images = true; - info.has_nanovdb = true; - info.has_profiling = true; - info.denoisers = DENOISER_NLM; - if (openimagedenoise_supported()) { - info.denoisers |= DENOISER_OPENIMAGEDENOISE; - } - - devices.insert(devices.begin(), info); -} - -string device_cpu_capabilities() -{ - string capabilities = ""; - capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; - capabilities += system_cpu_support_sse3() ? "SSE3 " : ""; - capabilities += system_cpu_support_sse41() ? "SSE41 " : ""; - capabilities += system_cpu_support_avx() ? 
"AVX " : ""; - capabilities += system_cpu_support_avx2() ? "AVX2" : ""; - if (capabilities[capabilities.size() - 1] == ' ') - capabilities.resize(capabilities.size() - 1); - return capabilities; -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoise.cpp b/intern/cycles/device/device_denoise.cpp new file mode 100644 index 00000000000..aea7868f65d --- /dev/null +++ b/intern/cycles/device/device_denoise.cpp @@ -0,0 +1,88 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/device_denoise.h" + +CCL_NAMESPACE_BEGIN + +const char *denoiserTypeToHumanReadable(DenoiserType type) +{ + switch (type) { + case DENOISER_OPTIX: + return "OptiX"; + case DENOISER_OPENIMAGEDENOISE: + return "OpenImageDenoise"; + + case DENOISER_NUM: + case DENOISER_NONE: + case DENOISER_ALL: + return "UNKNOWN"; + } + + return "UNKNOWN"; +} + +const NodeEnum *DenoiseParams::get_type_enum() +{ + static NodeEnum type_enum; + + if (type_enum.empty()) { + type_enum.insert("optix", DENOISER_OPTIX); + type_enum.insert("openimageio", DENOISER_OPENIMAGEDENOISE); + } + + return &type_enum; +} + +const NodeEnum *DenoiseParams::get_prefilter_enum() +{ + static NodeEnum prefilter_enum; + + if (prefilter_enum.empty()) { + prefilter_enum.insert("none", DENOISER_PREFILTER_NONE); + prefilter_enum.insert("fast", DENOISER_PREFILTER_FAST); + prefilter_enum.insert("accurate", DENOISER_PREFILTER_ACCURATE); + } + + return &prefilter_enum; +} + +NODE_DEFINE(DenoiseParams) +{ + NodeType *type = NodeType::add("denoise_params", create); + + const NodeEnum *type_enum = get_type_enum(); + const NodeEnum *prefilter_enum = get_prefilter_enum(); + + SOCKET_BOOLEAN(use, "Use", false); + + SOCKET_ENUM(type, "Type", *type_enum, DENOISER_OPENIMAGEDENOISE); + + SOCKET_INT(start_sample, "Start Sample", 0); + + SOCKET_BOOLEAN(use_pass_albedo, "Use Pass Albedo", true); + SOCKET_BOOLEAN(use_pass_normal, "Use Pass Normal", false); + + SOCKET_ENUM(prefilter, "Prefilter", *prefilter_enum, DENOISER_PREFILTER_FAST); + + return type; +} + +DenoiseParams::DenoiseParams() : Node(get_node_type()) +{ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoise.h b/intern/cycles/device/device_denoise.h new file mode 100644 index 00000000000..dfdc7cc87b3 --- /dev/null +++ b/intern/cycles/device/device_denoise.h @@ -0,0 +1,110 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/device_memory.h" +#include "graph/node.h" +#include "render/buffers.h" + +CCL_NAMESPACE_BEGIN + +enum DenoiserType { + DENOISER_OPTIX = 2, + DENOISER_OPENIMAGEDENOISE = 4, + DENOISER_NUM, + + DENOISER_NONE = 0, + DENOISER_ALL = ~0, +}; + +/* Construct human-readable string which denotes the denoiser type. */ +const char *denoiserTypeToHumanReadable(DenoiserType type); + +typedef int DenoiserTypeMask; + +enum DenoiserPrefilter { + /* Best quality of the result without extra processing time, but requires guiding passes to be + * noise-free. */ + DENOISER_PREFILTER_NONE = 1, + + /* Denoise color and guiding passes together. + * Improves quality when guiding passes are noisy using least amount of extra processing time. */ + DENOISER_PREFILTER_FAST = 2, + + /* Prefilter noisy guiding passes before denoising color. + * Improves quality when guiding passes are noisy using extra processing time. */ + DENOISER_PREFILTER_ACCURATE = 3, + + DENOISER_PREFILTER_NUM, +}; + +/* NOTE: Is not a real scene node. Using Node API for ease of (de)serialization. + * The default values here do not really matter as they are always initialized from the + * Integrator node. */ +class DenoiseParams : public Node { + public: + NODE_DECLARE + + /* Apply denoiser to image. */ + bool use = false; + + /* Denoiser type. */ + DenoiserType type = DENOISER_OPENIMAGEDENOISE; + + /* Viewport start sample. */ + int start_sample = 0; + + /* Auxiliary passes. 
*/ + bool use_pass_albedo = true; + bool use_pass_normal = true; + + DenoiserPrefilter prefilter = DENOISER_PREFILTER_FAST; + + static const NodeEnum *get_type_enum(); + static const NodeEnum *get_prefilter_enum(); + + DenoiseParams(); + + bool modified(const DenoiseParams &other) const + { + return !(use == other.use && type == other.type && start_sample == other.start_sample && + use_pass_albedo == other.use_pass_albedo && + use_pass_normal == other.use_pass_normal && prefilter == other.prefilter); + } +}; + +/* All the parameters needed to perform buffer denoising on a device. + * Is not really a task in its canonical terms (as in, is not an asynchronous running task). Is + * more like a wrapper for all the arguments and parameters needed to perform denoising. Is a + * single place where they are all listed, so that it's not required to modify all device methods + * when these parameters do change. */ +class DeviceDenoiseTask { + public: + DenoiseParams params; + + int num_samples; + + RenderBuffers *render_buffers; + BufferParams buffer_params; + + /* Allow to do in-place modification of the input passes (scaling them down i.e.). This will + * lower the memory footprint of the denoiser but will make input passes "invalid" (from path + * tracer) point of view. */ + bool allow_inplace_modification; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp deleted file mode 100644 index 38c42d15cab..00000000000 --- a/intern/cycles/device/device_denoising.cpp +++ /dev/null @@ -1,353 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "device/device_denoising.h" - -#include "kernel/filter/filter_defines.h" - -CCL_NAMESPACE_BEGIN - -DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) - : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE), - profiler(NULL), - storage(device), - buffer(device), - device(device) -{ - radius = task.denoising.radius; - nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength)); - if (task.denoising.relative_pca) { - pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength)); - } - else { - pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength)); - } - - render_buffer.frame_stride = task.frame_stride; - render_buffer.pass_stride = task.pass_stride; - render_buffer.offset = task.pass_denoising_data; - - target_buffer.pass_stride = task.target_pass_stride; - target_buffer.denoising_clean_offset = task.pass_denoising_clean; - target_buffer.offset = 0; - - functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device); - functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device); - - tile_info = (TileInfo *)tile_info_mem.alloc(sizeof(TileInfo) / sizeof(int)); - tile_info->from_render = task.denoising_from_render ? 
1 : 0; - - tile_info->frames[0] = 0; - tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES); - for (int i = 1; i < tile_info->num_frames; i++) { - tile_info->frames[i] = task.denoising_frames[i - 1]; - } - - do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM; - do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM; -} - -DenoisingTask::~DenoisingTask() -{ - storage.XtWX.free(); - storage.XtWY.free(); - storage.transform.free(); - storage.rank.free(); - buffer.mem.free(); - buffer.temporary_mem.free(); - tile_info_mem.free(); -} - -void DenoisingTask::set_render_buffer(RenderTileNeighbors &neighbors) -{ - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &rtile = neighbors.tiles[i]; - tile_info->offsets[i] = rtile.offset; - tile_info->strides[i] = rtile.stride; - tile_info->buffers[i] = rtile.buffer; - } - tile_info->x[0] = neighbors.tiles[3].x; - tile_info->x[1] = neighbors.tiles[4].x; - tile_info->x[2] = neighbors.tiles[5].x; - tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w; - tile_info->y[0] = neighbors.tiles[1].y; - tile_info->y[1] = neighbors.tiles[4].y; - tile_info->y[2] = neighbors.tiles[7].y; - tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h; - - target_buffer.offset = neighbors.target.offset; - target_buffer.stride = neighbors.target.stride; - target_buffer.ptr = neighbors.target.buffer; - - if (do_prefilter && neighbors.target.buffers) { - target_buffer.denoising_output_offset = - neighbors.target.buffers->params.get_denoising_prefiltered_offset(); - } - else { - target_buffer.denoising_output_offset = 0; - } - - tile_info_mem.copy_to_device(); -} - -void DenoisingTask::setup_denoising_buffer() -{ - /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring - * tiles */ - rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w); - rect = rect_expand(rect, radius); - rect = 
rect_clip(rect, - make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); - - buffer.use_intensity = do_prefilter || (tile_info->num_frames > 1); - buffer.passes = buffer.use_intensity ? 15 : 14; - buffer.width = rect.z - rect.x; - buffer.stride = align_up(buffer.width, 4); - buffer.h = rect.w - rect.y; - int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); - buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); - buffer.frame_stride = buffer.pass_stride * buffer.passes; - /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ - int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats); - buffer.mem.alloc_to_device(mem_size, false); - buffer.use_time = (tile_info->num_frames > 1); - - /* CPUs process shifts sequentially while GPUs process them in parallel. */ - int num_layers; - if (buffer.gpu_temporary_mem) { - /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */ - int max_radius = max(radius, 6); - int num_shifts = (2 * max_radius + 1) * (2 * max_radius + 1); - num_layers = 2 * num_shifts + 1; - } - else { - num_layers = 3; - } - /* Allocate two layers per shift as well as one for the weight accumulation. 
*/ - buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride); -} - -void DenoisingTask::prefilter_shadowing() -{ - device_ptr null_ptr = (device_ptr)0; - - device_sub_ptr unfiltered_a(buffer.mem, 0, buffer.pass_stride); - device_sub_ptr unfiltered_b(buffer.mem, 1 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr sample_var(buffer.mem, 2 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr sample_var_var(buffer.mem, 3 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr buffer_var(buffer.mem, 5 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr filtered_var(buffer.mem, 6 * buffer.pass_stride, buffer.pass_stride); - - /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the - * sample variance and the buffer variance. */ - functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); - - /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the - * sample variance. */ - nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false); - functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); - - /* Reuse memory, the previous data isn't needed anymore. */ - device_ptr filtered_a = *buffer_var, filtered_b = *sample_var; - /* Use the smoothed variance to filter the two shadow half images using each other for weight - * calculation. */ - nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false); - functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); - functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); - - device_ptr residual_var = *sample_var_var; - /* Estimate the residual variance between the two filtered halves. */ - functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); - - device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b; - /* Use the residual variance for a second filter pass. 
*/ - nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false); - functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); - functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); - - /* Combine the two double-filtered halves to a final shadow feature. */ - device_sub_ptr shadow_pass(buffer.mem, 4 * buffer.pass_stride, buffer.pass_stride); - functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); -} - -void DenoisingTask::prefilter_features() -{ - device_sub_ptr unfiltered(buffer.mem, 8 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr variance(buffer.mem, 9 * buffer.pass_stride, buffer.pass_stride); - - int mean_from[] = {0, 1, 2, 12, 6, 7, 8}; - int variance_from[] = {3, 4, 5, 13, 9, 10, 11}; - int pass_to[] = {1, 2, 3, 0, 5, 6, 7}; - for (int pass = 0; pass < 7; pass++) { - device_sub_ptr feature_pass( - buffer.mem, pass_to[pass] * buffer.pass_stride, buffer.pass_stride); - /* Get the unfiltered pass and its variance from the RenderBuffers. */ - functions.get_feature(mean_from[pass], - variance_from[pass], - *unfiltered, - *variance, - 1.0f / render_buffer.samples); - /* Smooth the pass and store the result in the denoising buffers. 
*/ - nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false); - functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); - } -} - -void DenoisingTask::prefilter_color() -{ - int mean_from[] = {20, 21, 22}; - int variance_from[] = {23, 24, 25}; - int mean_to[] = {8, 9, 10}; - int variance_to[] = {11, 12, 13}; - int num_color_passes = 3; - - device_only_memory<float> temporary_color(device, "denoising temporary color"); - temporary_color.alloc_to_device(6 * buffer.pass_stride, false); - - for (int pass = 0; pass < num_color_passes; pass++) { - device_sub_ptr color_pass(temporary_color, pass * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr color_var_pass( - temporary_color, (pass + 3) * buffer.pass_stride, buffer.pass_stride); - functions.get_feature(mean_from[pass], - variance_from[pass], - *color_pass, - *color_var_pass, - 1.0f / render_buffer.samples); - } - - device_sub_ptr depth_pass(buffer.mem, 0, buffer.pass_stride); - device_sub_ptr color_var_pass( - buffer.mem, variance_to[0] * buffer.pass_stride, 3 * buffer.pass_stride); - device_sub_ptr output_pass(buffer.mem, mean_to[0] * buffer.pass_stride, 3 * buffer.pass_stride); - functions.detect_outliers( - temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); - - if (buffer.use_intensity) { - device_sub_ptr intensity_pass(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride); - nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2 * 4.0f, true); - functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass); - } -} - -void DenoisingTask::load_buffer() -{ - device_ptr null_ptr = (device_ptr)0; - - int original_offset = render_buffer.offset; - - int num_passes = buffer.use_intensity ? 
15 : 14; - for (int i = 0; i < tile_info->num_frames; i++) { - for (int pass = 0; pass < num_passes; pass++) { - device_sub_ptr to_pass( - buffer.mem, i * buffer.frame_stride + pass * buffer.pass_stride, buffer.pass_stride); - bool is_variance = (pass >= 11) && (pass <= 13); - functions.get_feature( - pass, -1, *to_pass, null_ptr, is_variance ? (1.0f / render_buffer.samples) : 1.0f); - } - render_buffer.offset += render_buffer.frame_stride; - } - - render_buffer.offset = original_offset; -} - -void DenoisingTask::write_buffer() -{ - reconstruction_state.buffer_params = make_int4(target_buffer.offset, - target_buffer.stride, - target_buffer.pass_stride, - target_buffer.denoising_clean_offset); - int num_passes = buffer.use_intensity ? 15 : 14; - for (int pass = 0; pass < num_passes; pass++) { - device_sub_ptr from_pass(buffer.mem, pass * buffer.pass_stride, buffer.pass_stride); - int out_offset = pass + target_buffer.denoising_output_offset; - functions.write_feature(out_offset, *from_pass, target_buffer.ptr); - } -} - -void DenoisingTask::construct_transform() -{ - storage.w = filter_area.z; - storage.h = filter_area.w; - - storage.transform.alloc_to_device(storage.w * storage.h * TRANSFORM_SIZE, false); - storage.rank.alloc_to_device(storage.w * storage.h, false); - - functions.construct_transform(); -} - -void DenoisingTask::reconstruct() -{ - storage.XtWX.alloc_to_device(storage.w * storage.h * XTWX_SIZE, false); - storage.XtWY.alloc_to_device(storage.w * storage.h * XTWY_SIZE, false); - storage.XtWX.zero_to_device(); - storage.XtWY.zero_to_device(); - - reconstruction_state.filter_window = rect_from_shape( - filter_area.x - rect.x, filter_area.y - rect.y, storage.w, storage.h); - int tile_coordinate_offset = filter_area.y * target_buffer.stride + filter_area.x; - reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset, - target_buffer.stride, - target_buffer.pass_stride, - target_buffer.denoising_clean_offset); - 
reconstruction_state.source_w = rect.z - rect.x; - reconstruction_state.source_h = rect.w - rect.y; - - device_sub_ptr color_ptr(buffer.mem, 8 * buffer.pass_stride, 3 * buffer.pass_stride); - device_sub_ptr color_var_ptr(buffer.mem, 11 * buffer.pass_stride, 3 * buffer.pass_stride); - for (int f = 0; f < tile_info->num_frames; f++) { - device_ptr scale_ptr = 0; - device_sub_ptr *scale_sub_ptr = NULL; - if (tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) { - scale_sub_ptr = new device_sub_ptr(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride); - scale_ptr = **scale_sub_ptr; - } - - functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f); - delete scale_sub_ptr; - } - functions.solve(target_buffer.ptr); -} - -void DenoisingTask::run_denoising(RenderTile &tile) -{ - RenderTileNeighbors neighbors(tile); - functions.map_neighbor_tiles(neighbors); - set_render_buffer(neighbors); - - setup_denoising_buffer(); - - if (tile_info->from_render) { - prefilter_shadowing(); - prefilter_features(); - prefilter_color(); - } - else { - load_buffer(); - } - - if (do_filter) { - construct_transform(); - reconstruct(); - } - - if (do_prefilter) { - write_buffer(); - } - - functions.unmap_neighbor_tiles(neighbors); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h deleted file mode 100644 index bb8bdfdd225..00000000000 --- a/intern/cycles/device/device_denoising.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_DENOISING_H__ -#define __DEVICE_DENOISING_H__ - -#include "device/device.h" - -#include "render/buffers.h" - -#include "kernel/filter/filter_defines.h" - -#include "util/util_profiling.h" - -CCL_NAMESPACE_BEGIN - -class DenoisingTask { - public: - /* Parameters of the denoising algorithm. */ - int radius; - float nlm_k_2; - float pca_threshold; - - /* Parameters of the RenderBuffers. */ - struct RenderBuffers { - int offset; - int pass_stride; - int frame_stride; - int samples; - } render_buffer; - - /* Pointer and parameters of the target buffer. */ - struct TargetBuffer { - int offset; - int stride; - int pass_stride; - int denoising_clean_offset; - int denoising_output_offset; - device_ptr ptr; - } target_buffer; - - TileInfo *tile_info; - device_vector<int> tile_info_mem; - - ProfilingState *profiler; - - int4 rect; - int4 filter_area; - - bool do_prefilter; - bool do_filter; - - struct DeviceFunctions { - function<bool( - device_ptr image_ptr, /* Contains the values that are smoothed. */ - device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ - device_ptr variance_ptr, /* Contains the variance of the guide image. */ - device_ptr out_ptr /* The filtered output is written into this image. 
*/ - )> - non_local_means; - function<bool( - device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)> - accumulate; - function<bool(device_ptr output_ptr)> solve; - function<bool()> construct_transform; - - function<bool(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect)> - combine_halves; - function<bool(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr)> - divide_shadow; - function<bool(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale)> - get_feature; - function<bool(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr)> - detect_outliers; - function<bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature; - function<void(RenderTileNeighbors &neighbors)> map_neighbor_tiles; - function<void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles; - } functions; - - /* Stores state of the current Reconstruction operation, - * which is accessed by the device in order to perform the operation. */ - struct ReconstructionState { - int4 filter_window; - int4 buffer_params; - - int source_w; - int source_h; - } reconstruction_state; - - /* Stores state of the current NLM operation, - * which is accessed by the device in order to perform the operation. */ - struct NLMState { - int r; /* Search radius of the filter. */ - int f; /* Patch size of the filter. */ - float a; /* Variance compensation factor in the MSE estimation. */ - float k_2; /* Squared value of the k parameter of the filter. 
*/ - bool is_color; - - void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) - { - r = r_; - f = f_; - a = a_, k_2 = k_2_; - is_color = is_color_; - } - } nlm_state; - - struct Storage { - device_only_memory<float> transform; - device_only_memory<int> rank; - device_only_memory<float> XtWX; - device_only_memory<float3> XtWY; - int w; - int h; - - Storage(Device *device) - : transform(device, "denoising transform"), - rank(device, "denoising rank"), - XtWX(device, "denoising XtWX"), - XtWY(device, "denoising XtWY") - { - } - } storage; - - DenoisingTask(Device *device, const DeviceTask &task); - ~DenoisingTask(); - - void run_denoising(RenderTile &tile); - - struct DenoiseBuffers { - int pass_stride; - int passes; - int stride; - int h; - int width; - int frame_stride; - device_only_memory<float> mem; - device_only_memory<float> temporary_mem; - bool use_time; - bool use_intensity; - - bool gpu_temporary_mem; - - DenoiseBuffers(Device *device) - : mem(device, "denoising pixel buffer"), - temporary_mem(device, "denoising temporary mem", true) - { - } - } buffer; - - protected: - Device *device; - - void set_render_buffer(RenderTileNeighbors &neighbors); - void setup_denoising_buffer(); - void prefilter_shadowing(); - void prefilter_features(); - void prefilter_color(); - void construct_transform(); - void reconstruct(); - - void load_buffer(); - void write_buffer(); -}; - -CCL_NAMESPACE_END - -#endif /* __DEVICE_DENOISING_H__ */ diff --git a/intern/cycles/device/device_graphics_interop.cpp b/intern/cycles/device/device_graphics_interop.cpp new file mode 100644 index 00000000000..a80a236759f --- /dev/null +++ b/intern/cycles/device/device_graphics_interop.cpp @@ -0,0 +1,21 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/device_graphics_interop.h" + +CCL_NAMESPACE_BEGIN + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_graphics_interop.h b/intern/cycles/device/device_graphics_interop.h new file mode 100644 index 00000000000..671b1c189d7 --- /dev/null +++ b/intern/cycles/device/device_graphics_interop.h @@ -0,0 +1,55 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +/* Information about interoperability destination. + * Is provided by the GPUDisplay. */ +class DeviceGraphicsInteropDestination { + public: + /* Dimensions of the buffer, in pixels. */ + int buffer_width = 0; + int buffer_height = 0; + + /* OpenGL pixel buffer object. */ + int opengl_pbo_id = 0; + + /* Clear the entire destination before doing partial write to it. */ + bool need_clear = false; +}; + +/* Device-side graphics interoperability support. 
+ * + * Takes care of holding all the handlers needed by the device to implement interoperability with + * the graphics library. */ +class DeviceGraphicsInterop { + public: + DeviceGraphicsInterop() = default; + virtual ~DeviceGraphicsInterop() = default; + + /* Update this device-side graphics interoperability object with the given destination resource + * information. */ + virtual void set_destination(const DeviceGraphicsInteropDestination &destination) = 0; + + virtual device_ptr map() = 0; + virtual void unmap() = 0; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h deleted file mode 100644 index ecc79c5d7ee..00000000000 --- a/intern/cycles/device/device_intern.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __DEVICE_INTERN_H__ -#define __DEVICE_INTERN_H__ - -#include "util/util_string.h" -#include "util/util_vector.h" - -CCL_NAMESPACE_BEGIN - -class Device; -class DeviceInfo; -class Profiler; -class Stats; - -Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -bool device_opencl_init(); -Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -bool device_opencl_compile_kernel(const vector<string> ¶meters); -bool device_cuda_init(); -Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -bool device_optix_init(); -Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); - -Device *device_network_create(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - const char *address); -Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); - -void device_cpu_info(vector<DeviceInfo> &devices); -void device_opencl_info(vector<DeviceInfo> &devices); -void device_cuda_info(vector<DeviceInfo> &devices); -void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices); -void device_network_info(vector<DeviceInfo> &devices); - -string device_cpu_capabilities(); -string device_opencl_capabilities(); -string device_cuda_capabilities(); - -CCL_NAMESPACE_END - -#endif /* __DEVICE_INTERN_H__ */ diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp new file mode 100644 index 00000000000..ceaddee4756 --- /dev/null +++ b/intern/cycles/device/device_kernel.cpp @@ -0,0 +1,157 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/device_kernel.h" + +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +const char *device_kernel_as_string(DeviceKernel kernel) +{ + switch (kernel) { + /* Integrator. */ + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: + return "integrator_init_from_camera"; + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: + return "integrator_init_from_bake"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + return "integrator_intersect_closest"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + return "integrator_intersect_shadow"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + return "integrator_intersect_subsurface"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: + return "integrator_intersect_volume_stack"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + return "integrator_shade_background"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + return "integrator_shade_light"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + return "integrator_shade_shadow"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + return "integrator_shade_surface"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + return "integrator_shade_surface_raytrace"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: + return "integrator_shade_volume"; + case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL: + return "integrator_megakernel"; + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + return "integrator_queued_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + return "integrator_queued_shadow_paths_array"; + case 
DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + return "integrator_active_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + return "integrator_terminated_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + return "integrator_sorted_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + return "integrator_compact_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: + return "integrator_compact_states"; + case DEVICE_KERNEL_INTEGRATOR_RESET: + return "integrator_reset"; + case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: + return "integrator_shadow_catcher_count_possible_splits"; + + /* Shader evaluation. */ + case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: + return "shader_eval_displace"; + case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: + return "shader_eval_background"; + + /* Film. */ + +#define FILM_CONVERT_KERNEL_AS_STRING(variant, variant_lowercase) \ + case DEVICE_KERNEL_FILM_CONVERT_##variant: \ + return "film_convert_" #variant_lowercase; \ + case DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA: \ + return "film_convert_" #variant_lowercase "_half_rgba"; + + FILM_CONVERT_KERNEL_AS_STRING(DEPTH, depth) + FILM_CONVERT_KERNEL_AS_STRING(MIST, mist) + FILM_CONVERT_KERNEL_AS_STRING(SAMPLE_COUNT, sample_count) + FILM_CONVERT_KERNEL_AS_STRING(FLOAT, float) + FILM_CONVERT_KERNEL_AS_STRING(LIGHT_PATH, light_path) + FILM_CONVERT_KERNEL_AS_STRING(FLOAT3, float3) + FILM_CONVERT_KERNEL_AS_STRING(MOTION, motion) + FILM_CONVERT_KERNEL_AS_STRING(CRYPTOMATTE, cryptomatte) + FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER, shadow_catcher) + FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER_MATTE_WITH_SHADOW, + shadow_catcher_matte_with_shadow) + FILM_CONVERT_KERNEL_AS_STRING(COMBINED, combined) + FILM_CONVERT_KERNEL_AS_STRING(FLOAT4, float4) + +#undef FILM_CONVERT_KERNEL_AS_STRING + + /* Adaptive sampling. 
*/ + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK: + return "adaptive_sampling_convergence_check"; + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X: + return "adaptive_sampling_filter_x"; + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y: + return "adaptive_sampling_filter_y"; + + /* Denoising. */ + case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS: + return "filter_guiding_preprocess"; + case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO: + return "filter_guiding_set_fake_albedo"; + case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS: + return "filter_color_preprocess"; + case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS: + return "filter_color_postprocess"; + + /* Cryptomatte. */ + case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS: + return "cryptomatte_postprocess"; + + /* Generic */ + case DEVICE_KERNEL_PREFIX_SUM: + return "prefix_sum"; + + case DEVICE_KERNEL_NUM: + break; + }; + LOG(FATAL) << "Unhandled kernel " << static_cast<int>(kernel) << ", should never happen."; + return "UNKNOWN"; +} + +std::ostream &operator<<(std::ostream &os, DeviceKernel kernel) +{ + os << device_kernel_as_string(kernel); + return os; +} + +string device_kernel_mask_as_string(DeviceKernelMask mask) +{ + string str; + + for (uint64_t i = 0; i < sizeof(DeviceKernelMask) * 8; i++) { + if (mask & (uint64_t(1) << i)) { + if (!str.empty()) { + str += " "; + } + str += device_kernel_as_string((DeviceKernel)i); + } + } + + return str; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_kernel.h b/intern/cycles/device/device_kernel.h new file mode 100644 index 00000000000..83d959ca87b --- /dev/null +++ b/intern/cycles/device/device_kernel.h @@ -0,0 +1,33 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/kernel_types.h" + +#include "util/util_string.h" + +#include <ostream> // NOLINT + +CCL_NAMESPACE_BEGIN + +const char *device_kernel_as_string(DeviceKernel kernel); +std::ostream &operator<<(std::ostream &os, DeviceKernel kernel); + +typedef uint64_t DeviceKernelMask; +string device_kernel_mask_as_string(DeviceKernelMask mask); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp index 80a05fc32fe..c4d45829b83 100644 --- a/intern/cycles/device/device_memory.cpp +++ b/intern/cycles/device/device_memory.cpp @@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN device_memory::device_memory(Device *device, const char *name, MemoryType type) : data_type(device_type_traits<uchar>::data_type), - data_elements(device_type_traits<uchar>::num_elements), + data_elements(device_type_traits<uchar>::num_elements_cpu), data_size(0), device_size(0), data_width(0), @@ -149,6 +149,11 @@ void device_memory::device_zero() } } +bool device_memory::device_is_cpu() +{ + return (device->info.type == DEVICE_CPU); +} + void device_memory::swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr) diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 80f4d7b0468..c51594b8580 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -38,7 +38,6 @@ enum MemoryType { MEM_DEVICE_ONLY, MEM_GLOBAL, MEM_TEXTURE, - MEM_PIXELS }; /* Supported Data Types */ @@ -54,7 +53,7 @@ enum DataType { 
TYPE_UINT64, }; -static inline size_t datatype_size(DataType datatype) +static constexpr size_t datatype_size(DataType datatype) { switch (datatype) { case TYPE_UNKNOWN: @@ -82,112 +81,155 @@ static inline size_t datatype_size(DataType datatype) template<typename T> struct device_type_traits { static const DataType data_type = TYPE_UNKNOWN; - static const int num_elements = sizeof(T); + static const int num_elements_cpu = sizeof(T); + static const int num_elements_gpu = sizeof(T); }; template<> struct device_type_traits<uchar> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar2> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar3> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 3; + static const int num_elements_cpu = 3; + static const int num_elements_gpu = 3; + static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar4> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type)); }; template<> struct 
device_type_traits<uint2> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint3> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 3; + static const int num_elements_cpu = 3; + static const int num_elements_gpu = 3; + static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint4> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int> { static const DataType data_type = TYPE_INT; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int2> { static const DataType data_type = TYPE_INT; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int3> { static const DataType data_type = TYPE_INT; - static const int num_elements = 3; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 3; + static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int4> { static const DataType data_type = TYPE_INT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(int4) == num_elements_cpu * 
datatype_size(data_type)); }; template<> struct device_type_traits<float> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float2> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float3> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 3; + static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float4> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<half> { static const DataType data_type = TYPE_HALF; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<ushort4> { static const DataType data_type = TYPE_UINT16; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint16_t> { static const DataType data_type = TYPE_UINT16; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int 
num_elements_gpu = 1; + static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<half4> { static const DataType data_type = TYPE_HALF; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint64_t> { static const DataType data_type = TYPE_UINT64; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type)); }; /* Device Memory @@ -257,6 +299,8 @@ class device_memory { void device_copy_from(int y, int w, int h, int elem); void device_zero(); + bool device_is_cpu(); + device_ptr original_device_ptr; size_t original_device_size; Device *original_device; @@ -275,7 +319,9 @@ template<typename T> class device_only_memory : public device_memory { : device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY) { data_type = device_type_traits<T>::data_type; - data_elements = max(device_type_traits<T>::num_elements, 1); + data_elements = max(device_is_cpu() ? device_type_traits<T>::num_elements_cpu : + device_type_traits<T>::num_elements_gpu, + 1); } device_only_memory(device_only_memory &&other) noexcept : device_memory(std::move(other)) @@ -331,11 +377,15 @@ template<typename T> class device_only_memory : public device_memory { template<typename T> class device_vector : public device_memory { public: + /* Can only use this for types that have the same size on CPU and GPU. 
*/ + static_assert(device_type_traits<T>::num_elements_cpu == + device_type_traits<T>::num_elements_gpu); + device_vector(Device *device, const char *name, MemoryType type) : device_memory(device, name, type) { data_type = device_type_traits<T>::data_type; - data_elements = device_type_traits<T>::num_elements; + data_elements = device_type_traits<T>::num_elements_cpu; modified = true; need_realloc_ = true; @@ -477,6 +527,11 @@ template<typename T> class device_vector : public device_memory { return (T *)host_pointer; } + const T *data() const + { + return (T *)host_pointer; + } + T &operator[](size_t i) { assert(i < data_size); @@ -507,7 +562,7 @@ template<typename T> class device_vector : public device_memory { void copy_from_device() { - device_copy_from(0, data_width, data_height, sizeof(T)); + device_copy_from(0, data_width, (data_height == 0) ? 1 : data_height, sizeof(T)); } void copy_from_device(int y, int w, int h) @@ -535,33 +590,6 @@ template<typename T> class device_vector : public device_memory { } }; -/* Pixel Memory - * - * Device memory to efficiently draw as pixels to the screen in interactive - * rendering. Only copying pixels from the device is supported, not copying to. */ - -template<typename T> class device_pixels : public device_vector<T> { - public: - device_pixels(Device *device, const char *name) : device_vector<T>(device, name, MEM_PIXELS) - { - } - - void alloc_to_device(size_t width, size_t height, size_t depth = 0) - { - device_vector<T>::alloc(width, height, depth); - - if (!device_memory::device_pointer) { - device_memory::device_alloc(); - } - } - - T *copy_from_device(int y, int w, int h) - { - device_memory::device_copy_from(y, w, h, sizeof(T)); - return device_vector<T>::data(); - } -}; - /* Device Sub Memory * * Pointer into existing memory. 
It is not allocated separately, but created diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp deleted file mode 100644 index 85ffa5fcd52..00000000000 --- a/intern/cycles/device/device_multi.cpp +++ /dev/null @@ -1,826 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <sstream> -#include <stdlib.h> - -#include "bvh/bvh_multi.h" - -#include "device/device.h" -#include "device/device_intern.h" -#include "device/device_network.h" - -#include "render/buffers.h" -#include "render/geometry.h" - -#include "util/util_foreach.h" -#include "util/util_list.h" -#include "util/util_logging.h" -#include "util/util_map.h" -#include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -class MultiDevice : public Device { - public: - struct SubDevice { - Stats stats; - Device *device; - map<device_ptr, device_ptr> ptr_map; - int peer_island_index = -1; - }; - - list<SubDevice> devices, denoising_devices; - device_ptr unique_key; - vector<vector<SubDevice *>> peer_islands; - bool use_denoising; - bool matching_rendering_and_denoising_devices; - - MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) - : Device(info, stats, profiler, background_), - unique_key(1), - use_denoising(!info.denoising_devices.empty()) - { - foreach (DeviceInfo &subinfo, info.multi_devices) { - /* Always add CPU devices at the back since GPU devices can change - * host 
memory pointers, which CPU uses as device pointer. */ - SubDevice *sub; - if (subinfo.type == DEVICE_CPU) { - devices.emplace_back(); - sub = &devices.back(); - } - else { - devices.emplace_front(); - sub = &devices.front(); - } - - /* The pointer to 'sub->stats' will stay valid even after new devices - * are added, since 'devices' is a linked list. */ - sub->device = Device::create(subinfo, sub->stats, profiler, background); - } - - foreach (DeviceInfo &subinfo, info.denoising_devices) { - denoising_devices.emplace_front(); - SubDevice *sub = &denoising_devices.front(); - - sub->device = Device::create(subinfo, sub->stats, profiler, background); - } - - /* Build a list of peer islands for the available render devices */ - foreach (SubDevice &sub, devices) { - /* First ensure that every device is in at least once peer island */ - if (sub.peer_island_index < 0) { - peer_islands.emplace_back(); - sub.peer_island_index = (int)peer_islands.size() - 1; - peer_islands[sub.peer_island_index].push_back(&sub); - } - - if (!info.has_peer_memory) { - continue; - } - - /* Second check peer access between devices and fill up the islands accordingly */ - foreach (SubDevice &peer_sub, devices) { - if (peer_sub.peer_island_index < 0 && - peer_sub.device->info.type == sub.device->info.type && - peer_sub.device->check_peer_access(sub.device)) { - peer_sub.peer_island_index = sub.peer_island_index; - peer_islands[sub.peer_island_index].push_back(&peer_sub); - } - } - } - - /* Try to re-use memory when denoising and render devices use the same physical devices - * (e.g. OptiX denoising and CUDA rendering device pointing to the same GPU). - * Ordering has to match as well, so that 'DeviceTask::split' behaves consistent. 
*/ - matching_rendering_and_denoising_devices = denoising_devices.empty() || - (devices.size() == denoising_devices.size()); - if (matching_rendering_and_denoising_devices) { - for (list<SubDevice>::iterator device_it = devices.begin(), - denoising_device_it = denoising_devices.begin(); - device_it != devices.end() && denoising_device_it != denoising_devices.end(); - ++device_it, ++denoising_device_it) { - const DeviceInfo &info = device_it->device->info; - const DeviceInfo &denoising_info = denoising_device_it->device->info; - if ((info.type != DEVICE_CUDA && info.type != DEVICE_OPTIX) || - (denoising_info.type != DEVICE_CUDA && denoising_info.type != DEVICE_OPTIX) || - info.num != denoising_info.num) { - matching_rendering_and_denoising_devices = false; - break; - } - } - } - -#ifdef WITH_NETWORK - /* try to add network devices */ - ServerDiscovery discovery(true); - time_sleep(1.0); - - vector<string> servers = discovery.get_server_list(); - - foreach (string &server, servers) { - Device *device = device_network_create(info, stats, profiler, server.c_str()); - if (device) - devices.push_back(SubDevice(device)); - } -#endif - } - - ~MultiDevice() - { - foreach (SubDevice &sub, devices) - delete sub.device; - foreach (SubDevice &sub, denoising_devices) - delete sub.device; - } - - const string &error_message() override - { - error_msg.clear(); - - foreach (SubDevice &sub, devices) - error_msg += sub.device->error_message(); - foreach (SubDevice &sub, denoising_devices) - error_msg += sub.device->error_message(); - - return error_msg; - } - - virtual bool show_samples() const override - { - if (devices.size() > 1) { - return false; - } - return devices.front().device->show_samples(); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const override - { - BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL; - BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE; - foreach (const SubDevice &sub_device, devices) { - BVHLayoutMask device_bvh_layout_mask = 
sub_device.device->get_bvh_layout_mask(); - bvh_layout_mask &= device_bvh_layout_mask; - bvh_layout_mask_all |= device_bvh_layout_mask; - } - - /* With multiple OptiX devices, every device needs its own acceleration structure */ - if (bvh_layout_mask == BVH_LAYOUT_OPTIX) { - return BVH_LAYOUT_MULTI_OPTIX; - } - - /* When devices do not share a common BVH layout, fall back to creating one for each */ - const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE); - if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) { - return BVH_LAYOUT_MULTI_OPTIX_EMBREE; - } - - return bvh_layout_mask; - } - - bool load_kernels(const DeviceRequestedFeatures &requested_features) override - { - foreach (SubDevice &sub, devices) - if (!sub.device->load_kernels(requested_features)) - return false; - - use_denoising = requested_features.use_denoising; - if (requested_features.use_denoising) { - /* Only need denoising feature, everything else is unused. */ - DeviceRequestedFeatures denoising_features; - denoising_features.use_denoising = true; - foreach (SubDevice &sub, denoising_devices) - if (!sub.device->load_kernels(denoising_features)) - return false; - } - - return true; - } - - bool wait_for_availability(const DeviceRequestedFeatures &requested_features) override - { - foreach (SubDevice &sub, devices) - if (!sub.device->wait_for_availability(requested_features)) - return false; - - if (requested_features.use_denoising) { - foreach (SubDevice &sub, denoising_devices) - if (!sub.device->wait_for_availability(requested_features)) - return false; - } - - return true; - } - - DeviceKernelStatus get_active_kernel_switch_state() override - { - DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL; - - foreach (SubDevice &sub, devices) { - DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state(); - switch (subresult) { - case DEVICE_KERNEL_FEATURE_KERNEL_INVALID: - case 
DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE: - return subresult; - - case DEVICE_KERNEL_USING_FEATURE_KERNEL: - case DEVICE_KERNEL_UNKNOWN: - break; - } - } - - return result; - } - - void build_bvh(BVH *bvh, Progress &progress, bool refit) override - { - /* Try to build and share a single acceleration structure, if possible */ - if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) { - devices.back().device->build_bvh(bvh, progress, refit); - return; - } - - assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX || - bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE); - - BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh); - bvh_multi->sub_bvhs.resize(devices.size()); - - vector<BVHMulti *> geom_bvhs; - geom_bvhs.reserve(bvh->geometry.size()); - foreach (Geometry *geom, bvh->geometry) { - geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh)); - } - - /* Broadcast acceleration structure build to all render devices */ - size_t i = 0; - foreach (SubDevice &sub, devices) { - /* Change geometry BVH pointers to the sub BVH */ - for (size_t k = 0; k < bvh->geometry.size(); ++k) { - bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i]; - } - - if (!bvh_multi->sub_bvhs[i]) { - BVHParams params = bvh->params; - if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX) - params.bvh_layout = BVH_LAYOUT_OPTIX; - else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) - params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? 
BVH_LAYOUT_OPTIX : - BVH_LAYOUT_EMBREE; - - /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree - * (since they are put into the top level directly, see bvh_embree.cpp) */ - if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE && - !bvh->geometry[0]->is_instanced()) { - i++; - continue; - } - - bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device); - } - - sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit); - i++; - } - - /* Change geometry BVH pointers back to the multi BVH. */ - for (size_t k = 0; k < bvh->geometry.size(); ++k) { - bvh->geometry[k]->bvh = geom_bvhs[k]; - } - } - - virtual void *osl_memory() override - { - if (devices.size() > 1) { - return NULL; - } - return devices.front().device->osl_memory(); - } - - bool is_resident(device_ptr key, Device *sub_device) override - { - foreach (SubDevice &sub, devices) { - if (sub.device == sub_device) { - return find_matching_mem_device(key, sub)->device == sub_device; - } - } - return false; - } - - SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub) - { - assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end())); - - /* Get the memory owner of this key (first try current device, then peer devices) */ - SubDevice *owner_sub = ⊂ - if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) { - foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) { - if (island_sub != owner_sub && - island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) { - owner_sub = island_sub; - } - } - } - return owner_sub; - } - - SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island) - { - assert(!island.empty()); - - /* Get the memory owner of this key or the device with the lowest memory usage when new */ - SubDevice *owner_sub = island.front(); - foreach (SubDevice *island_sub, island) { - if (key ? 
(island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) : - (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) { - owner_sub = island_sub; - } - } - return owner_sub; - } - - inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub) - { - return find_matching_mem_device(key, sub)->ptr_map[key]; - } - - void mem_alloc(device_memory &mem) override - { - device_ptr key = unique_key++; - - if (mem.type == MEM_PIXELS) { - /* Always allocate pixels memory on all devices - * This is necessary to ensure PBOs are registered everywhere, which FILM_CONVERT uses */ - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = 0; - mem.device_size = 0; - - sub.device->mem_alloc(mem); - sub.ptr_map[key] = mem.device_pointer; - } - } - else { - assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE || - mem.type == MEM_DEVICE_ONLY); - /* The remaining memory types can be distributed across devices */ - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_suitable_mem_device(key, island); - mem.device = owner_sub->device; - mem.device_pointer = 0; - mem.device_size = 0; - - owner_sub->device->mem_alloc(mem); - owner_sub->ptr_map[key] = mem.device_pointer; - } - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size); - } - - void mem_copy_to(device_memory &mem) override - { - device_ptr existing_key = mem.device_pointer; - device_ptr key = (existing_key) ? existing_key : unique_key++; - size_t existing_size = mem.device_size; - - /* The tile buffers are allocated on each device (see below), so copy to all of them */ - if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) { - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = (existing_key) ? 
sub.ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - sub.device->mem_copy_to(mem); - sub.ptr_map[key] = mem.device_pointer; - } - } - else { - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); - mem.device = owner_sub->device; - mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - owner_sub->device->mem_copy_to(mem); - owner_sub->ptr_map[key] = mem.device_pointer; - - if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) { - /* Need to create texture objects and update pointer in kernel globals on all devices */ - foreach (SubDevice *island_sub, island) { - if (island_sub != owner_sub) { - island_sub->device->mem_copy_to(mem); - } - } - } - } - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size - existing_size); - } - - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override - { - device_ptr key = mem.device_pointer; - int i = 0, sub_h = h / devices.size(); - - foreach (SubDevice &sub, devices) { - int sy = y + i * sub_h; - int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; - - SubDevice *owner_sub = find_matching_mem_device(key, sub); - mem.device = owner_sub->device; - mem.device_pointer = owner_sub->ptr_map[key]; - - owner_sub->device->mem_copy_from(mem, sy, w, sh, elem); - i++; - } - - mem.device = this; - mem.device_pointer = key; - } - - void mem_zero(device_memory &mem) override - { - device_ptr existing_key = mem.device_pointer; - device_ptr key = (existing_key) ? 
existing_key : unique_key++; - size_t existing_size = mem.device_size; - - /* This is a hack to only allocate the tile buffers on denoising devices - * Similarly the tile buffers also need to be allocated separately on all devices so any - * overlap rendered for denoising does not interfere with each other */ - if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) { - vector<device_ptr> device_pointers; - device_pointers.reserve(devices.size()); - - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - sub.device->mem_zero(mem); - sub.ptr_map[key] = mem.device_pointer; - - device_pointers.push_back(mem.device_pointer); - } - foreach (SubDevice &sub, denoising_devices) { - if (matching_rendering_and_denoising_devices) { - sub.ptr_map[key] = device_pointers.front(); - device_pointers.erase(device_pointers.begin()); - } - else { - mem.device = sub.device; - mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - sub.device->mem_zero(mem); - sub.ptr_map[key] = mem.device_pointer; - } - } - } - else { - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); - mem.device = owner_sub->device; - mem.device_pointer = (existing_key) ? 
owner_sub->ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - owner_sub->device->mem_zero(mem); - owner_sub->ptr_map[key] = mem.device_pointer; - } - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size - existing_size); - } - - void mem_free(device_memory &mem) override - { - device_ptr key = mem.device_pointer; - size_t existing_size = mem.device_size; - - /* Free memory that was allocated for all devices (see above) on each device */ - if (mem.type == MEM_PIXELS || (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising)) { - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = sub.ptr_map[key]; - mem.device_size = existing_size; - - sub.device->mem_free(mem); - sub.ptr_map.erase(sub.ptr_map.find(key)); - } - foreach (SubDevice &sub, denoising_devices) { - if (matching_rendering_and_denoising_devices) { - sub.ptr_map.erase(key); - } - else { - mem.device = sub.device; - mem.device_pointer = sub.ptr_map[key]; - mem.device_size = existing_size; - - sub.device->mem_free(mem); - sub.ptr_map.erase(sub.ptr_map.find(key)); - } - } - } - else { - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_matching_mem_device(key, *island.front()); - mem.device = owner_sub->device; - mem.device_pointer = owner_sub->ptr_map[key]; - mem.device_size = existing_size; - - owner_sub->device->mem_free(mem); - owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key)); - - if (mem.type == MEM_TEXTURE) { - /* Free texture objects on all devices */ - foreach (SubDevice *island_sub, island) { - if (island_sub != owner_sub) { - island_sub->device->mem_free(mem); - } - } - } - } - } - - mem.device = this; - mem.device_pointer = 0; - mem.device_size = 0; - stats.mem_free(existing_size); - } - - void const_copy_to(const char *name, void *host, size_t size) override - { - foreach (SubDevice &sub, devices) - sub.device->const_copy_to(name, host, size); - } - - void 
draw_pixels(device_memory &rgba, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) override - { - assert(rgba.type == MEM_PIXELS); - - device_ptr key = rgba.device_pointer; - int i = 0, sub_h = h / devices.size(); - int sub_height = height / devices.size(); - - foreach (SubDevice &sub, devices) { - int sy = y + i * sub_h; - int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; - int sheight = (i == (int)devices.size() - 1) ? height - sub_height * i : sub_height; - int sdy = dy + i * sub_height; - /* adjust math for w/width */ - - rgba.device_pointer = sub.ptr_map[key]; - sub.device->draw_pixels( - rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params); - i++; - } - - rgba.device_pointer = key; - } - - void map_tile(Device *sub_device, RenderTile &tile) override - { - if (!tile.buffer) { - return; - } - - foreach (SubDevice &sub, devices) { - if (sub.device == sub_device) { - tile.buffer = find_matching_mem(tile.buffer, sub); - return; - } - } - - foreach (SubDevice &sub, denoising_devices) { - if (sub.device == sub_device) { - tile.buffer = sub.ptr_map[tile.buffer]; - return; - } - } - } - - int device_number(Device *sub_device) override - { - int i = 0; - - foreach (SubDevice &sub, devices) { - if (sub.device == sub_device) - return i; - i++; - } - - foreach (SubDevice &sub, denoising_devices) { - if (sub.device == sub_device) - return i; - i++; - } - - return -1; - } - - void map_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override - { - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &tile = neighbors.tiles[i]; - - if (!tile.buffers) { - continue; - } - - device_vector<float> &mem = tile.buffers->buffer; - tile.buffer = mem.device_pointer; - - if (mem.device == this && matching_rendering_and_denoising_devices) { - /* Skip unnecessary copies in viewport mode (buffer covers the - * whole 
image), but still need to fix up the tile device pointer. */ - map_tile(sub_device, tile); - continue; - } - - /* If the tile was rendered on another device, copy its memory to - * to the current device now, for the duration of the denoising task. - * Note that this temporarily modifies the RenderBuffers and calls - * the device, so this function is not thread safe. */ - if (mem.device != sub_device) { - /* Only copy from device to host once. This is faster, but - * also required for the case where a CPU thread is denoising - * a tile rendered on the GPU. In that case we have to avoid - * overwriting the buffer being de-noised by the CPU thread. */ - if (!tile.buffers->map_neighbor_copied) { - tile.buffers->map_neighbor_copied = true; - mem.copy_from_device(); - } - - if (mem.device == this) { - /* Can re-use memory if tile is already allocated on the sub device. */ - map_tile(sub_device, tile); - mem.swap_device(sub_device, mem.device_size, tile.buffer); - } - else { - mem.swap_device(sub_device, 0, 0); - } - - mem.copy_to_device(); - - tile.buffer = mem.device_pointer; - tile.device_size = mem.device_size; - - mem.restore_device(); - } - } - } - - void unmap_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override - { - RenderTile &target_tile = neighbors.target; - device_vector<float> &mem = target_tile.buffers->buffer; - - if (mem.device == this && matching_rendering_and_denoising_devices) { - return; - } - - /* Copy denoised result back to the host. */ - mem.swap_device(sub_device, target_tile.device_size, target_tile.buffer); - mem.copy_from_device(); - mem.restore_device(); - - /* Copy denoised result to the original device. 
*/ - mem.copy_to_device(); - - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &tile = neighbors.tiles[i]; - if (!tile.buffers) { - continue; - } - - device_vector<float> &mem = tile.buffers->buffer; - - if (mem.device != sub_device && mem.device != this) { - /* Free up memory again if it was allocated for the copy above. */ - mem.swap_device(sub_device, tile.device_size, tile.buffer); - sub_device->mem_free(mem); - mem.restore_device(); - } - } - } - - int get_split_task_count(DeviceTask &task) override - { - int total_tasks = 0; - list<DeviceTask> tasks; - task.split(tasks, devices.size()); - foreach (SubDevice &sub, devices) { - if (!tasks.empty()) { - DeviceTask subtask = tasks.front(); - tasks.pop_front(); - - total_tasks += sub.device->get_split_task_count(subtask); - } - } - return total_tasks; - } - - void task_add(DeviceTask &task) override - { - list<SubDevice> task_devices = devices; - if (!denoising_devices.empty()) { - if (task.type == DeviceTask::DENOISE_BUFFER) { - /* Denoising tasks should be redirected to the denoising devices entirely. */ - task_devices = denoising_devices; - } - else if (task.type == DeviceTask::RENDER && (task.tile_types & RenderTile::DENOISE)) { - const uint tile_types = task.tile_types; - /* For normal rendering tasks only redirect the denoising part to the denoising devices. - * Do not need to split the task here, since they all run through 'acquire_tile'. */ - task.tile_types = RenderTile::DENOISE; - foreach (SubDevice &sub, denoising_devices) { - sub.device->task_add(task); - } - /* Rendering itself should still be executed on the rendering devices. 
*/ - task.tile_types = tile_types ^ RenderTile::DENOISE; - } - } - - list<DeviceTask> tasks; - task.split(tasks, task_devices.size()); - - foreach (SubDevice &sub, task_devices) { - if (!tasks.empty()) { - DeviceTask subtask = tasks.front(); - tasks.pop_front(); - - if (task.buffer) - subtask.buffer = find_matching_mem(task.buffer, sub); - if (task.rgba_byte) - subtask.rgba_byte = sub.ptr_map[task.rgba_byte]; - if (task.rgba_half) - subtask.rgba_half = sub.ptr_map[task.rgba_half]; - if (task.shader_input) - subtask.shader_input = find_matching_mem(task.shader_input, sub); - if (task.shader_output) - subtask.shader_output = find_matching_mem(task.shader_output, sub); - - sub.device->task_add(subtask); - - if (task.buffers && task.buffers->buffer.device == this) { - /* Synchronize access to RenderBuffers, since 'map_neighbor_tiles' is not thread-safe. */ - sub.device->task_wait(); - } - } - } - } - - void task_wait() override - { - foreach (SubDevice &sub, devices) - sub.device->task_wait(); - foreach (SubDevice &sub, denoising_devices) - sub.device->task_wait(); - } - - void task_cancel() override - { - foreach (SubDevice &sub, devices) - sub.device->task_cancel(); - foreach (SubDevice &sub, denoising_devices) - sub.device->task_cancel(); - } -}; - -Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return new MultiDevice(info, stats, profiler, background); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp deleted file mode 100644 index 8904b517e92..00000000000 --- a/intern/cycles/device/device_network.cpp +++ /dev/null @@ -1,812 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "device/device_network.h" -#include "device/device.h" -#include "device/device_intern.h" - -#include "util/util_foreach.h" -#include "util/util_logging.h" - -#if defined(WITH_NETWORK) - -CCL_NAMESPACE_BEGIN - -typedef map<device_ptr, device_ptr> PtrMap; -typedef vector<uint8_t> DataVector; -typedef map<device_ptr, DataVector> DataMap; - -/* tile list */ -typedef vector<RenderTile> TileList; - -/* search a list of tiles and find the one that matches the passed render tile */ -static TileList::iterator tile_list_find(TileList &tile_list, RenderTile &tile) -{ - for (TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it) - if (tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample) - return it; - return tile_list.end(); -} - -class NetworkDevice : public Device { - public: - boost::asio::io_service io_service; - tcp::socket socket; - device_ptr mem_counter; - DeviceTask the_task; /* todo: handle multiple tasks */ - - thread_mutex rpc_lock; - - virtual bool show_samples() const - { - return false; - } - - NetworkDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, const char *address) - : Device(info, stats, profiler, true), socket(io_service) - { - error_func = NetworkError(); - stringstream portstr; - portstr << SERVER_PORT; - - tcp::resolver resolver(io_service); - tcp::resolver::query query(address, portstr.str()); - tcp::resolver::iterator endpoint_iterator = resolver.resolve(query); - tcp::resolver::iterator end; - - boost::system::error_code error = boost::asio::error::host_not_found; - 
while (error && endpoint_iterator != end) { - socket.close(); - socket.connect(*endpoint_iterator++, error); - } - - if (error) - error_func.network_error(error.message()); - - mem_counter = 0; - } - - ~NetworkDevice() - { - RPCSend snd(socket, &error_func, "stop"); - snd.write(); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const - { - return BVH_LAYOUT_BVH2; - } - - void mem_alloc(device_memory &mem) - { - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - thread_scoped_lock lock(rpc_lock); - - mem.device_pointer = ++mem_counter; - - RPCSend snd(socket, &error_func, "mem_alloc"); - snd.add(mem); - snd.write(); - } - - void mem_copy_to(device_memory &mem) - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "mem_copy_to"); - - snd.add(mem); - snd.write(); - snd.write_buffer(mem.host_pointer, mem.memory_size()); - } - - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) - { - thread_scoped_lock lock(rpc_lock); - - size_t data_size = mem.memory_size(); - - RPCSend snd(socket, &error_func, "mem_copy_from"); - - snd.add(mem); - snd.add(y); - snd.add(w); - snd.add(h); - snd.add(elem); - snd.write(); - - RPCReceive rcv(socket, &error_func); - rcv.read_buffer(mem.host_pointer, data_size); - } - - void mem_zero(device_memory &mem) - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "mem_zero"); - - snd.add(mem); - snd.write(); - } - - void mem_free(device_memory &mem) - { - if (mem.device_pointer) { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "mem_free"); - - snd.add(mem); - snd.write(); - - mem.device_pointer = 0; - } - } - - void const_copy_to(const char *name, void *host, size_t size) - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "const_copy_to"); - - string name_string(name); - - 
snd.add(name_string); - snd.add(size); - snd.write(); - snd.write_buffer(host, size); - } - - bool load_kernels(const DeviceRequestedFeatures &requested_features) - { - if (error_func.have_error()) - return false; - - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "load_kernels"); - snd.add(requested_features.experimental); - snd.add(requested_features.max_closure); - snd.add(requested_features.max_nodes_group); - snd.add(requested_features.nodes_features); - snd.write(); - - bool result; - RPCReceive rcv(socket, &error_func); - rcv.read(result); - - return result; - } - - void task_add(DeviceTask &task) - { - thread_scoped_lock lock(rpc_lock); - - the_task = task; - - RPCSend snd(socket, &error_func, "task_add"); - snd.add(task); - snd.write(); - } - - void task_wait() - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "task_wait"); - snd.write(); - - lock.unlock(); - - TileList the_tiles; - - /* todo: run this threaded for connecting to multiple clients */ - for (;;) { - if (error_func.have_error()) - break; - - RenderTile tile; - - lock.lock(); - RPCReceive rcv(socket, &error_func); - - if (rcv.name == "acquire_tile") { - lock.unlock(); - - /* todo: watch out for recursive calls! 
*/ - if (the_task.acquire_tile(this, tile)) { /* write return as bool */ - the_tiles.push_back(tile); - - lock.lock(); - RPCSend snd(socket, &error_func, "acquire_tile"); - snd.add(tile); - snd.write(); - lock.unlock(); - } - else { - lock.lock(); - RPCSend snd(socket, &error_func, "acquire_tile_none"); - snd.write(); - lock.unlock(); - } - } - else if (rcv.name == "release_tile") { - rcv.read(tile); - lock.unlock(); - - TileList::iterator it = tile_list_find(the_tiles, tile); - if (it != the_tiles.end()) { - tile.buffers = it->buffers; - the_tiles.erase(it); - } - - assert(tile.buffers != NULL); - - the_task.release_tile(tile); - - lock.lock(); - RPCSend snd(socket, &error_func, "release_tile"); - snd.write(); - lock.unlock(); - } - else if (rcv.name == "task_wait_done") { - lock.unlock(); - break; - } - else - lock.unlock(); - } - } - - void task_cancel() - { - thread_scoped_lock lock(rpc_lock); - RPCSend snd(socket, &error_func, "task_cancel"); - snd.write(); - } - - int get_split_task_count(DeviceTask &) - { - return 1; - } - - private: - NetworkError error_func; -}; - -Device *device_network_create(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - const char *address) -{ - return new NetworkDevice(info, stats, profiler, address); -} - -void device_network_info(vector<DeviceInfo> &devices) -{ - DeviceInfo info; - - info.type = DEVICE_NETWORK; - info.description = "Network Device"; - info.id = "NETWORK"; - info.num = 0; - - /* todo: get this info from device */ - info.has_volume_decoupled = false; - info.has_adaptive_stop_per_sample = false; - info.has_osl = false; - info.denoisers = DENOISER_NONE; - - devices.push_back(info); -} - -class DeviceServer { - public: - thread_mutex rpc_lock; - - void network_error(const string &message) - { - error_func.network_error(message); - } - - bool have_error() - { - return error_func.have_error(); - } - - DeviceServer(Device *device_, tcp::socket &socket_) - : device(device_), socket(socket_), stop(false), 
blocked_waiting(false) - { - error_func = NetworkError(); - } - - void listen() - { - /* receive remote function calls */ - for (;;) { - listen_step(); - - if (stop) - break; - } - } - - protected: - void listen_step() - { - thread_scoped_lock lock(rpc_lock); - RPCReceive rcv(socket, &error_func); - - if (rcv.name == "stop") - stop = true; - else - process(rcv, lock); - } - - /* create a memory buffer for a device buffer and insert it into mem_data */ - DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size) - { - /* create a new DataVector and insert it into mem_data */ - pair<DataMap::iterator, bool> data_ins = mem_data.insert( - DataMap::value_type(client_pointer, DataVector())); - - /* make sure it was a unique insertion */ - assert(data_ins.second); - - /* get a reference to the inserted vector */ - DataVector &data_v = data_ins.first->second; - - /* size the vector */ - data_v.resize(data_size); - - return data_v; - } - - DataVector &data_vector_find(device_ptr client_pointer) - { - DataMap::iterator i = mem_data.find(client_pointer); - assert(i != mem_data.end()); - return i->second; - } - - /* setup mapping and reverse mapping of client_pointer<->real_pointer */ - void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer) - { - pair<PtrMap::iterator, bool> mapins; - - /* insert mapping from client pointer to our real device pointer */ - mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer)); - assert(mapins.second); - - /* insert reverse mapping from real our device pointer to client pointer */ - mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer)); - assert(mapins.second); - } - - device_ptr device_ptr_from_client_pointer(device_ptr client_pointer) - { - PtrMap::iterator i = ptr_map.find(client_pointer); - assert(i != ptr_map.end()); - return i->second; - } - - device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer) - { - PtrMap::iterator i = 
ptr_map.find(client_pointer); - assert(i != ptr_map.end()); - - device_ptr result = i->second; - - /* erase the mapping */ - ptr_map.erase(i); - - /* erase the reverse mapping */ - PtrMap::iterator irev = ptr_imap.find(result); - assert(irev != ptr_imap.end()); - ptr_imap.erase(irev); - - /* erase the data vector */ - DataMap::iterator idata = mem_data.find(client_pointer); - assert(idata != mem_data.end()); - mem_data.erase(idata); - - return result; - } - - /* note that the lock must be already acquired upon entry. - * This is necessary because the caller often peeks at - * the header and delegates control to here when it doesn't - * specifically handle the current RPC. - * The lock must be unlocked before returning */ - void process(RPCReceive &rcv, thread_scoped_lock &lock) - { - if (rcv.name == "mem_alloc") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - /* Allocate host side data buffer. */ - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0; - - /* Perform the allocation on the actual device. */ - device->mem_alloc(mem); - - /* Store a mapping to/from client_pointer and real device pointer. */ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - else if (rcv.name == "mem_copy_to") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - if (client_pointer) { - /* Lookup existing host side data buffer. */ - DataVector &data_v = data_vector_find(client_pointer); - mem.host_pointer = (void *)&data_v[0]; - - /* Translate the client pointer to a real device pointer. */ - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - } - else { - /* Allocate host side data buffer. 
*/ - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0; - } - - /* Copy data from network into memory buffer. */ - rcv.read_buffer((uint8_t *)mem.host_pointer, data_size); - - /* Copy the data from the memory buffer to the device buffer. */ - device->mem_copy_to(mem); - - if (!client_pointer) { - /* Store a mapping to/from client_pointer and real device pointer. */ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - } - else if (rcv.name == "mem_copy_from") { - string name; - network_device_memory mem(device); - int y, w, h, elem; - - rcv.read(mem, name); - rcv.read(y); - rcv.read(w); - rcv.read(h); - rcv.read(elem); - - device_ptr client_pointer = mem.device_pointer; - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - - DataVector &data_v = data_vector_find(client_pointer); - - mem.host_pointer = (device_ptr) & (data_v[0]); - - device->mem_copy_from(mem, y, w, h, elem); - - size_t data_size = mem.memory_size(); - - RPCSend snd(socket, &error_func, "mem_copy_from"); - snd.write(); - snd.write_buffer((uint8_t *)mem.host_pointer, data_size); - lock.unlock(); - } - else if (rcv.name == "mem_zero") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - if (client_pointer) { - /* Lookup existing host side data buffer. */ - DataVector &data_v = data_vector_find(client_pointer); - mem.host_pointer = (void *)&data_v[0]; - - /* Translate the client pointer to a real device pointer. */ - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - } - else { - /* Allocate host side data buffer. */ - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (void *) ? (device_ptr) & (data_v[0]) : 0; - } - - /* Zero memory. 
*/ - device->mem_zero(mem); - - if (!client_pointer) { - /* Store a mapping to/from client_pointer and real device pointer. */ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - } - else if (rcv.name == "mem_free") { - string name; - network_device_memory mem(device); - - rcv.read(mem, name); - lock.unlock(); - - device_ptr client_pointer = mem.device_pointer; - - mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer); - - device->mem_free(mem); - } - else if (rcv.name == "const_copy_to") { - string name_string; - size_t size; - - rcv.read(name_string); - rcv.read(size); - - vector<char> host_vector(size); - rcv.read_buffer(&host_vector[0], size); - lock.unlock(); - - device->const_copy_to(name_string.c_str(), &host_vector[0], size); - } - else if (rcv.name == "load_kernels") { - DeviceRequestedFeatures requested_features; - rcv.read(requested_features.experimental); - rcv.read(requested_features.max_closure); - rcv.read(requested_features.max_nodes_group); - rcv.read(requested_features.nodes_features); - - bool result; - result = device->load_kernels(requested_features); - RPCSend snd(socket, &error_func, "load_kernels"); - snd.add(result); - snd.write(); - lock.unlock(); - } - else if (rcv.name == "task_add") { - DeviceTask task; - - rcv.read(task); - lock.unlock(); - - if (task.buffer) - task.buffer = device_ptr_from_client_pointer(task.buffer); - - if (task.rgba_half) - task.rgba_half = device_ptr_from_client_pointer(task.rgba_half); - - if (task.rgba_byte) - task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte); - - if (task.shader_input) - task.shader_input = device_ptr_from_client_pointer(task.shader_input); - - if (task.shader_output) - task.shader_output = device_ptr_from_client_pointer(task.shader_output); - - task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2); - task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1); - task.update_progress_sample = 
function_bind(&DeviceServer::task_update_progress_sample, - this); - task.update_tile_sample = function_bind(&DeviceServer::task_update_tile_sample, this, _1); - task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this); - - device->task_add(task); - } - else if (rcv.name == "task_wait") { - lock.unlock(); - - blocked_waiting = true; - device->task_wait(); - blocked_waiting = false; - - lock.lock(); - RPCSend snd(socket, &error_func, "task_wait_done"); - snd.write(); - lock.unlock(); - } - else if (rcv.name == "task_cancel") { - lock.unlock(); - device->task_cancel(); - } - else if (rcv.name == "acquire_tile") { - AcquireEntry entry; - entry.name = rcv.name; - rcv.read(entry.tile); - acquire_queue.push_back(entry); - lock.unlock(); - } - else if (rcv.name == "acquire_tile_none") { - AcquireEntry entry; - entry.name = rcv.name; - acquire_queue.push_back(entry); - lock.unlock(); - } - else if (rcv.name == "release_tile") { - AcquireEntry entry; - entry.name = rcv.name; - acquire_queue.push_back(entry); - lock.unlock(); - } - else { - cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n"; - lock.unlock(); - } - } - - bool task_acquire_tile(Device *, RenderTile &tile) - { - thread_scoped_lock acquire_lock(acquire_mutex); - - bool result = false; - - RPCSend snd(socket, &error_func, "acquire_tile"); - snd.write(); - - do { - if (blocked_waiting) - listen_step(); - - /* todo: avoid busy wait loop */ - thread_scoped_lock lock(rpc_lock); - - if (!acquire_queue.empty()) { - AcquireEntry entry = acquire_queue.front(); - acquire_queue.pop_front(); - - if (entry.name == "acquire_tile") { - tile = entry.tile; - - if (tile.buffer) - tile.buffer = ptr_map[tile.buffer]; - - result = true; - break; - } - else if (entry.name == "acquire_tile_none") { - break; - } - else { - cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n"; - } - } - } while (acquire_queue.empty() && !stop && !have_error()); - - return result; - } - - void 
task_update_progress_sample() - { - ; /* skip */ - } - - void task_update_tile_sample(RenderTile &) - { - ; /* skip */ - } - - void task_release_tile(RenderTile &tile) - { - thread_scoped_lock acquire_lock(acquire_mutex); - - if (tile.buffer) - tile.buffer = ptr_imap[tile.buffer]; - - { - thread_scoped_lock lock(rpc_lock); - RPCSend snd(socket, &error_func, "release_tile"); - snd.add(tile); - snd.write(); - lock.unlock(); - } - - do { - if (blocked_waiting) - listen_step(); - - /* todo: avoid busy wait loop */ - thread_scoped_lock lock(rpc_lock); - - if (!acquire_queue.empty()) { - AcquireEntry entry = acquire_queue.front(); - acquire_queue.pop_front(); - - if (entry.name == "release_tile") { - lock.unlock(); - break; - } - else { - cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n"; - } - } - } while (acquire_queue.empty() && !stop); - } - - bool task_get_cancel() - { - return false; - } - - /* properties */ - Device *device; - tcp::socket &socket; - - /* mapping of remote to local pointer */ - PtrMap ptr_map; - PtrMap ptr_imap; - DataMap mem_data; - - struct AcquireEntry { - string name; - RenderTile tile; - }; - - thread_mutex acquire_mutex; - list<AcquireEntry> acquire_queue; - - bool stop; - bool blocked_waiting; - - private: - NetworkError error_func; - - /* todo: free memory and device (osl) on network error */ -}; - -void Device::server_run() -{ - try { - /* starts thread that responds to discovery requests */ - ServerDiscovery discovery; - - for (;;) { - /* accept connection */ - boost::asio::io_service io_service; - tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT)); - - tcp::socket socket(io_service); - acceptor.accept(socket); - - string remote_address = socket.remote_endpoint().address().to_string(); - printf("Connected to remote client at: %s\n", remote_address.c_str()); - - DeviceServer server(this, socket); - server.listen(); - - printf("Disconnected.\n"); - } - } - catch (exception &e) { - 
fprintf(stderr, "Network server exception: %s\n", e.what()); - } -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h deleted file mode 100644 index b3a0f6daa57..00000000000 --- a/intern/cycles/device/device_network.h +++ /dev/null @@ -1,490 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_NETWORK_H__ -#define __DEVICE_NETWORK_H__ - -#ifdef WITH_NETWORK - -# include <boost/archive/binary_iarchive.hpp> -# include <boost/archive/binary_oarchive.hpp> -# include <boost/archive/text_iarchive.hpp> -# include <boost/archive/text_oarchive.hpp> -# include <boost/array.hpp> -# include <boost/asio.hpp> -# include <boost/bind.hpp> -# include <boost/serialization/vector.hpp> -# include <boost/thread.hpp> - -# include <deque> -# include <iostream> -# include <sstream> - -# include "render/buffers.h" - -# include "util/util_foreach.h" -# include "util/util_list.h" -# include "util/util_map.h" -# include "util/util_param.h" -# include "util/util_string.h" - -CCL_NAMESPACE_BEGIN - -using std::cerr; -using std::cout; -using std::exception; -using std::hex; -using std::setw; - -using boost::asio::ip::tcp; - -static const int SERVER_PORT = 5120; -static const int DISCOVER_PORT = 5121; -static const string DISCOVER_REQUEST_MSG = "REQUEST_RENDER_SERVER_IP"; -static const string DISCOVER_REPLY_MSG = "REPLY_RENDER_SERVER_IP"; - -# 
if 0 -typedef boost::archive::text_oarchive o_archive; -typedef boost::archive::text_iarchive i_archive; -# else -typedef boost::archive::binary_oarchive o_archive; -typedef boost::archive::binary_iarchive i_archive; -# endif - -/* Serialization of device memory */ - -class network_device_memory : public device_memory { - public: - network_device_memory(Device *device) : device_memory(device, "", MEM_READ_ONLY) - { - } - - ~network_device_memory() - { - device_pointer = 0; - }; - - vector<char> local_data; -}; - -/* Common network error function / object for both DeviceNetwork and DeviceServer. */ -class NetworkError { - public: - NetworkError() - { - error = ""; - error_count = 0; - } - - ~NetworkError() - { - } - - void network_error(const string &message) - { - error = message; - error_count += 1; - } - - bool have_error() - { - return true ? error_count > 0 : false; - } - - private: - string error; - int error_count; -}; - -/* Remote procedure call Send */ - -class RPCSend { - public: - RPCSend(tcp::socket &socket_, NetworkError *e, const string &name_ = "") - : name(name_), socket(socket_), archive(archive_stream), sent(false) - { - archive &name_; - error_func = e; - fprintf(stderr, "rpc send %s\n", name.c_str()); - } - - ~RPCSend() - { - } - - void add(const device_memory &mem) - { - archive &mem.data_type &mem.data_elements &mem.data_size; - archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer; - archive &mem.type &string(mem.name); - archive &mem.interpolation &mem.extension; - archive &mem.device_pointer; - } - - template<typename T> void add(const T &data) - { - archive &data; - } - - void add(const DeviceTask &task) - { - int type = (int)task.type; - archive &type &task.x &task.y &task.w &task.h; - archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples; - archive &task.offset &task.stride; - archive &task.shader_input &task.shader_output &task.shader_eval_type; - archive &task.shader_x &task.shader_w; 
- archive &task.need_finish_queue; - } - - void add(const RenderTile &tile) - { - archive &tile.x &tile.y &tile.w &tile.h; - archive &tile.start_sample &tile.num_samples &tile.sample; - archive &tile.resolution &tile.offset &tile.stride; - archive &tile.buffer; - } - - void write() - { - boost::system::error_code error; - - /* get string from stream */ - string archive_str = archive_stream.str(); - - /* first send fixed size header with size of following data */ - ostringstream header_stream; - header_stream << setw(8) << hex << archive_str.size(); - string header_str = header_stream.str(); - - boost::asio::write( - socket, boost::asio::buffer(header_str), boost::asio::transfer_all(), error); - - if (error.value()) - error_func->network_error(error.message()); - - /* then send actual data */ - boost::asio::write( - socket, boost::asio::buffer(archive_str), boost::asio::transfer_all(), error); - - if (error.value()) - error_func->network_error(error.message()); - - sent = true; - } - - void write_buffer(void *buffer, size_t size) - { - boost::system::error_code error; - - boost::asio::write( - socket, boost::asio::buffer(buffer, size), boost::asio::transfer_all(), error); - - if (error.value()) - error_func->network_error(error.message()); - } - - protected: - string name; - tcp::socket &socket; - ostringstream archive_stream; - o_archive archive; - bool sent; - NetworkError *error_func; -}; - -/* Remote procedure call Receive */ - -class RPCReceive { - public: - RPCReceive(tcp::socket &socket_, NetworkError *e) - : socket(socket_), archive_stream(NULL), archive(NULL) - { - error_func = e; - /* read head with fixed size */ - vector<char> header(8); - boost::system::error_code error; - size_t len = boost::asio::read(socket, boost::asio::buffer(header), error); - - if (error.value()) { - error_func->network_error(error.message()); - } - - /* verify if we got something */ - if (len == header.size()) { - /* decode header */ - string header_str(&header[0], 
header.size()); - istringstream header_stream(header_str); - - size_t data_size; - - if ((header_stream >> hex >> data_size)) { - - vector<char> data(data_size); - size_t len = boost::asio::read(socket, boost::asio::buffer(data), error); - - if (error.value()) - error_func->network_error(error.message()); - - if (len == data_size) { - archive_str = (data.size()) ? string(&data[0], data.size()) : string(""); - - archive_stream = new istringstream(archive_str); - archive = new i_archive(*archive_stream); - - *archive &name; - fprintf(stderr, "rpc receive %s\n", name.c_str()); - } - else { - error_func->network_error("Network receive error: data size doesn't match header"); - } - } - else { - error_func->network_error("Network receive error: can't decode data size from header"); - } - } - else { - error_func->network_error("Network receive error: invalid header size"); - } - } - - ~RPCReceive() - { - delete archive; - delete archive_stream; - } - - void read(network_device_memory &mem, string &name) - { - *archive &mem.data_type &mem.data_elements &mem.data_size; - *archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer; - *archive &mem.type &name; - *archive &mem.interpolation &mem.extension; - *archive &mem.device_pointer; - - mem.name = name.c_str(); - mem.host_pointer = 0; - - /* Can't transfer OpenGL texture over network. 
*/ - if (mem.type == MEM_PIXELS) { - mem.type = MEM_READ_WRITE; - } - } - - template<typename T> void read(T &data) - { - *archive &data; - } - - void read_buffer(void *buffer, size_t size) - { - boost::system::error_code error; - size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error); - - if (error.value()) { - error_func->network_error(error.message()); - } - - if (len != size) - cout << "Network receive error: buffer size doesn't match expected size\n"; - } - - void read(DeviceTask &task) - { - int type; - - *archive &type &task.x &task.y &task.w &task.h; - *archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples; - *archive &task.offset &task.stride; - *archive &task.shader_input &task.shader_output &task.shader_eval_type; - *archive &task.shader_x &task.shader_w; - *archive &task.need_finish_queue; - - task.type = (DeviceTask::Type)type; - } - - void read(RenderTile &tile) - { - *archive &tile.x &tile.y &tile.w &tile.h; - *archive &tile.start_sample &tile.num_samples &tile.sample; - *archive &tile.resolution &tile.offset &tile.stride; - *archive &tile.buffer; - - tile.buffers = NULL; - } - - string name; - - protected: - tcp::socket &socket; - string archive_str; - istringstream *archive_stream; - i_archive *archive; - NetworkError *error_func; -}; - -/* Server auto discovery */ - -class ServerDiscovery { - public: - explicit ServerDiscovery(bool discover = false) - : listen_socket(io_service), collect_servers(false) - { - /* setup listen socket */ - listen_endpoint.address(boost::asio::ip::address_v4::any()); - listen_endpoint.port(DISCOVER_PORT); - - listen_socket.open(listen_endpoint.protocol()); - - boost::asio::socket_base::reuse_address option(true); - listen_socket.set_option(option); - - listen_socket.bind(listen_endpoint); - - /* setup receive callback */ - async_receive(); - - /* start server discovery */ - if (discover) { - collect_servers = true; - servers.clear(); - - 
broadcast_message(DISCOVER_REQUEST_MSG); - } - - /* start thread */ - work = new boost::asio::io_service::work(io_service); - thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service)); - } - - ~ServerDiscovery() - { - io_service.stop(); - thread->join(); - delete thread; - delete work; - } - - vector<string> get_server_list() - { - vector<string> result; - - mutex.lock(); - result = vector<string>(servers.begin(), servers.end()); - mutex.unlock(); - - return result; - } - - private: - void handle_receive_from(const boost::system::error_code &error, size_t size) - { - if (error) { - cout << "Server discovery receive error: " << error.message() << "\n"; - return; - } - - if (size > 0) { - string msg = string(receive_buffer, size); - - /* handle incoming message */ - if (collect_servers) { - if (msg == DISCOVER_REPLY_MSG) { - string address = receive_endpoint.address().to_string(); - - mutex.lock(); - - /* add address if it's not already in the list */ - bool found = std::find(servers.begin(), servers.end(), address) != servers.end(); - - if (!found) - servers.push_back(address); - - mutex.unlock(); - } - } - else { - /* reply to request */ - if (msg == DISCOVER_REQUEST_MSG) - broadcast_message(DISCOVER_REPLY_MSG); - } - } - - async_receive(); - } - - void async_receive() - { - listen_socket.async_receive_from(boost::asio::buffer(receive_buffer), - receive_endpoint, - boost::bind(&ServerDiscovery::handle_receive_from, - this, - boost::asio::placeholders::error, - boost::asio::placeholders::bytes_transferred)); - } - - void broadcast_message(const string &msg) - { - /* setup broadcast socket */ - boost::asio::ip::udp::socket socket(io_service); - - socket.open(boost::asio::ip::udp::v4()); - - boost::asio::socket_base::broadcast option(true); - socket.set_option(option); - - boost::asio::ip::udp::endpoint broadcast_endpoint( - boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT); - - /* broadcast message */ - 
socket.send_to(boost::asio::buffer(msg), broadcast_endpoint); - } - - /* network service and socket */ - boost::asio::io_service io_service; - boost::asio::ip::udp::endpoint listen_endpoint; - boost::asio::ip::udp::socket listen_socket; - - /* threading */ - boost::thread *thread; - boost::asio::io_service::work *work; - boost::mutex mutex; - - /* buffer and endpoint for receiving messages */ - char receive_buffer[256]; - boost::asio::ip::udp::endpoint receive_endpoint; - - // os, version, devices, status, host name, group name, ip as far as fields go - struct ServerInfo { - string cycles_version; - string os; - int device_count; - string status; - string host_name; - string group_name; - string host_addr; - }; - - /* collection of server addresses in list */ - bool collect_servers; - vector<string> servers; -}; - -CCL_NAMESPACE_END - -#endif - -#endif /* __DEVICE_NETWORK_H__ */ diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp deleted file mode 100644 index 9abb7cfb7fe..00000000000 --- a/intern/cycles/device/device_opencl.cpp +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/opencl/device_opencl.h" -# include "device/device.h" -# include "device/device_intern.h" - -# include "util/util_foreach.h" -# include "util/util_logging.h" -# include "util/util_set.h" -# include "util/util_string.h" - -CCL_NAMESPACE_BEGIN - -Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return opencl_create_split_device(info, stats, profiler, background); -} - -bool device_opencl_init() -{ - static bool initialized = false; - static bool result = false; - - if (initialized) - return result; - - initialized = true; - - if (OpenCLInfo::device_type() != 0) { - int clew_result = clewInit(); - if (clew_result == CLEW_SUCCESS) { - VLOG(1) << "CLEW initialization succeeded."; - result = true; - } - else { - VLOG(1) << "CLEW initialization failed: " - << ((clew_result == CLEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" : - "Error opening the library"); - } - } - else { - VLOG(1) << "Skip initializing CLEW, platform is force disabled."; - result = false; - } - - return result; -} - -static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms) -{ -# ifdef _WIN32 - __try { - return clGetPlatformIDs(0, NULL, num_platforms); - } - __except (EXCEPTION_EXECUTE_HANDLER) { - /* Ignore crashes inside the OpenCL driver and hope we can - * survive even with corrupted OpenCL installs. */ - fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n"); - } - - *num_platforms = 0; - return CL_DEVICE_NOT_FOUND; -# else - return clGetPlatformIDs(0, NULL, num_platforms); -# endif -} - -void device_opencl_info(vector<DeviceInfo> &devices) -{ - cl_uint num_platforms = 0; - device_opencl_get_num_platforms_safe(&num_platforms); - if (num_platforms == 0) { - return; - } - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - /* Devices are numbered consecutively across platforms. 
*/ - int num_devices = 0; - set<string> unique_ids; - foreach (OpenCLPlatformDevice &platform_device, usable_devices) { - /* Compute unique ID for persistent user preferences. */ - const string &platform_name = platform_device.platform_name; - const string &device_name = platform_device.device_name; - string hardware_id = platform_device.hardware_id; - if (hardware_id == "") { - hardware_id = string_printf("ID_%d", num_devices); - } - string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id; - - /* Hardware ID might not be unique, add device number in that case. */ - if (unique_ids.find(id) != unique_ids.end()) { - id += string_printf("_ID_%d", num_devices); - } - unique_ids.insert(id); - - /* Create DeviceInfo. */ - DeviceInfo info; - info.type = DEVICE_OPENCL; - info.description = string_remove_trademark(string(device_name)); - info.num = num_devices; - /* We don't know if it's used for display, but assume it is. */ - info.display_device = true; - info.use_split_kernel = true; - info.has_volume_decoupled = false; - info.has_adaptive_stop_per_sample = false; - info.denoisers = DENOISER_NLM; - info.id = id; - - /* Check OpenCL extensions */ - info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos; - - /* Disabled for now due to apparent AMD driver bug. */ - info.has_nanovdb = platform_name != "AMD Accelerated Parallel Processing"; - - devices.push_back(info); - num_devices++; - } -} - -string device_opencl_capabilities() -{ - if (OpenCLInfo::device_type() == 0) { - return "All OpenCL devices are forced to be OFF"; - } - string result = ""; - string error_msg = ""; /* Only used by opencl_assert(), but in the future - * it could also be nicely reported to the console. 
- */ - cl_uint num_platforms = 0; - opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms)); - if (num_platforms == 0) { - return "No OpenCL platforms found\n"; - } - result += string_printf("Number of platforms: %u\n", num_platforms); - - vector<cl_platform_id> platform_ids; - platform_ids.resize(num_platforms); - opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL)); - -# define APPEND_INFO(func, id, name, what, type) \ - do { \ - type data; \ - memset(&data, 0, sizeof(data)); \ - opencl_assert(func(id, what, sizeof(data), &data, NULL)); \ - result += string_printf("%s: %s\n", name, to_string(data).c_str()); \ - } while (false) -# define APPEND_STRING_INFO_IMPL(func, id, name, what, is_optional) \ - do { \ - string value; \ - size_t length = 0; \ - if (func(id, what, 0, NULL, &length) == CL_SUCCESS) { \ - vector<char> buffer(length + 1); \ - if (func(id, what, buffer.size(), buffer.data(), NULL) == CL_SUCCESS) { \ - value = string(buffer.data()); \ - } \ - } \ - if (is_optional && !(length != 0 && value[0] != '\0')) { \ - break; \ - } \ - result += string_printf("%s: %s\n", name, value.c_str()); \ - } while (false) -# define APPEND_PLATFORM_STRING_INFO(id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, false) -# define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, true) -# define APPEND_PLATFORM_INFO(id, name, what, type) \ - APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type) -# define APPEND_DEVICE_INFO(id, name, what, type) \ - APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type) -# define APPEND_DEVICE_STRING_INFO(id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, false) -# define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, true) - - vector<cl_device_id> 
device_ids; - for (cl_uint platform = 0; platform < num_platforms; ++platform) { - cl_platform_id platform_id = platform_ids[platform]; - - result += string_printf("Platform #%u\n", platform); - - APPEND_PLATFORM_STRING_INFO(platform_id, "Name", CL_PLATFORM_NAME); - APPEND_PLATFORM_STRING_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR); - APPEND_PLATFORM_STRING_INFO(platform_id, "Version", CL_PLATFORM_VERSION); - APPEND_PLATFORM_STRING_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE); - APPEND_PLATFORM_STRING_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS); - - cl_uint num_devices = 0; - opencl_assert( - clGetDeviceIDs(platform_ids[platform], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices)); - result += string_printf("\tNumber of devices: %u\n", num_devices); - - device_ids.resize(num_devices); - opencl_assert(clGetDeviceIDs( - platform_ids[platform], CL_DEVICE_TYPE_ALL, num_devices, &device_ids[0], NULL)); - for (cl_uint device = 0; device < num_devices; ++device) { - cl_device_id device_id = device_ids[device]; - - result += string_printf("\t\tDevice: #%u\n", device); - - APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME); - APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD); - APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR); - APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION); - APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE); - APPEND_DEVICE_STRING_INFO(device_id, "Version", CL_DEVICE_VERSION); - APPEND_DEVICE_STRING_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS); - APPEND_DEVICE_INFO( - device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint); - APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint); - APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t); - } - } - -# undef APPEND_INFO -# undef APPEND_STRING_INFO_IMPL -# undef 
APPEND_PLATFORM_STRING_INFO -# undef APPEND_STRING_EXTENSION_INFO -# undef APPEND_PLATFORM_INFO -# undef APPEND_DEVICE_INFO -# undef APPEND_DEVICE_STRING_INFO -# undef APPEND_DEVICE_STRING_EXTENSION_INFO - - return result; -} - -CCL_NAMESPACE_END - -#endif /* WITH_OPENCL */ diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp deleted file mode 100644 index 6f9a7943722..00000000000 --- a/intern/cycles/device/device_optix.cpp +++ /dev/null @@ -1,1936 +0,0 @@ -/* - * Copyright 2019, NVIDIA Corporation. - * Copyright 2019, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPTIX - -# include "bvh/bvh.h" -# include "bvh/bvh_optix.h" -# include "device/cuda/device_cuda.h" -# include "device/device_denoising.h" -# include "device/device_intern.h" -# include "render/buffers.h" -# include "render/hair.h" -# include "render/mesh.h" -# include "render/object.h" -# include "render/scene.h" -# include "util/util_debug.h" -# include "util/util_logging.h" -# include "util/util_md5.h" -# include "util/util_path.h" -# include "util/util_progress.h" -# include "util/util_time.h" - -# ifdef WITH_CUDA_DYNLOAD -# include <cuew.h> -// Do not use CUDA SDK headers when using CUEW -# define OPTIX_DONT_INCLUDE_CUDA -# endif -# include <optix_function_table_definition.h> -# include <optix_stubs.h> - -// TODO(pmours): Disable this once drivers have native support -# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1 - -CCL_NAMESPACE_BEGIN - -/* Make sure this stays in sync with kernel_globals.h */ -struct ShaderParams { - uint4 *input; - float4 *output; - int type; - int filter; - int sx; - int offset; - int sample; -}; -struct KernelParams { - WorkTile tile; - KernelData data; - ShaderParams shader; -# define KERNEL_TEX(type, name) const type *name; -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX -}; - -# define check_result_cuda(stmt) \ - { \ - CUresult res = stmt; \ - if (res != CUDA_SUCCESS) { \ - const char *name; \ - cuGetErrorName(res, &name); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return; \ - } \ - } \ - (void)0 -# define check_result_cuda_ret(stmt) \ - { \ - CUresult res = stmt; \ - if (res != CUDA_SUCCESS) { \ - const char *name; \ - cuGetErrorName(res, &name); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return false; \ - } \ - } \ - (void)0 - -# define check_result_optix(stmt) \ - { \ - enum OptixResult res = stmt; \ - if (res != OPTIX_SUCCESS) { \ - const char *name = optixGetErrorName(res); \ - set_error(string_printf("%s in %s 
(device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return; \ - } \ - } \ - (void)0 -# define check_result_optix_ret(stmt) \ - { \ - enum OptixResult res = stmt; \ - if (res != OPTIX_SUCCESS) { \ - const char *name = optixGetErrorName(res); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return false; \ - } \ - } \ - (void)0 - -# define launch_filter_kernel(func_name, w, h, args) \ - { \ - CUfunction func; \ - check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \ - check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \ - int threads; \ - check_result_cuda_ret( \ - cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - threads = (int)sqrt((float)threads); \ - int xblocks = ((w) + threads - 1) / threads; \ - int yblocks = ((h) + threads - 1) / threads; \ - check_result_cuda_ret( \ - cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \ - } \ - (void)0 - -class OptiXDevice : public CUDADevice { - - // List of OptiX program groups - enum { - PG_RGEN, - PG_MISS, - PG_HITD, // Default hit group - PG_HITS, // __SHADOW_RECORD_ALL__ hit group - PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles) -# if OPTIX_ABI_VERSION >= 36 - PG_HITD_MOTION, - PG_HITS_MOTION, -# endif - PG_BAKE, // kernel_bake_evaluate - PG_DISP, // kernel_displace_evaluate - PG_BACK, // kernel_background_evaluate - PG_CALL, - NUM_PROGRAM_GROUPS = PG_CALL + 3 - }; - - // List of OptiX pipelines - enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES }; - - // A single shader binding table entry - struct SbtRecord { - char header[OPTIX_SBT_RECORD_HEADER_SIZE]; - }; - - // Information stored about CUDA memory allocations - struct CUDAMem { - bool free_map_host = false; - CUarray array = NULL; - CUtexObject texobject = 0; - bool use_mapped_host = false; - }; - - // Helper class to manage current CUDA context - struct CUDAContextScope { - 
CUDAContextScope(CUcontext ctx) - { - cuCtxPushCurrent(ctx); - } - ~CUDAContextScope() - { - cuCtxPopCurrent(NULL); - } - }; - - // Use a pool with multiple threads to support launches with multiple CUDA streams - TaskPool task_pool; - - vector<CUstream> cuda_stream; - OptixDeviceContext context = NULL; - - OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module - OptixModule builtin_modules[2] = {}; - OptixPipeline pipelines[NUM_PIPELINES] = {}; - - bool motion_blur = false; - device_vector<SbtRecord> sbt_data; - device_only_memory<KernelParams> launch_params; - OptixTraversableHandle tlas_handle = 0; - - OptixDenoiser denoiser = NULL; - device_only_memory<unsigned char> denoiser_state; - int denoiser_input_passes = 0; - - vector<device_only_memory<char>> delayed_free_bvh_memory; - thread_mutex delayed_free_bvh_mutex; - - public: - OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) - : CUDADevice(info_, stats_, profiler_, background_), - sbt_data(this, "__sbt", MEM_READ_ONLY), - launch_params(this, "__params", false), - denoiser_state(this, "__denoiser_state", true) - { - // Store number of CUDA streams in device info - info.cpu_threads = DebugFlags().optix.cuda_streams; - - // Make the CUDA context current - if (!cuContext) { - return; // Do not initialize if CUDA context creation failed already - } - const CUDAContextScope scope(cuContext); - - // Create OptiX context for this device - OptixDeviceContextOptions options = {}; -# ifdef WITH_CYCLES_LOGGING - options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4 - options.logCallbackFunction = - [](unsigned int level, const char *, const char *message, void *) { - switch (level) { - case 1: - LOG_IF(FATAL, VLOG_IS_ON(1)) << message; - break; - case 2: - LOG_IF(ERROR, VLOG_IS_ON(1)) << message; - break; - case 3: - LOG_IF(WARNING, VLOG_IS_ON(1)) << message; - break; - case 4: - LOG_IF(INFO, VLOG_IS_ON(1)) << message; - break; - } - }; 
-# endif - check_result_optix(optixDeviceContextCreate(cuContext, &options, &context)); -# ifdef WITH_CYCLES_LOGGING - check_result_optix(optixDeviceContextSetLogCallback( - context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel)); -# endif - - // Create launch streams - cuda_stream.resize(info.cpu_threads); - for (int i = 0; i < info.cpu_threads; ++i) - check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING)); - - // Fix weird compiler bug that assigns wrong size - launch_params.data_elements = sizeof(KernelParams); - // Allocate launch parameter buffer memory on device - launch_params.alloc_to_device(info.cpu_threads); - } - ~OptiXDevice() - { - // Stop processing any more tasks - task_pool.cancel(); - - // Make CUDA context current - const CUDAContextScope scope(cuContext); - - free_bvh_memory_delayed(); - - sbt_data.free(); - texture_info.free(); - launch_params.free(); - denoiser_state.free(); - - // Unload modules - if (optix_module != NULL) - optixModuleDestroy(optix_module); - for (unsigned int i = 0; i < 2; ++i) - if (builtin_modules[i] != NULL) - optixModuleDestroy(builtin_modules[i]); - for (unsigned int i = 0; i < NUM_PIPELINES; ++i) - if (pipelines[i] != NULL) - optixPipelineDestroy(pipelines[i]); - - // Destroy launch streams - for (CUstream stream : cuda_stream) - cuStreamDestroy(stream); - - if (denoiser != NULL) - optixDenoiserDestroy(denoiser); - - optixDeviceContextDestroy(context); - } - - private: - bool show_samples() const override - { - // Only show samples if not rendering multiple tiles in parallel - return info.cpu_threads == 1; - } - - BVHLayoutMask get_bvh_layout_mask() const override - { - // CUDA kernels are used when doing baking, so need to build a BVH those can understand too! 
- if (optix_module == NULL) - return CUDADevice::get_bvh_layout_mask(); - - // OptiX has its own internal acceleration structure format - return BVH_LAYOUT_OPTIX; - } - - string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features, - bool filter, - bool /*split*/) override - { - // Split kernel is not supported in OptiX - string common_cflags = CUDADevice::compile_kernel_get_common_cflags( - requested_features, filter, false); - - // Add OptiX SDK include directory to include paths - const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR"); - if (optix_sdk_path) { - common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path); - } - - // Specialization for shader raytracing - if (requested_features.use_shader_raytrace) { - common_cflags += " --keep-device-functions"; - } - else { - common_cflags += " -D __NO_SHADER_RAYTRACE__"; - } - - return common_cflags; - } - - bool load_kernels(const DeviceRequestedFeatures &requested_features) override - { - if (have_error()) { - // Abort early if context creation failed already - return false; - } - - // Load CUDA modules because we need some of the utility kernels - if (!CUDADevice::load_kernels(requested_features)) { - return false; - } - - // Baking is currently performed using CUDA, so no need to load OptiX kernels - if (requested_features.use_baking) { - return true; - } - - const CUDAContextScope scope(cuContext); - - // Unload existing OptiX module and pipelines first - if (optix_module != NULL) { - optixModuleDestroy(optix_module); - optix_module = NULL; - } - for (unsigned int i = 0; i < 2; ++i) { - if (builtin_modules[i] != NULL) { - optixModuleDestroy(builtin_modules[i]); - builtin_modules[i] = NULL; - } - } - for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { - if (pipelines[i] != NULL) { - optixPipelineDestroy(pipelines[i]); - pipelines[i] = NULL; - } - } - - OptixModuleCompileOptions module_options = {}; - module_options.maxRegisterCount = 0; // Do not set an explicit register 
limit - module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; - module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; - -# if OPTIX_ABI_VERSION >= 41 - module_options.boundValues = nullptr; - module_options.numBoundValues = 0; -# endif - - OptixPipelineCompileOptions pipeline_options = {}; - // Default to no motion blur and two-level graph, since it is the fastest option - pipeline_options.usesMotionBlur = false; - pipeline_options.traversableGraphFlags = - OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING; - pipeline_options.numPayloadValues = 6; - pipeline_options.numAttributeValues = 2; // u, v - pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE; - pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h - -# if OPTIX_ABI_VERSION >= 36 - pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; - if (requested_features.use_hair) { - if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) { - pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE; - } - else { - pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM; - } - } -# endif - - // Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds - // This is necessary since objects may be reported to have motion if the Vector pass is - // active, but may still need to be rendered without motion blur if that isn't active as well - motion_blur = requested_features.use_object_motion; - - if (motion_blur) { - pipeline_options.usesMotionBlur = true; - // Motion blur can insert motion transforms into the traversal graph - // It is no longer a two-level graph then, so need to set flags to allow any configuration - pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY; - } - - { // Load and compile PTX module with OptiX kernels - string ptx_data, ptx_filename = 
path_get(requested_features.use_shader_raytrace ? - "lib/kernel_optix_shader_raytrace.ptx" : - "lib/kernel_optix.ptx"); - if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { - if (!getenv("OPTIX_ROOT_DIR")) { - set_error( - "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to " - "the Optix SDK to be able to compile Optix kernels on demand)."); - return false; - } - ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true); - } - if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) { - set_error("Failed to load OptiX kernel from '" + ptx_filename + "'"); - return false; - } - - check_result_optix_ret(optixModuleCreateFromPTX(context, - &module_options, - &pipeline_options, - ptx_data.data(), - ptx_data.size(), - nullptr, - 0, - &optix_module)); - } - - // Create program groups - OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; - OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {}; - OptixProgramGroupOptions group_options = {}; // There are no options currently - group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_RGEN].raygen.module = optix_module; - // Ignore branched integrator for now (see "requested_features.use_integrator_branched") - group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace"; - group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS; - group_descs[PG_MISS].miss.module = optix_module; - group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss"; - group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - group_descs[PG_HITD].hitgroup.moduleCH = optix_module; - group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit"; - group_descs[PG_HITD].hitgroup.moduleAH = optix_module; - group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test"; - group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - 
group_descs[PG_HITS].hitgroup.moduleAH = optix_module; - group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit"; - - if (requested_features.use_hair) { - group_descs[PG_HITD].hitgroup.moduleIS = optix_module; - group_descs[PG_HITS].hitgroup.moduleIS = optix_module; - - // Add curve intersection programs - if (requested_features.use_hair_thick) { - // Slower programs for thick hair since that also slows down ribbons. - // Ideally this should not be needed. - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all"; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all"; - } - else { - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; - } - -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) { - OptixBuiltinISOptions builtin_options = {}; - builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; - builtin_options.usesMotionBlur = false; - - check_result_optix_ret(optixBuiltinISModuleGet( - context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0])); - - group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0]; - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr; - group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0]; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr; - - if (motion_blur) { - builtin_options.usesMotionBlur = true; - - check_result_optix_ret(optixBuiltinISModuleGet( - context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1])); - - group_descs[PG_HITD_MOTION] = group_descs[PG_HITD]; - group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1]; - group_descs[PG_HITS_MOTION] = group_descs[PG_HITS]; - group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1]; - } - } -# endif - } 
- - if (requested_features.use_subsurface || requested_features.use_shader_raytrace) { - // Add hit group for local intersections - group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - group_descs[PG_HITL].hitgroup.moduleAH = optix_module; - group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit"; - } - - if (requested_features.use_baking) { - group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_BAKE].raygen.module = optix_module; - group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake"; - } - - if (requested_features.use_true_displacement) { - group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_DISP].raygen.module = optix_module; - group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace"; - } - - if (requested_features.use_background_light) { - group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_BACK].raygen.module = optix_module; - group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background"; - } - - // Shader raytracing replaces some functions with direct callables - if (requested_features.use_shader_raytrace) { - group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 0].callables.moduleDC = optix_module; - group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes"; - group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 1].callables.moduleDC = optix_module; - group_descs[PG_CALL + 1].callables.entryFunctionNameDC = - "__direct_callable__kernel_volume_shadow"; - group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 2].callables.moduleDC = optix_module; - group_descs[PG_CALL + 2].callables.entryFunctionNameDC = - "__direct_callable__subsurface_scatter_multi_setup"; - } - - check_result_optix_ret(optixProgramGroupCreate( - context, 
group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); - - // Get program stack sizes - OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {}; - // Set up SBT, which in this case is used only to select between different programs - sbt_data.alloc(NUM_PROGRAM_GROUPS); - memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS); - for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { - check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); - check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i])); - } - sbt_data.copy_to_device(); // Upload SBT to device - - // Calculate maximum trace continuation stack size - unsigned int trace_css = stack_size[PG_HITD].cssCH; - // This is based on the maximum of closest-hit and any-hit/intersection programs - trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH); - trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH); - trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); -# if OPTIX_ABI_VERSION >= 36 - trace_css = std::max(trace_css, - stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH); - trace_css = std::max(trace_css, - stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH); -# endif - - OptixPipelineLinkOptions link_options = {}; - link_options.maxTraceDepth = 1; - link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; -# if OPTIX_ABI_VERSION < 24 - link_options.overrideUsesMotionBlur = motion_blur; -# endif - - { // Create path tracing pipeline - vector<OptixProgramGroup> pipeline_groups; - pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_RGEN]); - pipeline_groups.push_back(groups[PG_MISS]); - pipeline_groups.push_back(groups[PG_HITD]); - pipeline_groups.push_back(groups[PG_HITS]); - pipeline_groups.push_back(groups[PG_HITL]); -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur) { - 
pipeline_groups.push_back(groups[PG_HITD_MOTION]); - pipeline_groups.push_back(groups[PG_HITS_MOTION]); - } -# endif - if (requested_features.use_shader_raytrace) { - pipeline_groups.push_back(groups[PG_CALL + 0]); - pipeline_groups.push_back(groups[PG_CALL + 1]); - pipeline_groups.push_back(groups[PG_CALL + 2]); - } - - check_result_optix_ret(optixPipelineCreate(context, - &pipeline_options, - &link_options, - pipeline_groups.data(), - pipeline_groups.size(), - nullptr, - 0, - &pipelines[PIP_PATH_TRACE])); - - // Combine ray generation and trace continuation stack size - const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css; - // Max direct callable depth is one of the following, so combine accordingly - // - __raygen__ -> svm_eval_nodes - // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes - // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes - const unsigned int dss = stack_size[PG_CALL + 0].dssDC + - std::max(stack_size[PG_CALL + 1].dssDC, - stack_size[PG_CALL + 2].dssDC); - - // Set stack size depending on pipeline options - check_result_optix_ret( - optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], - 0, - requested_features.use_shader_raytrace ? dss : 0, - css, - motion_blur ? 
3 : 2)); - } - - // Only need to create shader evaluation pipeline if one of these features is used: - const bool use_shader_eval_pipeline = requested_features.use_baking || - requested_features.use_background_light || - requested_features.use_true_displacement; - - if (use_shader_eval_pipeline) { // Create shader evaluation pipeline - vector<OptixProgramGroup> pipeline_groups; - pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_BAKE]); - pipeline_groups.push_back(groups[PG_DISP]); - pipeline_groups.push_back(groups[PG_BACK]); - pipeline_groups.push_back(groups[PG_MISS]); - pipeline_groups.push_back(groups[PG_HITD]); - pipeline_groups.push_back(groups[PG_HITS]); - pipeline_groups.push_back(groups[PG_HITL]); -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur) { - pipeline_groups.push_back(groups[PG_HITD_MOTION]); - pipeline_groups.push_back(groups[PG_HITS_MOTION]); - } -# endif - if (requested_features.use_shader_raytrace) { - pipeline_groups.push_back(groups[PG_CALL + 0]); - pipeline_groups.push_back(groups[PG_CALL + 1]); - pipeline_groups.push_back(groups[PG_CALL + 2]); - } - - check_result_optix_ret(optixPipelineCreate(context, - &pipeline_options, - &link_options, - pipeline_groups.data(), - pipeline_groups.size(), - nullptr, - 0, - &pipelines[PIP_SHADER_EVAL])); - - // Calculate continuation stack size based on the maximum of all ray generation stack sizes - const unsigned int css = std::max(stack_size[PG_BAKE].cssRG, - std::max(stack_size[PG_DISP].cssRG, - stack_size[PG_BACK].cssRG)) + - link_options.maxTraceDepth * trace_css; - const unsigned int dss = stack_size[PG_CALL + 0].dssDC + - std::max(stack_size[PG_CALL + 1].dssDC, - stack_size[PG_CALL + 2].dssDC); - - check_result_optix_ret( - optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL], - 0, - requested_features.use_shader_raytrace ? dss : 0, - css, - motion_blur ? 
3 : 2)); - } - - // Clean up program group objects - for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { - optixProgramGroupDestroy(groups[i]); - } - - return true; - } - - void thread_run(DeviceTask &task, int thread_index) // Main task entry point - { - if (have_error()) - return; // Abort early if there was an error previously - - if (task.type == DeviceTask::RENDER) { - if (thread_index != 0) { - // Only execute denoising in a single thread (see also 'task_add') - task.tile_types &= ~RenderTile::DENOISE; - } - - RenderTile tile; - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) - launch_render(task, tile, thread_index); - else if (tile.task == RenderTile::BAKE) { - // Perform baking using CUDA, since it is not currently implemented in OptiX - device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - CUDADevice::render(task, tile, work_tiles); - } - else if (tile.task == RenderTile::DENOISE) - launch_denoise(task, tile); - task.release_tile(tile); - if (task.get_cancel() && !task.need_finish_queue) - break; // User requested cancellation - else if (have_error()) - break; // Abort rendering when encountering an error - } - } - else if (task.type == DeviceTask::SHADER) { - // CUDA kernels are used when doing baking - if (optix_module == NULL) - CUDADevice::shader(task); - else - launch_shader_eval(task, thread_index); - } - else if (task.type == DeviceTask::DENOISE_BUFFER) { - // Set up a single tile that covers the whole task and denoise it - RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - launch_denoise(task, tile); - } - } - - void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index) - { - assert(thread_index < launch_params.data_size); - 
- // Keep track of total render time of this tile - const scoped_timer timer(&rtile.buffers->render_time); - - WorkTile wtile; - wtile.x = rtile.x; - wtile.y = rtile.y; - wtile.w = rtile.w; - wtile.h = rtile.h; - wtile.offset = rtile.offset; - wtile.stride = rtile.stride; - wtile.buffer = (float *)rtile.buffer; - - const int end_sample = rtile.start_sample + rtile.num_samples; - // Keep this number reasonable to avoid running into TDRs - int step_samples = (info.display_device ? 8 : 32); - - // Offset into launch params buffer so that streams use separate data - device_ptr launch_params_ptr = launch_params.device_pointer + - thread_index * launch_params.data_elements; - - const CUDAContextScope scope(cuContext); - - for (int sample = rtile.start_sample; sample < end_sample;) { - // Copy work tile information to device - wtile.start_sample = sample; - wtile.num_samples = step_samples; - if (task.adaptive_sampling.use) { - wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples); - } - wtile.num_samples = min(wtile.num_samples, end_sample - sample); - device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); - check_result_cuda( - cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index])); - - OptixShaderBindingTable sbt_params = {}; - sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord); - sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord); - sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.missRecordCount = 1; - sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); - sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); -# if OPTIX_ABI_VERSION >= 36 - sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL -# else - sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL -# endif - sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * 
sizeof(SbtRecord); - sbt_params.callablesRecordCount = 3; - sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); - - // Launch the ray generation program - check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE], - cuda_stream[thread_index], - launch_params_ptr, - launch_params.data_elements, - &sbt_params, - // Launch with samples close to each other for better locality - wtile.w * wtile.num_samples, - wtile.h, - 1)); - - // Run the adaptive sampling kernels at selected samples aligned to step samples. - uint filter_sample = wtile.start_sample + wtile.num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]); - } - - // Wait for launch to finish - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - - // Update current sample, so it is displayed correctly - sample += wtile.num_samples; - rtile.sample = sample; - // Update task progress after the kernel completed rendering - task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples); - - if (task.get_cancel() && !task.need_finish_queue) - return; // Cancel rendering - } - - // Finalize adaptive sampling - if (task.adaptive_sampling.use) { - device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); - adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]); - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples); - } - } - - bool launch_denoise(DeviceTask &task, RenderTile &rtile) - { - // Update current sample (for display and NLM denoising task) - rtile.sample = rtile.start_sample + rtile.num_samples; - - // Make CUDA context current now, since it is used for both denoising tasks - const CUDAContextScope scope(cuContext); - - // Choose between OptiX and NLM denoising - if (task.denoising.type == DENOISER_OPTIX) { - // Map 
neighboring tiles onto this device, indices are as following: - // Where index 4 is the center tile and index 9 is the target for the result. - // 0 1 2 - // 3 4 5 - // 6 7 8 9 - RenderTileNeighbors neighbors(rtile); - task.map_neighbor_tiles(neighbors, this); - RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; - RenderTile &target_tile = neighbors.target; - rtile = center_tile; // Tile may have been modified by mapping code - - // Calculate size of the tile to denoise (including overlap) - int4 rect = center_tile.bounds(); - // Overlap between tiles has to be at least 64 pixels - // TODO(pmours): Query this value from OptiX - rect = rect_expand(rect, 64); - int4 clip_rect = neighbors.bounds(); - rect = rect_clip(rect, clip_rect); - int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); - int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y); - - // Calculate byte offsets and strides - int pixel_stride = task.pass_stride * (int)sizeof(float); - int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride; - const int pass_offset[3] = { - (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float), - (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float), - (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)}; - - // Start with the current tile pointer offset - int input_stride = pixel_stride; - device_ptr input_ptr = rtile.buffer + pixel_offset; - - // Copy tile data into a common buffer if necessary - device_only_memory<float> input(this, "denoiser input", true); - device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY); - - bool contiguous_memory = true; - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) { - contiguous_memory = false; - } - } - - if (contiguous_memory) { - // Tiles are in continous memory, so can just subtract overlap offset - 
input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride; - // Stride covers the whole width of the image and not just a single tile - input_stride *= rtile.stride; - } - else { - // Adjacent tiles are in separate memory regions, so need to copy them into a single one - input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride); - // Start with the new input buffer - input_ptr = input.device_pointer; - // Stride covers the width of the new input buffer, which includes tile width and overlap - input_stride *= rect_size.x; - - TileInfo *tile_info = tile_info_mem.alloc(1); - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - tile_info->offsets[i] = neighbors.tiles[i].offset; - tile_info->strides[i] = neighbors.tiles[i].stride; - tile_info->buffers[i] = neighbors.tiles[i].buffer; - } - tile_info->x[0] = neighbors.tiles[3].x; - tile_info->x[1] = neighbors.tiles[4].x; - tile_info->x[2] = neighbors.tiles[5].x; - tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w; - tile_info->y[0] = neighbors.tiles[1].y; - tile_info->y[1] = neighbors.tiles[4].y; - tile_info->y[2] = neighbors.tiles[7].y; - tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h; - tile_info_mem.copy_to_device(); - - void *args[] = { - &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride}; - launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args); - } - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - device_only_memory<float> input_rgb(this, "denoiser input rgb", true); - input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes); - - void *input_args[] = {&input_rgb.device_pointer, - &input_ptr, - &rect_size.x, - &rect_size.y, - &input_stride, - &task.pass_stride, - const_cast<int *>(pass_offset), - &task.denoising.input_passes, - &rtile.sample}; - launch_filter_kernel( - "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args); - - input_ptr = 
input_rgb.device_pointer; - pixel_stride = 3 * sizeof(float); - input_stride = rect_size.x * pixel_stride; -# endif - - const bool recreate_denoiser = (denoiser == NULL) || - (task.denoising.input_passes != denoiser_input_passes); - if (recreate_denoiser) { - // Destroy existing handle before creating new one - if (denoiser != NULL) { - optixDenoiserDestroy(denoiser); - } - - // Create OptiX denoiser handle on demand when it is first used - OptixDenoiserOptions denoiser_options = {}; - assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3); -# if OPTIX_ABI_VERSION >= 47 - denoiser_options.guideAlbedo = task.denoising.input_passes >= 2; - denoiser_options.guideNormal = task.denoising.input_passes >= 3; - check_result_optix_ret(optixDenoiserCreate( - context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser)); -# else - denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>( - OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1)); -# if OPTIX_ABI_VERSION < 28 - denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3; -# endif - check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser)); - check_result_optix_ret( - optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0)); -# endif - - // OptiX denoiser handle was created with the requested number of input passes - denoiser_input_passes = task.denoising.input_passes; - } - - OptixDenoiserSizes sizes = {}; - check_result_optix_ret( - optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes)); - -# if OPTIX_ABI_VERSION < 28 - const size_t scratch_size = sizes.recommendedScratchSizeInBytes; -# else - const size_t scratch_size = sizes.withOverlapScratchSizeInBytes; -# endif - const size_t scratch_offset = sizes.stateSizeInBytes; - - // Allocate denoiser state if tile size has changed since last setup - if (recreate_denoiser || (denoiser_state.data_width != rect_size.x || - denoiser_state.data_height != 
rect_size.y)) { - denoiser_state.alloc_to_device(scratch_offset + scratch_size); - - // Initialize denoiser state for the current tile size - check_result_optix_ret(optixDenoiserSetup(denoiser, - 0, - rect_size.x, - rect_size.y, - denoiser_state.device_pointer, - scratch_offset, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); - - denoiser_state.data_width = rect_size.x; - denoiser_state.data_height = rect_size.y; - } - - // Set up input and output layer information - OptixImage2D input_layers[3] = {}; - OptixImage2D output_layers[1] = {}; - - for (int i = 0; i < 3; ++i) { -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i); -# else - input_layers[i].data = input_ptr + pass_offset[i]; -# endif - input_layers[i].width = rect_size.x; - input_layers[i].height = rect_size.y; - input_layers[i].rowStrideInBytes = input_stride; - input_layers[i].pixelStrideInBytes = pixel_stride; - input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3; - } - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - output_layers[0].data = input_ptr; - output_layers[0].width = rect_size.x; - output_layers[0].height = rect_size.y; - output_layers[0].rowStrideInBytes = input_stride; - output_layers[0].pixelStrideInBytes = pixel_stride; - int2 output_offset = overlap_offset; - overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually -# else - output_layers[0].data = target_tile.buffer + pixel_offset; - output_layers[0].width = target_tile.w; - output_layers[0].height = target_tile.h; - output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride; - output_layers[0].pixelStrideInBytes = pixel_stride; -# endif - output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3; - -# if OPTIX_ABI_VERSION >= 47 - OptixDenoiserLayer image_layers = {}; - image_layers.input = input_layers[0]; - image_layers.output = output_layers[0]; - - OptixDenoiserGuideLayer guide_layers = {}; - guide_layers.albedo = 
input_layers[1]; - guide_layers.normal = input_layers[2]; -# endif - - // Finally run denonising - OptixDenoiserParams params = {}; // All parameters are disabled/zero -# if OPTIX_ABI_VERSION >= 47 - check_result_optix_ret(optixDenoiserInvoke(denoiser, - NULL, - ¶ms, - denoiser_state.device_pointer, - scratch_offset, - &guide_layers, - &image_layers, - 1, - overlap_offset.x, - overlap_offset.y, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); -# else - check_result_optix_ret(optixDenoiserInvoke(denoiser, - NULL, - ¶ms, - denoiser_state.device_pointer, - scratch_offset, - input_layers, - task.denoising.input_passes, - overlap_offset.x, - overlap_offset.y, - output_layers, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); -# endif - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - void *output_args[] = {&input_ptr, - &target_tile.buffer, - &output_offset.x, - &output_offset.y, - &rect_size.x, - &rect_size.y, - &target_tile.x, - &target_tile.y, - &target_tile.w, - &target_tile.h, - &target_tile.offset, - &target_tile.stride, - &task.pass_stride, - &rtile.sample}; - launch_filter_kernel( - "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args); -# endif - - check_result_cuda_ret(cuStreamSynchronize(0)); - - task.unmap_neighbor_tiles(neighbors, this); - } - else { - // Run CUDA denoising kernels - DenoisingTask denoising(this, task); - CUDADevice::denoise(rtile, denoising); - } - - // Update task progress after the denoiser completed processing - task.update_progress(&rtile, rtile.w * rtile.h); - - return true; - } - - void launch_shader_eval(DeviceTask &task, int thread_index) - { - unsigned int rgen_index = PG_BACK; - if (task.shader_eval_type >= SHADER_EVAL_BAKE) - rgen_index = PG_BAKE; - if (task.shader_eval_type == SHADER_EVAL_DISPLACE) - rgen_index = PG_DISP; - - const CUDAContextScope scope(cuContext); - - device_ptr launch_params_ptr = launch_params.device_pointer + - thread_index * 
launch_params.data_elements; - - for (int sample = 0; sample < task.num_samples; ++sample) { - ShaderParams params; - params.input = (uint4 *)task.shader_input; - params.output = (float4 *)task.shader_output; - params.type = task.shader_eval_type; - params.filter = task.shader_filter; - params.sx = task.shader_x; - params.offset = task.offset; - params.sample = sample; - - check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader), - ¶ms, - sizeof(params), - cuda_stream[thread_index])); - - OptixShaderBindingTable sbt_params = {}; - sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord); - sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord); - sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.missRecordCount = 1; - sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); - sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); -# if OPTIX_ABI_VERSION >= 36 - sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL -# else - sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL -# endif - sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord); - sbt_params.callablesRecordCount = 3; - sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); - - check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL], - cuda_stream[thread_index], - launch_params_ptr, - launch_params.data_elements, - &sbt_params, - task.shader_w, - 1, - 1)); - - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - - task.update_progress(NULL); - } - } - - bool build_optix_bvh(BVHOptiX *bvh, - OptixBuildOperation operation, - const OptixBuildInput &build_input, - uint16_t num_motion_steps) - { - /* Allocate and build acceleration structures only one at a time, to prevent parallel builds - * from running out of memory (since both original and compacted acceleration structure 
memory - * may be allocated at the same time for the duration of this function). The builds would - * otherwise happen on the same CUDA stream anyway. */ - static thread_mutex mutex; - thread_scoped_lock lock(mutex); - - const CUDAContextScope scope(cuContext); - - const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC); - - // Compute memory usage - OptixAccelBufferSizes sizes = {}; - OptixAccelBuildOptions options = {}; - options.operation = operation; - if (use_fast_trace_bvh) { - VLOG(2) << "Using fast to trace OptiX BVH"; - options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION; - } - else { - VLOG(2) << "Using fast to update OptiX BVH"; - options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE; - } - - options.motionOptions.numKeys = num_motion_steps; - options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH; - options.motionOptions.timeBegin = 0.0f; - options.motionOptions.timeEnd = 1.0f; - - check_result_optix_ret( - optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); - - // Allocate required output buffers - device_only_memory<char> temp_mem(this, "optix temp as build mem", true); - temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8); - if (!temp_mem.device_pointer) - return false; // Make sure temporary memory allocation succeeded - - // Acceleration structure memory has to be allocated on the device (not allowed to be on host) - device_only_memory<char> &out_data = bvh->as_data; - if (operation == OPTIX_BUILD_OPERATION_BUILD) { - assert(out_data.device == this); - out_data.alloc_to_device(sizes.outputSizeInBytes); - if (!out_data.device_pointer) - return false; - } - else { - assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes); - } - - // Finally build the acceleration structure - OptixAccelEmitDesc compacted_size_prop = {}; - compacted_size_prop.type = 
OPTIX_PROPERTY_TYPE_COMPACTED_SIZE; - // A tiny space was allocated for this property at the end of the temporary buffer above - // Make sure this pointer is 8-byte aligned - compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8); - - OptixTraversableHandle out_handle = 0; - check_result_optix_ret(optixAccelBuild(context, - NULL, - &options, - &build_input, - 1, - temp_mem.device_pointer, - sizes.tempSizeInBytes, - out_data.device_pointer, - sizes.outputSizeInBytes, - &out_handle, - use_fast_trace_bvh ? &compacted_size_prop : NULL, - use_fast_trace_bvh ? 1 : 0)); - bvh->traversable_handle = static_cast<uint64_t>(out_handle); - - // Wait for all operations to finish - check_result_cuda_ret(cuStreamSynchronize(NULL)); - - // Compact acceleration structure to save memory (only if using fast trace as the - // OPTIX_BUILD_FLAG_ALLOW_COMPACTION flag is only set in this case). - if (use_fast_trace_bvh) { - uint64_t compacted_size = sizes.outputSizeInBytes; - check_result_cuda_ret( - cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size))); - - // Temporary memory is no longer needed, so free it now to make space - temp_mem.free(); - - // There is no point compacting if the size does not change - if (compacted_size < sizes.outputSizeInBytes) { - device_only_memory<char> compacted_data(this, "optix compacted as", false); - compacted_data.alloc_to_device(compacted_size); - if (!compacted_data.device_pointer) - // Do not compact if memory allocation for compacted acceleration structure fails - // Can just use the uncompacted one then, so succeed here regardless - return true; - - check_result_optix_ret(optixAccelCompact(context, - NULL, - out_handle, - compacted_data.device_pointer, - compacted_size, - &out_handle)); - bvh->traversable_handle = static_cast<uint64_t>(out_handle); - - // Wait for compaction to finish - check_result_cuda_ret(cuStreamSynchronize(NULL)); - - std::swap(out_data.device_size, 
compacted_data.device_size); - std::swap(out_data.device_pointer, compacted_data.device_pointer); - // Original acceleration structure memory is freed when 'compacted_data' goes out of scope - } - } - - return true; - } - - void build_bvh(BVH *bvh, Progress &progress, bool refit) override - { - if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) { - /* For baking CUDA is used, build appropriate BVH for that. */ - Device::build_bvh(bvh, progress, refit); - return; - } - - const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC); - - free_bvh_memory_delayed(); - - BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); - - progress.set_substatus("Building OptiX acceleration structure"); - - if (!bvh->params.top_level) { - assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1); - - OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD; - /* Refit is only possible when using fast to trace BVH (because AS is built with - * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). 
*/ - if (refit && !use_fast_trace_bvh) { - assert(bvh_optix->traversable_handle != 0); - operation = OPTIX_BUILD_OPERATION_UPDATE; - } - else { - bvh_optix->as_data.free(); - bvh_optix->traversable_handle = 0; - } - - // Build bottom level acceleration structures (BLAS) - Geometry *const geom = bvh->geometry[0]; - if (geom->geometry_type == Geometry::HAIR) { - // Build BLAS for curve primitives - Hair *const hair = static_cast<Hair *const>(geom); - if (hair->num_curves() == 0) { - return; - } - - const size_t num_segments = hair->num_segments(); - - size_t num_motion_steps = 1; - Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && hair->get_use_motion_blur() && motion_keys) { - num_motion_steps = hair->get_motion_steps(); - } - - device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY); -# if OPTIX_ABI_VERSION >= 36 - device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); - device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); - // Four control points for each curve segment - const size_t num_vertices = num_segments * 4; - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - index_data.alloc(num_segments); - vertex_data.alloc(num_vertices * num_motion_steps); - } - else -# endif - aabb_data.alloc(num_segments * num_motion_steps); - - // Get AABBs for each motion step - for (size_t step = 0; step < num_motion_steps; ++step) { - // The center step for motion vertices is not stored in the attribute - const float3 *keys = hair->get_curve_keys().data(); - size_t center_step = (num_motion_steps - 1) / 2; - if (step != center_step) { - size_t attr_offset = (step > center_step) ? 
step - 1 : step; - // Technically this is a float4 array, but sizeof(float3) == sizeof(float4) - keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size(); - } - - for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) { - const Hair::Curve curve = hair->get_curve(j); -# if OPTIX_ABI_VERSION >= 36 - const array<float> &curve_radius = hair->get_curve_radius(); -# endif - - for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) { -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - int k0 = curve.first_key + segment; - int k1 = k0 + 1; - int ka = max(k0 - 1, curve.first_key); - int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1); - - const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x); - const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y); - const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z); - const float4 pw = make_float4( - curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]); - - // Convert Catmull-Rom data to Bezier spline - static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f; - static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f; - static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f; - static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f; - - index_data[i] = i * 4; - float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; - v[0] = make_float4( - dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw)); - v[1] = make_float4( - dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw)); - v[2] = make_float4( - dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw)); - v[3] = make_float4( - dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw)); - } - else -# endif - { - BoundBox bounds = BoundBox::empty; - curve.bounds_grow(segment, keys, 
hair->get_curve_radius().data(), bounds); - - const size_t index = step * num_segments + i; - aabb_data[index].minX = bounds.min.x; - aabb_data[index].minY = bounds.min.y; - aabb_data[index].minZ = bounds.min.z; - aabb_data[index].maxX = bounds.max.x; - aabb_data[index].maxY = bounds.max.y; - aabb_data[index].maxZ = bounds.max.z; - } - } - } - } - - // Upload AABB data to GPU - aabb_data.copy_to_device(); -# if OPTIX_ABI_VERSION >= 36 - index_data.copy_to_device(); - vertex_data.copy_to_device(); -# endif - - vector<device_ptr> aabb_ptrs; - aabb_ptrs.reserve(num_motion_steps); -# if OPTIX_ABI_VERSION >= 36 - vector<device_ptr> width_ptrs; - vector<device_ptr> vertex_ptrs; - width_ptrs.reserve(num_motion_steps); - vertex_ptrs.reserve(num_motion_steps); -# endif - for (size_t step = 0; step < num_motion_steps; ++step) { - aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb)); -# if OPTIX_ABI_VERSION >= 36 - const device_ptr base_ptr = vertex_data.device_pointer + - step * num_vertices * sizeof(float4); - width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size - vertex_ptrs.push_back(base_ptr); -# endif - } - - // Force a single any-hit call, so shadow record-all behavior works correctly - unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; - OptixBuildInput build_input = {}; -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES; - build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; - build_input.curveArray.numPrimitives = num_segments; - build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); - build_input.curveArray.numVertices = num_vertices; - build_input.curveArray.vertexStrideInBytes = sizeof(float4); - build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data(); - build_input.curveArray.widthStrideInBytes = sizeof(float4); - 
build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer; - build_input.curveArray.indexStrideInBytes = sizeof(int); - build_input.curveArray.flag = build_flags; - build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset; - } - else -# endif - { - // Disable visibility test any-hit program, since it is already checked during - // intersection. Those trace calls that require anyhit can force it with a ray flag. - build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; - - build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; -# if OPTIX_ABI_VERSION < 23 - build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); - build_input.aabbArray.numPrimitives = num_segments; - build_input.aabbArray.strideInBytes = sizeof(OptixAabb); - build_input.aabbArray.flags = &build_flags; - build_input.aabbArray.numSbtRecords = 1; - build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset; -# else - build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); - build_input.customPrimitiveArray.numPrimitives = num_segments; - build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); - build_input.customPrimitiveArray.flags = &build_flags; - build_input.customPrimitiveArray.numSbtRecords = 1; - build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset; -# endif - } - - if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { - progress.set_error("Failed to build OptiX acceleration structure"); - } - } - else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) { - // Build BLAS for triangle primitives - Mesh *const mesh = static_cast<Mesh *const>(geom); - if (mesh->num_triangles() == 0) { - return; - } - - const size_t num_verts = mesh->get_verts().size(); - - size_t num_motion_steps = 1; - Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && mesh->get_use_motion_blur() && 
motion_keys) { - num_motion_steps = mesh->get_motion_steps(); - } - - device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); - index_data.alloc(mesh->get_triangles().size()); - memcpy(index_data.data(), - mesh->get_triangles().data(), - mesh->get_triangles().size() * sizeof(int)); - device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); - vertex_data.alloc(num_verts * num_motion_steps); - - for (size_t step = 0; step < num_motion_steps; ++step) { - const float3 *verts = mesh->get_verts().data(); - - size_t center_step = (num_motion_steps - 1) / 2; - // The center step for motion vertices is not stored in the attribute - if (step != center_step) { - verts = motion_keys->data_float3() + - (step > center_step ? step - 1 : step) * num_verts; - } - - memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3)); - } - - // Upload triangle data to GPU - index_data.copy_to_device(); - vertex_data.copy_to_device(); - - vector<device_ptr> vertex_ptrs; - vertex_ptrs.reserve(num_motion_steps); - for (size_t step = 0; step < num_motion_steps; ++step) { - vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3)); - } - - // Force a single any-hit call, so shadow record-all behavior works correctly - unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; - OptixBuildInput build_input = {}; - build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES; - build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); - build_input.triangleArray.numVertices = num_verts; - build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3; - build_input.triangleArray.vertexStrideInBytes = sizeof(float3); - build_input.triangleArray.indexBuffer = index_data.device_pointer; - build_input.triangleArray.numIndexTriplets = mesh->num_triangles(); - build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; - 
build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int); - build_input.triangleArray.flags = &build_flags; - // The SBT does not store per primitive data since Cycles already allocates separate - // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in - // one and rely on that having the same meaning in this case. - build_input.triangleArray.numSbtRecords = 1; - build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset; - - if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { - progress.set_error("Failed to build OptiX acceleration structure"); - } - } - } - else { - unsigned int num_instances = 0; - unsigned int max_num_instances = 0xFFFFFFFF; - - bvh_optix->as_data.free(); - bvh_optix->traversable_handle = 0; - bvh_optix->motion_transform_data.free(); - - optixDeviceContextGetProperty(context, - OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID, - &max_num_instances, - sizeof(max_num_instances)); - // Do not count first bit, which is used to distinguish instanced and non-instanced objects - max_num_instances >>= 1; - if (bvh->objects.size() > max_num_instances) { - progress.set_error( - "Failed to build OptiX acceleration structure because there are too many instances"); - return; - } - - // Fill instance descriptions -# if OPTIX_ABI_VERSION < 41 - device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY); - aabbs.alloc(bvh->objects.size()); -# endif - device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY); - instances.alloc(bvh->objects.size()); - - // Calculate total motion transform size and allocate memory for them - size_t motion_transform_offset = 0; - if (motion_blur) { - size_t total_motion_transform_size = 0; - for (Object *const ob : bvh->objects) { - if (ob->is_traceable() && ob->use_motion()) { - total_motion_transform_size = align_up(total_motion_transform_size, - OPTIX_TRANSFORM_BYTE_ALIGNMENT); - const size_t motion_keys = 
max(ob->get_motion().size(), 2) - 2; - total_motion_transform_size = total_motion_transform_size + - sizeof(OptixSRTMotionTransform) + - motion_keys * sizeof(OptixSRTData); - } - } - - assert(bvh_optix->motion_transform_data.device == this); - bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size); - } - - for (Object *ob : bvh->objects) { - // Skip non-traceable objects - if (!ob->is_traceable()) - continue; - - BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh); - OptixTraversableHandle handle = blas->traversable_handle; - -# if OPTIX_ABI_VERSION < 41 - OptixAabb &aabb = aabbs[num_instances]; - aabb.minX = ob->bounds.min.x; - aabb.minY = ob->bounds.min.y; - aabb.minZ = ob->bounds.min.z; - aabb.maxX = ob->bounds.max.x; - aabb.maxY = ob->bounds.max.y; - aabb.maxZ = ob->bounds.max.z; -# endif - - OptixInstance &instance = instances[num_instances++]; - memset(&instance, 0, sizeof(instance)); - - // Clear transform to identity matrix - instance.transform[0] = 1.0f; - instance.transform[5] = 1.0f; - instance.transform[10] = 1.0f; - - // Set user instance ID to object index (but leave low bit blank) - instance.instanceId = ob->get_device_index() << 1; - - // Have to have at least one bit in the mask, or else instance would always be culled - instance.visibilityMask = 1; - - if (ob->get_geometry()->has_volume) { - // Volumes have a special bit set in the visibility mask so a trace can mask only volumes - instance.visibilityMask |= 2; - } - - if (ob->get_geometry()->geometry_type == Geometry::HAIR) { - // Same applies to curves (so they can be skipped in local trace calls) - instance.visibilityMask |= 4; - -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur && ob->get_geometry()->has_motion_blur() && - DebugFlags().optix.curves_api && - static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { - // Select between motion blur and non-motion blur built-in intersection module - instance.sbtOffset = PG_HITD_MOTION - 
PG_HITD; - } -# endif - } - - // Insert motion traversable if object has motion - if (motion_blur && ob->use_motion()) { - size_t motion_keys = max(ob->get_motion().size(), 2) - 2; - size_t motion_transform_size = sizeof(OptixSRTMotionTransform) + - motion_keys * sizeof(OptixSRTData); - - const CUDAContextScope scope(cuContext); - - motion_transform_offset = align_up(motion_transform_offset, - OPTIX_TRANSFORM_BYTE_ALIGNMENT); - CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer + - motion_transform_offset; - motion_transform_offset += motion_transform_size; - - // Allocate host side memory for motion transform and fill it with transform data - OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>( - new uint8_t[motion_transform_size]); - motion_transform.child = handle; - motion_transform.motionOptions.numKeys = ob->get_motion().size(); - motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE; - motion_transform.motionOptions.timeBegin = 0.0f; - motion_transform.motionOptions.timeEnd = 1.0f; - - OptixSRTData *const srt_data = motion_transform.srtData; - array<DecomposedTransform> decomp(ob->get_motion().size()); - transform_motion_decompose( - decomp.data(), ob->get_motion().data(), ob->get_motion().size()); - - for (size_t i = 0; i < ob->get_motion().size(); ++i) { - // Scale - srt_data[i].sx = decomp[i].y.w; // scale.x.x - srt_data[i].sy = decomp[i].z.w; // scale.y.y - srt_data[i].sz = decomp[i].w.w; // scale.z.z - - // Shear - srt_data[i].a = decomp[i].z.x; // scale.x.y - srt_data[i].b = decomp[i].z.y; // scale.x.z - srt_data[i].c = decomp[i].w.x; // scale.y.z - assert(decomp[i].z.z == 0.0f); // scale.y.x - assert(decomp[i].w.y == 0.0f); // scale.z.x - assert(decomp[i].w.z == 0.0f); // scale.z.y - - // Pivot point - srt_data[i].pvx = 0.0f; - srt_data[i].pvy = 0.0f; - srt_data[i].pvz = 0.0f; - - // Rotation - srt_data[i].qx = decomp[i].x.x; - srt_data[i].qy = decomp[i].x.y; - srt_data[i].qz 
= decomp[i].x.z; - srt_data[i].qw = decomp[i].x.w; - - // Translation - srt_data[i].tx = decomp[i].y.x; - srt_data[i].ty = decomp[i].y.y; - srt_data[i].tz = decomp[i].y.z; - } - - // Upload motion transform to GPU - cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size); - delete[] reinterpret_cast<uint8_t *>(&motion_transform); - - // Disable instance transform if object uses motion transform already - instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; - - // Get traversable handle to motion transform - optixConvertPointerToTraversableHandle(context, - motion_transform_gpu, - OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM, - &instance.traversableHandle); - } - else { - instance.traversableHandle = handle; - - if (ob->get_geometry()->is_instanced()) { - // Set transform matrix - memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform)); - } - else { - // Disable instance transform if geometry already has it applied to vertex data - instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; - // Non-instanced objects read ID from 'prim_object', so distinguish - // them from instanced objects with the low bit set - instance.instanceId |= 1; - } - } - } - - // Upload instance descriptions -# if OPTIX_ABI_VERSION < 41 - aabbs.resize(num_instances); - aabbs.copy_to_device(); -# endif - instances.resize(num_instances); - instances.copy_to_device(); - - // Build top-level acceleration structure (TLAS) - OptixBuildInput build_input = {}; - build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES; -# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2 - build_input.instanceArray.aabbs = aabbs.device_pointer; - build_input.instanceArray.numAabbs = num_instances; -# endif - build_input.instanceArray.instances = instances.device_pointer; - build_input.instanceArray.numInstances = num_instances; - - if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) { - progress.set_error("Failed to build 
OptiX acceleration structure"); - } - tlas_handle = bvh_optix->traversable_handle; - } - } - - void release_optix_bvh(BVH *bvh) override - { - thread_scoped_lock lock(delayed_free_bvh_mutex); - /* Do delayed free of BVH memory, since geometry holding BVH might be deleted - * while GPU is still rendering. */ - BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); - - delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data)); - delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data)); - bvh_optix->traversable_handle = 0; - } - - void free_bvh_memory_delayed() - { - thread_scoped_lock lock(delayed_free_bvh_mutex); - delayed_free_bvh_memory.free_memory(); - } - - void const_copy_to(const char *name, void *host, size_t size) override - { - // Set constant memory for CUDA module - // TODO(pmours): This is only used for tonemapping (see 'film_convert'). - // Could be removed by moving those functions to filter CUDA module. - CUDADevice::const_copy_to(name, host, size); - - if (strcmp(name, "__data") == 0) { - assert(size <= sizeof(KernelData)); - - // Update traversable handle (since it is different for each device on multi devices) - KernelData *const data = (KernelData *)host; - *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; - - update_launch_params(offsetof(KernelParams, data), host, size); - return; - } - - // Update data storage pointers in launch parameters -# define KERNEL_TEX(data_type, tex_name) \ - if (strcmp(name, #tex_name) == 0) { \ - update_launch_params(offsetof(KernelParams, tex_name), host, size); \ - return; \ - } -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX - } - - void update_launch_params(size_t offset, void *data, size_t data_size) - { - const CUDAContextScope scope(cuContext); - - for (int i = 0; i < info.cpu_threads; ++i) - check_result_cuda( - cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset, - data, - data_size)); - } - - void task_add(DeviceTask 
&task) override - { - // Upload texture information to device if it has changed since last launch - load_texture_info(); - - if (task.type == DeviceTask::FILM_CONVERT) { - // Execute in main thread because of OpenGL access - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - return; - } - - if (task.type == DeviceTask::DENOISE_BUFFER) { - // Execute denoising in a single thread (e.g. to avoid race conditions during creation) - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy, 0); - }); - return; - } - - // Split task into smaller ones - list<DeviceTask> tasks; - task.split(tasks, info.cpu_threads); - - // Queue tasks in internal task pool - int task_index = 0; - for (DeviceTask &task : tasks) { - task_pool.push([=] { - // Using task index parameter instead of thread index, since number of CUDA streams may - // differ from number of threads - DeviceTask task_copy = task; - thread_run(task_copy, task_index); - }); - task_index++; - } - } - - void task_wait() override - { - // Wait for all queued tasks to finish - task_pool.wait_work(); - } - - void task_cancel() override - { - // Cancel any remaining tasks in the internal pool - task_pool.cancel(); - } -}; - -bool device_optix_init() -{ - if (g_optixFunctionTable.optixDeviceContextCreate != NULL) - return true; // Already initialized function table - - // Need to initialize CUDA as well - if (!device_cuda_init()) - return false; - - const OptixResult result = optixInit(); - - if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { - VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. " - "Please update to the latest driver first!"; - return false; - } - else if (result != OPTIX_SUCCESS) { - VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result; - return false; - } - - // Loaded OptiX successfully! 
- return true; -} - -void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices) -{ - devices.reserve(cuda_devices.size()); - - // Simply add all supported CUDA devices as OptiX devices again - for (DeviceInfo info : cuda_devices) { - assert(info.type == DEVICE_CUDA); - - int major; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num); - if (major < 5) { - continue; // Only Maxwell and up are supported by OptiX - } - - info.type = DEVICE_OPTIX; - info.id += "_OptiX"; - info.denoisers |= DENOISER_OPTIX; - info.has_branched_path = false; - - devices.push_back(info); - } -} - -Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return new OptiXDevice(info, stats, profiler, background); -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/device_queue.cpp b/intern/cycles/device/device_queue.cpp new file mode 100644 index 00000000000..a89ba68d62c --- /dev/null +++ b/intern/cycles/device/device_queue.cpp @@ -0,0 +1,87 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/device_queue.h" + +#include "util/util_algorithm.h" +#include "util/util_logging.h" +#include "util/util_time.h" + +#include <iomanip> + +CCL_NAMESPACE_BEGIN + +DeviceQueue::DeviceQueue(Device *device) + : device(device), last_kernels_enqueued_(0), last_sync_time_(0.0) +{ + DCHECK_NE(device, nullptr); +} + +DeviceQueue::~DeviceQueue() +{ + if (VLOG_IS_ON(3)) { + /* Print kernel execution times sorted by time. */ + vector<pair<DeviceKernelMask, double>> stats_sorted; + for (const auto &stat : stats_kernel_time_) { + stats_sorted.push_back(stat); + } + + sort(stats_sorted.begin(), + stats_sorted.end(), + [](const pair<DeviceKernelMask, double> &a, const pair<DeviceKernelMask, double> &b) { + return a.second > b.second; + }); + + VLOG(3) << "GPU queue stats:"; + for (const auto &[mask, time] : stats_sorted) { + VLOG(3) << " " << std::setfill(' ') << std::setw(10) << std::fixed << std::setprecision(5) + << std::right << time << "s: " << device_kernel_mask_as_string(mask); + } + } +} + +void DeviceQueue::debug_init_execution() +{ + if (VLOG_IS_ON(3)) { + last_sync_time_ = time_dt(); + last_kernels_enqueued_ = 0; + } +} + +void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size) +{ + if (VLOG_IS_ON(3)) { + VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size " + << work_size; + last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel); + } +} + +void DeviceQueue::debug_synchronize() +{ + if (VLOG_IS_ON(3)) { + const double new_time = time_dt(); + const double elapsed_time = new_time - last_sync_time_; + VLOG(4) << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s"; + + stats_kernel_time_[last_kernels_enqueued_] += elapsed_time; + + last_sync_time_ = new_time; + last_kernels_enqueued_ = 0; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h new file mode 100644 index 00000000000..edda3e61d51 --- /dev/null +++ 
b/intern/cycles/device/device_queue.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/device_kernel.h" + +#include "device/device_graphics_interop.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class device_memory; + +struct KernelWorkTile; + +/* Abstraction of a command queue for a device. + * Provides API to schedule kernel execution in a specific queue with minimal possible overhead + * from driver side. + * + * This class encapsulates all properties needed for commands execution. */ +class DeviceQueue { + public: + virtual ~DeviceQueue(); + + /* Number of concurrent states to process for integrator, + * based on number of cores and/or available memory. */ + virtual int num_concurrent_states(const size_t state_size) const = 0; + + /* Number of states which keeps the device occupied with work without loosing performance. + * The renderer will add more work (when available) when number of active paths falls below this + * value. */ + virtual int num_concurrent_busy_states() const = 0; + + /* Initialize execution of kernels on this queue. + * + * Will, for example, load all data required by the kernels from Device to global or path state. + * + * Use this method after device synchronization has finished before enqueueing any kernels. 
*/ + virtual void init_execution() = 0; + + /* Test if an optional device kernel is available. */ + virtual bool kernel_available(DeviceKernel kernel) const = 0; + + /* Enqueue kernel execution. + * + * Execute the kernel work_size times on the device. + * Supported arguments types: + * - int: pass pointer to the int + * - device memory: pass pointer to device_memory.device_pointer + * Return false if there was an error executing this or a previous kernel. */ + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) = 0; + + /* Wait unit all enqueued kernels have finished execution. + * Return false if there was an error executing any of the enqueued kernels. */ + virtual bool synchronize() = 0; + + /* Copy memory to/from device as part of the command queue, to ensure + * operations are done in order without having to synchronize. */ + virtual void zero_to_device(device_memory &mem) = 0; + virtual void copy_to_device(device_memory &mem) = 0; + virtual void copy_from_device(device_memory &mem) = 0; + + /* Graphics resources interoperability. + * + * The interoperability comes here by the meaning that the device is capable of computing result + * directly into an OpenGL (or other graphics library) buffer. */ + + /* Create graphics interoperability context which will be taking care of mapping graphics + * resource as a buffer writable by kernels of this device. */ + virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() + { + LOG(FATAL) << "Request of GPU interop of a device which does not support it."; + return nullptr; + } + + /* Device this queue has been created for. */ + Device *device; + + protected: + /* Hide construction so that allocation via `Device` API is enforced. */ + explicit DeviceQueue(Device *device); + + /* Implementations call these from the corresponding methods to generate debugging logs. 
*/ + void debug_init_execution(); + void debug_enqueue(DeviceKernel kernel, const int work_size); + void debug_synchronize(); + + /* Combination of kernels enqueued together sync last synchronize. */ + DeviceKernelMask last_kernels_enqueued_; + /* Time of synchronize call. */ + double last_sync_time_; + /* Accumulated execution time for combinations of kernels launched together. */ + map<DeviceKernelMask, double> stats_kernel_time_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp deleted file mode 100644 index 9889f688aaa..00000000000 --- a/intern/cycles/device/device_split_kernel.cpp +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "device/device_split_kernel.h" - -#include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data_types.h" - -#include "util/util_logging.h" -#include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -static const double alpha = 0.1; /* alpha for rolling average */ - -DeviceSplitKernel::DeviceSplitKernel(Device *device) - : device(device), - split_data(device, "split_data"), - ray_state(device, "ray_state", MEM_READ_WRITE), - queue_index(device, "queue_index"), - use_queues_flag(device, "use_queues_flag"), - work_pool_wgs(device, "work_pool_wgs"), - kernel_data_initialized(false) -{ - avg_time_per_sample = 0.0; - - kernel_path_init = NULL; - kernel_scene_intersect = NULL; - kernel_lamp_emission = NULL; - kernel_do_volume = NULL; - kernel_queue_enqueue = NULL; - kernel_indirect_background = NULL; - kernel_shader_setup = NULL; - kernel_shader_sort = NULL; - kernel_shader_eval = NULL; - kernel_holdout_emission_blurring_pathtermination_ao = NULL; - kernel_subsurface_scatter = NULL; - kernel_direct_lighting = NULL; - kernel_shadow_blocked_ao = NULL; - kernel_shadow_blocked_dl = NULL; - kernel_enqueue_inactive = NULL; - kernel_next_iteration_setup = NULL; - kernel_indirect_subsurface = NULL; - kernel_buffer_update = NULL; - kernel_adaptive_stopping = NULL; - kernel_adaptive_filter_x = NULL; - kernel_adaptive_filter_y = NULL; - kernel_adaptive_adjust_samples = NULL; -} - -DeviceSplitKernel::~DeviceSplitKernel() -{ - split_data.free(); - ray_state.free(); - use_queues_flag.free(); - queue_index.free(); - work_pool_wgs.free(); - - delete kernel_path_init; - delete kernel_scene_intersect; - delete kernel_lamp_emission; - delete kernel_do_volume; - delete kernel_queue_enqueue; - delete kernel_indirect_background; - delete kernel_shader_setup; - delete kernel_shader_sort; - delete kernel_shader_eval; - delete kernel_holdout_emission_blurring_pathtermination_ao; - delete kernel_subsurface_scatter; - delete kernel_direct_lighting; - delete 
kernel_shadow_blocked_ao; - delete kernel_shadow_blocked_dl; - delete kernel_enqueue_inactive; - delete kernel_next_iteration_setup; - delete kernel_indirect_subsurface; - delete kernel_buffer_update; - delete kernel_adaptive_stopping; - delete kernel_adaptive_filter_x; - delete kernel_adaptive_filter_y; - delete kernel_adaptive_adjust_samples; -} - -bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features) -{ -#define LOAD_KERNEL(name) \ - kernel_##name = get_split_kernel_function(#name, requested_features); \ - if (!kernel_##name) { \ - device->set_error(string("Split kernel error: failed to load kernel_") + #name); \ - return false; \ - } - - LOAD_KERNEL(path_init); - LOAD_KERNEL(scene_intersect); - LOAD_KERNEL(lamp_emission); - if (requested_features.use_volume) { - LOAD_KERNEL(do_volume); - } - LOAD_KERNEL(queue_enqueue); - LOAD_KERNEL(indirect_background); - LOAD_KERNEL(shader_setup); - LOAD_KERNEL(shader_sort); - LOAD_KERNEL(shader_eval); - LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); - LOAD_KERNEL(subsurface_scatter); - LOAD_KERNEL(direct_lighting); - LOAD_KERNEL(shadow_blocked_ao); - LOAD_KERNEL(shadow_blocked_dl); - LOAD_KERNEL(enqueue_inactive); - LOAD_KERNEL(next_iteration_setup); - LOAD_KERNEL(indirect_subsurface); - LOAD_KERNEL(buffer_update); - LOAD_KERNEL(adaptive_stopping); - LOAD_KERNEL(adaptive_filter_x); - LOAD_KERNEL(adaptive_filter_y); - LOAD_KERNEL(adaptive_adjust_samples); - -#undef LOAD_KERNEL - - /* Re-initialiaze kernel-dependent data when kernels change. */ - kernel_data_initialized = false; - - return true; -} - -size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg, - device_memory &data, - uint64_t max_buffer_size) -{ - uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; - VLOG(1) << "Split state element size: " << string_human_readable_number(size_per_element) - << " bytes. 
(" << string_human_readable_size(size_per_element) << ")."; - return max_buffer_size / size_per_element; -} - -bool DeviceSplitKernel::path_trace(DeviceTask &task, - RenderTile &tile, - device_memory &kgbuffer, - device_memory &kernel_data) -{ - if (device->have_error()) { - return false; - } - - /* Allocate all required global memory once. */ - if (!kernel_data_initialized) { - kernel_data_initialized = true; - - /* Set local size */ - int2 lsize = split_kernel_local_size(); - local_size[0] = lsize[0]; - local_size[1] = lsize[1]; - - /* Set global size */ - int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task); - - /* Make sure that set work size is a multiple of local - * work size dimensions. - */ - global_size[0] = round_up(gsize[0], local_size[0]); - global_size[1] = round_up(gsize[1], local_size[1]); - - int num_global_elements = global_size[0] * global_size[1]; - assert(num_global_elements % WORK_POOL_SIZE == 0); - - /* Calculate max groups */ - - /* Denotes the maximum work groups possible w.r.t. current requested tile size. */ - unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU : - WORK_POOL_SIZE_GPU; - unsigned int max_work_groups = num_global_elements / work_pool_size + 1; - - /* Allocate work_pool_wgs memory. 
*/ - work_pool_wgs.alloc_to_device(max_work_groups); - queue_index.alloc_to_device(NUM_QUEUES); - use_queues_flag.alloc_to_device(1); - split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements)); - ray_state.alloc(num_global_elements); - } - - /* Number of elements in the global state buffer */ - int num_global_elements = global_size[0] * global_size[1]; - -#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \ - if (device->have_error()) { \ - return false; \ - } \ - if (!kernel_##name->enqueue( \ - KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \ - return false; \ - } - - tile.sample = tile.start_sample; - - /* for exponential increase between tile updates */ - int time_multiplier = 1; - - while (tile.sample < tile.start_sample + tile.num_samples) { - /* to keep track of how long it takes to run a number of samples */ - double start_time = time_dt(); - - /* initial guess to start rolling average */ - const int initial_num_samples = 1; - /* approx number of samples per second */ - const int samples_per_second = (avg_time_per_sample > 0.0) ? - int(double(time_multiplier) / avg_time_per_sample) + 1 : - initial_num_samples; - - RenderTile subtile = tile; - subtile.start_sample = tile.sample; - subtile.num_samples = samples_per_second; - - if (task.adaptive_sampling.use) { - subtile.num_samples = task.adaptive_sampling.align_samples(subtile.start_sample, - subtile.num_samples); - } - - /* Don't go beyond requested number of samples. 
*/ - subtile.num_samples = min(subtile.num_samples, - tile.start_sample + tile.num_samples - tile.sample); - - if (device->have_error()) { - return false; - } - - /* reset state memory here as global size for data_init - * kernel might not be large enough to do in kernel - */ - work_pool_wgs.zero_to_device(); - split_data.zero_to_device(); - ray_state.zero_to_device(); - - if (!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), - subtile, - num_global_elements, - kgbuffer, - kernel_data, - split_data, - ray_state, - queue_index, - use_queues_flag, - work_pool_wgs)) { - return false; - } - - ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size); - - bool activeRaysAvailable = true; - double cancel_time = DBL_MAX; - - while (activeRaysAvailable) { - /* Do path-iteration in host [Enqueue Path-iteration kernels. */ - for (int PathIter = 0; PathIter < 16; PathIter++) { - ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); - if (kernel_do_volume) { - ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size); - } - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); - ENQUEUE_SPLIT_KERNEL( - holdout_emission_blurring_pathtermination_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(next_iteration_setup, 
global_size, local_size); - ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size); - - if (task.get_cancel() && cancel_time == DBL_MAX) { - /* Wait up to twice as many seconds for current samples to finish - * to avoid artifacts in render result from ending too soon. - */ - cancel_time = time_dt() + 2.0 * time_multiplier; - } - - if (time_dt() > cancel_time) { - return true; - } - } - - /* Decide if we should exit path-iteration in host. */ - ray_state.copy_from_device(0, global_size[0] * global_size[1], 1); - - activeRaysAvailable = false; - - for (int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) { - if (!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) { - if (IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) { - /* Something went wrong, abort to avoid looping endlessly. */ - device->set_error("Split kernel error: invalid ray state"); - return false; - } - - /* Not all rays are RAY_INACTIVE. 
*/ - activeRaysAvailable = true; - break; - } - } - - if (time_dt() > cancel_time) { - return true; - } - } - - int filter_sample = tile.sample + subtile.num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - size_t buffer_size[2]; - buffer_size[0] = round_up(tile.w, local_size[0]); - buffer_size[1] = round_up(tile.h, local_size[1]); - kernel_adaptive_stopping->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - buffer_size[0] = round_up(tile.h, local_size[0]); - buffer_size[1] = round_up(1, local_size[1]); - kernel_adaptive_filter_x->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - buffer_size[0] = round_up(tile.w, local_size[0]); - buffer_size[1] = round_up(1, local_size[1]); - kernel_adaptive_filter_y->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - } - - double time_per_sample = ((time_dt() - start_time) / subtile.num_samples); - - if (avg_time_per_sample == 0.0) { - /* start rolling average */ - avg_time_per_sample = time_per_sample; - } - else { - avg_time_per_sample = alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample; - } - -#undef ENQUEUE_SPLIT_KERNEL - - tile.sample += subtile.num_samples; - task.update_progress(&tile, tile.w * tile.h * subtile.num_samples); - - time_multiplier = min(time_multiplier << 1, 10); - - if (task.get_cancel()) { - return true; - } - } - - if (task.adaptive_sampling.use) { - /* Reset the start samples. 
*/ - RenderTile subtile = tile; - subtile.start_sample = tile.start_sample; - subtile.num_samples = tile.sample - tile.start_sample; - enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), - subtile, - num_global_elements, - kgbuffer, - kernel_data, - split_data, - ray_state, - queue_index, - use_queues_flag, - work_pool_wgs); - size_t buffer_size[2]; - buffer_size[0] = round_up(tile.w, local_size[0]); - buffer_size[1] = round_up(tile.h, local_size[1]); - kernel_adaptive_adjust_samples->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - } - - return true; -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h deleted file mode 100644 index 07a21b10299..00000000000 --- a/intern/cycles/device/device_split_kernel.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_SPLIT_KERNEL_H__ -#define __DEVICE_SPLIT_KERNEL_H__ - -#include "device/device.h" -#include "render/buffers.h" - -CCL_NAMESPACE_BEGIN - -/* When allocate global memory in chunks. We may not be able to - * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks; - * Since some bytes may be needed for aligning chunks of memory; - * This is the amount of memory that we dedicate for that purpose. 
- */ -#define DATA_ALLOCATION_MEM_FACTOR 5000000 // 5MB - -/* Types used for split kernel */ - -class KernelDimensions { - public: - size_t global_size[2]; - size_t local_size[2]; - - KernelDimensions(size_t global_size_[2], size_t local_size_[2]) - { - memcpy(global_size, global_size_, sizeof(global_size)); - memcpy(local_size, local_size_, sizeof(local_size)); - } -}; - -class SplitKernelFunction { - public: - virtual ~SplitKernelFunction() - { - } - - /* enqueue the kernel, returns false if there is an error */ - virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) = 0; -}; - -class DeviceSplitKernel { - private: - Device *device; - - SplitKernelFunction *kernel_path_init; - SplitKernelFunction *kernel_scene_intersect; - SplitKernelFunction *kernel_lamp_emission; - SplitKernelFunction *kernel_do_volume; - SplitKernelFunction *kernel_queue_enqueue; - SplitKernelFunction *kernel_indirect_background; - SplitKernelFunction *kernel_shader_setup; - SplitKernelFunction *kernel_shader_sort; - SplitKernelFunction *kernel_shader_eval; - SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao; - SplitKernelFunction *kernel_subsurface_scatter; - SplitKernelFunction *kernel_direct_lighting; - SplitKernelFunction *kernel_shadow_blocked_ao; - SplitKernelFunction *kernel_shadow_blocked_dl; - SplitKernelFunction *kernel_enqueue_inactive; - SplitKernelFunction *kernel_next_iteration_setup; - SplitKernelFunction *kernel_indirect_subsurface; - SplitKernelFunction *kernel_buffer_update; - SplitKernelFunction *kernel_adaptive_stopping; - SplitKernelFunction *kernel_adaptive_filter_x; - SplitKernelFunction *kernel_adaptive_filter_y; - SplitKernelFunction *kernel_adaptive_adjust_samples; - - /* Global memory variables [porting]; These memory is used for - * co-operation between different kernels; Data written by one - * kernel will be available to another kernel via this global - * memory. 
- */ - device_only_memory<uchar> split_data; - device_vector<uchar> ray_state; - device_only_memory<int> - queue_index; /* Array of size num_queues that tracks the size of each queue. */ - - /* Flag to make sceneintersect and lampemission kernel use queues. */ - device_only_memory<char> use_queues_flag; - - /* Approximate time it takes to complete one sample */ - double avg_time_per_sample; - - /* Work pool with respect to each work group. */ - device_only_memory<unsigned int> work_pool_wgs; - - /* Cached kernel-dependent data, initialized once. */ - bool kernel_data_initialized; - size_t local_size[2]; - size_t global_size[2]; - - public: - explicit DeviceSplitKernel(Device *device); - virtual ~DeviceSplitKernel(); - - bool load_kernels(const DeviceRequestedFeatures &requested_features); - bool path_trace(DeviceTask &task, - RenderTile &rtile, - device_memory &kgbuffer, - device_memory &kernel_data); - - virtual uint64_t state_buffer_size(device_memory &kg, - device_memory &data, - size_t num_threads) = 0; - size_t max_elements_for_max_buffer_size(device_memory &kg, - device_memory &data, - uint64_t max_buffer_size); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data_, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs) = 0; - - virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &) = 0; - virtual int2 split_kernel_local_size() = 0; - virtual int2 split_kernel_global_size(device_memory &kg, - device_memory &data, - DeviceTask &task) = 0; -}; - -CCL_NAMESPACE_END - -#endif /* __DEVICE_SPLIT_KERNEL_H__ */ diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp deleted file mode 100644 index 55fbaa31e42..00000000000 --- 
a/intern/cycles/device/device_task.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <stdlib.h> -#include <string.h> - -#include "device/device_task.h" - -#include "render/buffers.h" - -#include "util/util_algorithm.h" -#include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -/* Device Task */ - -DeviceTask::DeviceTask(Type type_) - : type(type_), - x(0), - y(0), - w(0), - h(0), - rgba_byte(0), - rgba_half(0), - buffer(0), - sample(0), - num_samples(1), - shader_input(0), - shader_output(0), - shader_eval_type(0), - shader_filter(0), - shader_x(0), - shader_w(0), - buffers(nullptr), - tile_types(0), - denoising_from_render(false), - pass_stride(0), - frame_stride(0), - target_pass_stride(0), - pass_denoising_data(0), - pass_denoising_clean(0), - need_finish_queue(false), - integrator_branched(false) -{ - last_update_time = time_dt(); -} - -int DeviceTask::get_subtask_count(int num, int max_size) const -{ - if (max_size != 0) { - int max_size_num; - - if (type == SHADER) { - max_size_num = (shader_w + max_size - 1) / max_size; - } - else { - max_size = max(1, max_size / w); - max_size_num = (h + max_size - 1) / max_size; - } - - num = max(max_size_num, num); - } - - if (type == SHADER) { - num = min(shader_w, num); - } - else if (type == RENDER) { - } - else { - num = min(h, num); - } - - return num; -} - -void DeviceTask::split(list<DeviceTask> &tasks, int num, 
int max_size) const -{ - num = get_subtask_count(num, max_size); - - if (type == SHADER) { - for (int i = 0; i < num; i++) { - int tx = shader_x + (shader_w / num) * i; - int tw = (i == num - 1) ? shader_w - i * (shader_w / num) : shader_w / num; - - DeviceTask task = *this; - - task.shader_x = tx; - task.shader_w = tw; - - tasks.push_back(task); - } - } - else if (type == RENDER) { - for (int i = 0; i < num; i++) - tasks.push_back(*this); - } - else { - for (int i = 0; i < num; i++) { - int ty = y + (h / num) * i; - int th = (i == num - 1) ? h - i * (h / num) : h / num; - - DeviceTask task = *this; - - task.y = ty; - task.h = th; - - tasks.push_back(task); - } - } -} - -void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples) -{ - if (type == FILM_CONVERT) - return; - - if (update_progress_sample) { - if (pixel_samples == -1) { - pixel_samples = shader_w; - } - update_progress_sample(pixel_samples, rtile ? rtile->sample : 0); - } - - if (update_tile_sample) { - double current_time = time_dt(); - - if (current_time - last_update_time >= 1.0) { - update_tile_sample(*rtile); - - last_update_time = current_time; - } - } -} - -/* Adaptive Sampling */ - -AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(0) -{ -} - -/* Render samples in steps that align with the adaptive filtering. */ -int AdaptiveSampling::align_samples(int sample, int num_samples) const -{ - int end_sample = sample + num_samples; - - /* Round down end sample to the nearest sample that needs filtering. */ - end_sample &= ~(adaptive_step - 1); - - if (end_sample <= sample) { - /* In order to reach the next sample that needs filtering, we'd need - * to increase num_samples. We don't do that in this function, so - * just keep it as is and don't filter this time around. 
*/ - return num_samples; - } - return end_sample - sample; -} - -bool AdaptiveSampling::need_filter(int sample) const -{ - if (sample > min_samples) { - return (sample & (adaptive_step - 1)) == (adaptive_step - 1); - } - else { - return false; - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h deleted file mode 100644 index 3f7cf47b692..00000000000 --- a/intern/cycles/device/device_task.h +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_TASK_H__ -#define __DEVICE_TASK_H__ - -#include "device/device_memory.h" - -#include "util/util_function.h" -#include "util/util_list.h" - -CCL_NAMESPACE_BEGIN - -/* Device Task */ - -class Device; -class RenderBuffers; -class RenderTile; -class RenderTileNeighbors; -class Tile; - -enum DenoiserType { - DENOISER_NLM = 1, - DENOISER_OPTIX = 2, - DENOISER_OPENIMAGEDENOISE = 4, - DENOISER_NUM, - - DENOISER_NONE = 0, - DENOISER_ALL = ~0, -}; - -enum DenoiserInput { - DENOISER_INPUT_RGB = 1, - DENOISER_INPUT_RGB_ALBEDO = 2, - DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3, - - DENOISER_INPUT_NUM, -}; - -typedef int DenoiserTypeMask; - -class DenoiseParams { - public: - /* Apply denoiser to image. */ - bool use; - /* Output denoising data passes (possibly without applying the denoiser). */ - bool store_passes; - - /* Denoiser type. 
*/ - DenoiserType type; - - /* Viewport start sample. */ - int start_sample; - - /** Native Denoiser. */ - - /* Pixel radius for neighboring pixels to take into account. */ - int radius; - /* Controls neighbor pixel weighting for the denoising filter. */ - float strength; - /* Preserve more or less detail based on feature passes. */ - float feature_strength; - /* When removing pixels that don't carry information, - * use a relative threshold instead of an absolute one. */ - bool relative_pca; - /* How many frames before and after the current center frame are included. */ - int neighbor_frames; - /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */ - bool clamp_input; - - /** OIDN/Optix Denoiser. */ - - /* Passes handed over to the OIDN/OptiX denoiser (default to color + albedo). */ - DenoiserInput input_passes; - - DenoiseParams() - { - use = false; - store_passes = false; - - type = DENOISER_NLM; - - radius = 8; - strength = 0.5f; - feature_strength = 0.5f; - relative_pca = false; - neighbor_frames = 2; - clamp_input = true; - - /* Default to color + albedo only, since normal input does not always have the desired effect - * when denoising with OptiX. */ - input_passes = DENOISER_INPUT_RGB_ALBEDO; - - start_sample = 0; - } - - /* Test if a denoising task needs to run, also to prefilter passes for the native - * denoiser when we are not applying denoising to the combined image. 
*/ - bool need_denoising_task() const - { - return (use || (store_passes && type == DENOISER_NLM)); - } -}; - -class AdaptiveSampling { - public: - AdaptiveSampling(); - - int align_samples(int sample, int num_samples) const; - bool need_filter(int sample) const; - - bool use; - int adaptive_step; - int min_samples; -}; - -class DeviceTask { - public: - typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type; - Type type; - - int x, y, w, h; - device_ptr rgba_byte; - device_ptr rgba_half; - device_ptr buffer; - int sample; - int num_samples; - int offset, stride; - - device_ptr shader_input; - device_ptr shader_output; - int shader_eval_type; - int shader_filter; - int shader_x, shader_w; - - RenderBuffers *buffers; - - explicit DeviceTask(Type type = RENDER); - - int get_subtask_count(int num, int max_size = 0) const; - void split(list<DeviceTask> &tasks, int num, int max_size = 0) const; - - void update_progress(RenderTile *rtile, int pixel_samples = -1); - - function<bool(Device *device, RenderTile &, uint)> acquire_tile; - function<void(long, int)> update_progress_sample; - function<void(RenderTile &)> update_tile_sample; - function<void(RenderTile &)> release_tile; - function<bool()> get_cancel; - function<bool()> get_tile_stolen; - function<void(RenderTileNeighbors &, Device *)> map_neighbor_tiles; - function<void(RenderTileNeighbors &, Device *)> unmap_neighbor_tiles; - - uint tile_types; - DenoiseParams denoising; - bool denoising_from_render; - vector<int> denoising_frames; - - int pass_stride; - int frame_stride; - int target_pass_stride; - int pass_denoising_data; - int pass_denoising_clean; - - bool need_finish_queue; - bool integrator_branched; - AdaptiveSampling adaptive_sampling; - - protected: - double last_update_time; -}; - -CCL_NAMESPACE_END - -#endif /* __DEVICE_TASK_H__ */ diff --git a/intern/cycles/device/device_dummy.cpp b/intern/cycles/device/dummy/device.cpp index 5112fc152e5..678276ed025 100644 --- 
a/intern/cycles/device/device_dummy.cpp +++ b/intern/cycles/device/dummy/device.cpp @@ -14,8 +14,10 @@ * limitations under the License. */ +#include "device/dummy/device.h" + #include "device/device.h" -#include "device/device_intern.h" +#include "device/device_queue.h" CCL_NAMESPACE_BEGIN @@ -23,8 +25,8 @@ CCL_NAMESPACE_BEGIN class DummyDevice : public Device { public: - DummyDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) - : Device(info_, stats_, profiler_, background_) + DummyDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) + : Device(info_, stats_, profiler_) { error_msg = info.error_msg; } @@ -61,23 +63,11 @@ class DummyDevice : public Device { virtual void const_copy_to(const char *, void *, size_t) override { } - - virtual void task_add(DeviceTask &) override - { - } - - virtual void task_wait() override - { - } - - virtual void task_cancel() override - { - } }; -Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) +Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) { - return new DummyDevice(info, stats, profiler, background); + return new DummyDevice(info, stats, profiler); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/dummy/device.h b/intern/cycles/device/dummy/device.h new file mode 100644 index 00000000000..832a9568129 --- /dev/null +++ b/intern/cycles/device/dummy/device.h @@ -0,0 +1,31 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp new file mode 100644 index 00000000000..6dbcce2d9a5 --- /dev/null +++ b/intern/cycles/device/multi/device.cpp @@ -0,0 +1,423 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/multi/device.h" + +#include <sstream> +#include <stdlib.h> + +#include "bvh/bvh_multi.h" + +#include "device/device.h" +#include "device/device_queue.h" + +#include "render/buffers.h" +#include "render/geometry.h" + +#include "util/util_foreach.h" +#include "util/util_list.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_time.h" + +CCL_NAMESPACE_BEGIN + +class MultiDevice : public Device { + public: + struct SubDevice { + Stats stats; + Device *device; + map<device_ptr, device_ptr> ptr_map; + int peer_island_index = -1; + }; + + list<SubDevice> devices; + device_ptr unique_key; + vector<vector<SubDevice *>> peer_islands; + + MultiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : Device(info, stats, profiler), unique_key(1) + { + foreach (const DeviceInfo &subinfo, info.multi_devices) { + /* Always add CPU devices at the back since GPU devices can change + * host memory pointers, which CPU uses as device pointer. */ + SubDevice *sub; + if (subinfo.type == DEVICE_CPU) { + devices.emplace_back(); + sub = &devices.back(); + } + else { + devices.emplace_front(); + sub = &devices.front(); + } + + /* The pointer to 'sub->stats' will stay valid even after new devices + * are added, since 'devices' is a linked list. 
*/ + sub->device = Device::create(subinfo, sub->stats, profiler); + } + + /* Build a list of peer islands for the available render devices */ + foreach (SubDevice &sub, devices) { + /* First ensure that every device is in at least once peer island */ + if (sub.peer_island_index < 0) { + peer_islands.emplace_back(); + sub.peer_island_index = (int)peer_islands.size() - 1; + peer_islands[sub.peer_island_index].push_back(&sub); + } + + if (!info.has_peer_memory) { + continue; + } + + /* Second check peer access between devices and fill up the islands accordingly */ + foreach (SubDevice &peer_sub, devices) { + if (peer_sub.peer_island_index < 0 && + peer_sub.device->info.type == sub.device->info.type && + peer_sub.device->check_peer_access(sub.device)) { + peer_sub.peer_island_index = sub.peer_island_index; + peer_islands[sub.peer_island_index].push_back(&peer_sub); + } + } + } + } + + ~MultiDevice() + { + foreach (SubDevice &sub, devices) + delete sub.device; + } + + const string &error_message() override + { + error_msg.clear(); + + foreach (SubDevice &sub, devices) + error_msg += sub.device->error_message(); + + return error_msg; + } + + virtual bool show_samples() const override + { + if (devices.size() > 1) { + return false; + } + return devices.front().device->show_samples(); + } + + virtual BVHLayoutMask get_bvh_layout_mask() const override + { + BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL; + BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE; + foreach (const SubDevice &sub_device, devices) { + BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask(); + bvh_layout_mask &= device_bvh_layout_mask; + bvh_layout_mask_all |= device_bvh_layout_mask; + } + + /* With multiple OptiX devices, every device needs its own acceleration structure */ + if (bvh_layout_mask == BVH_LAYOUT_OPTIX) { + return BVH_LAYOUT_MULTI_OPTIX; + } + + /* When devices do not share a common BVH layout, fall back to creating one for each */ + const BVHLayoutMask 
BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE); + if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) { + return BVH_LAYOUT_MULTI_OPTIX_EMBREE; + } + + return bvh_layout_mask; + } + + bool load_kernels(const uint kernel_features) override + { + foreach (SubDevice &sub, devices) + if (!sub.device->load_kernels(kernel_features)) + return false; + + return true; + } + + void build_bvh(BVH *bvh, Progress &progress, bool refit) override + { + /* Try to build and share a single acceleration structure, if possible */ + if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) { + devices.back().device->build_bvh(bvh, progress, refit); + return; + } + + assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX || + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE); + + BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh); + bvh_multi->sub_bvhs.resize(devices.size()); + + vector<BVHMulti *> geom_bvhs; + geom_bvhs.reserve(bvh->geometry.size()); + foreach (Geometry *geom, bvh->geometry) { + geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh)); + } + + /* Broadcast acceleration structure build to all render devices */ + size_t i = 0; + foreach (SubDevice &sub, devices) { + /* Change geometry BVH pointers to the sub BVH */ + for (size_t k = 0; k < bvh->geometry.size(); ++k) { + bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i]; + } + + if (!bvh_multi->sub_bvhs[i]) { + BVHParams params = bvh->params; + if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX) + params.bvh_layout = BVH_LAYOUT_OPTIX; + else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) + params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? 
BVH_LAYOUT_OPTIX : + BVH_LAYOUT_EMBREE; + + /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree + * (since they are put into the top level directly, see bvh_embree.cpp) */ + if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE && + !bvh->geometry[0]->is_instanced()) { + i++; + continue; + } + + bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device); + } + + sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit); + i++; + } + + /* Change geometry BVH pointers back to the multi BVH. */ + for (size_t k = 0; k < bvh->geometry.size(); ++k) { + bvh->geometry[k]->bvh = geom_bvhs[k]; + } + } + + virtual void *get_cpu_osl_memory() override + { + if (devices.size() > 1) { + return NULL; + } + return devices.front().device->get_cpu_osl_memory(); + } + + bool is_resident(device_ptr key, Device *sub_device) override + { + foreach (SubDevice &sub, devices) { + if (sub.device == sub_device) { + return find_matching_mem_device(key, sub)->device == sub_device; + } + } + return false; + } + + SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub) + { + assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end())); + + /* Get the memory owner of this key (first try current device, then peer devices) */ + SubDevice *owner_sub = ⊂ + if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) { + foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) { + if (island_sub != owner_sub && + island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) { + owner_sub = island_sub; + } + } + } + return owner_sub; + } + + SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island) + { + assert(!island.empty()); + + /* Get the memory owner of this key or the device with the lowest memory usage when new */ + SubDevice *owner_sub = island.front(); + foreach (SubDevice *island_sub, island) { + if (key ? 
(island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) : + (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) { + owner_sub = island_sub; + } + } + return owner_sub; + } + + inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub) + { + return find_matching_mem_device(key, sub)->ptr_map[key]; + } + + void mem_alloc(device_memory &mem) override + { + device_ptr key = unique_key++; + + assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE || mem.type == MEM_DEVICE_ONLY); + /* The remaining memory types can be distributed across devices */ + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(key, island); + mem.device = owner_sub->device; + mem.device_pointer = 0; + mem.device_size = 0; + + owner_sub->device->mem_alloc(mem); + owner_sub->ptr_map[key] = mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size); + } + + void mem_copy_to(device_memory &mem) override + { + device_ptr existing_key = mem.device_pointer; + device_ptr key = (existing_key) ? existing_key : unique_key++; + size_t existing_size = mem.device_size; + + /* The tile buffers are allocated on each device (see below), so copy to all of them */ + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); + mem.device = owner_sub->device; + mem.device_pointer = (existing_key) ? 
owner_sub->ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + owner_sub->device->mem_copy_to(mem); + owner_sub->ptr_map[key] = mem.device_pointer; + + if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) { + /* Need to create texture objects and update pointer in kernel globals on all devices */ + foreach (SubDevice *island_sub, island) { + if (island_sub != owner_sub) { + island_sub->device->mem_copy_to(mem); + } + } + } + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size - existing_size); + } + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override + { + device_ptr key = mem.device_pointer; + int i = 0, sub_h = h / devices.size(); + + foreach (SubDevice &sub, devices) { + int sy = y + i * sub_h; + int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; + + SubDevice *owner_sub = find_matching_mem_device(key, sub); + mem.device = owner_sub->device; + mem.device_pointer = owner_sub->ptr_map[key]; + + owner_sub->device->mem_copy_from(mem, sy, w, sh, elem); + i++; + } + + mem.device = this; + mem.device_pointer = key; + } + + void mem_zero(device_memory &mem) override + { + device_ptr existing_key = mem.device_pointer; + device_ptr key = (existing_key) ? existing_key : unique_key++; + size_t existing_size = mem.device_size; + + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); + mem.device = owner_sub->device; + mem.device_pointer = (existing_key) ? 
owner_sub->ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + owner_sub->device->mem_zero(mem); + owner_sub->ptr_map[key] = mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size - existing_size); + } + + void mem_free(device_memory &mem) override + { + device_ptr key = mem.device_pointer; + size_t existing_size = mem.device_size; + + /* Free memory that was allocated for all devices (see above) on each device */ + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_matching_mem_device(key, *island.front()); + mem.device = owner_sub->device; + mem.device_pointer = owner_sub->ptr_map[key]; + mem.device_size = existing_size; + + owner_sub->device->mem_free(mem); + owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key)); + + if (mem.type == MEM_TEXTURE) { + /* Free texture objects on all devices */ + foreach (SubDevice *island_sub, island) { + if (island_sub != owner_sub) { + island_sub->device->mem_free(mem); + } + } + } + } + + mem.device = this; + mem.device_pointer = 0; + mem.device_size = 0; + stats.mem_free(existing_size); + } + + void const_copy_to(const char *name, void *host, size_t size) override + { + foreach (SubDevice &sub, devices) + sub.device->const_copy_to(name, host, size); + } + + int device_number(Device *sub_device) override + { + int i = 0; + + foreach (SubDevice &sub, devices) { + if (sub.device == sub_device) + return i; + i++; + } + + return -1; + } + + virtual void foreach_device(const function<void(Device *)> &callback) override + { + foreach (SubDevice &sub, devices) { + sub.device->foreach_device(callback); + } + } +}; + +Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ + return new MultiDevice(info, stats, profiler); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/multi/device.h b/intern/cycles/device/multi/device.h new file mode 100644 index 00000000000..6e121014a1f --- /dev/null +++ 
b/intern/cycles/device/multi/device.h @@ -0,0 +1,31 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h deleted file mode 100644 index a65e764b0d4..00000000000 --- a/intern/cycles/device/opencl/device_opencl.h +++ /dev/null @@ -1,658 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/device.h" -# include "device/device_denoising.h" -# include "device/device_split_kernel.h" - -# include "util/util_map.h" -# include "util/util_param.h" -# include "util/util_string.h" -# include "util/util_task.h" - -# include "clew.h" - -# include "device/opencl/memory_manager.h" - -CCL_NAMESPACE_BEGIN - -/* Disable workarounds, seems to be working fine on latest drivers. */ -# define CYCLES_DISABLE_DRIVER_WORKAROUNDS - -/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing. */ -# ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS -/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */ -# undef clEnqueueNDRangeKernel -# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); - -# undef clEnqueueWriteBuffer -# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); - -# undef clEnqueueReadBuffer -# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); -# endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ - -# define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) - -struct OpenCLPlatformDevice { - OpenCLPlatformDevice(cl_platform_id platform_id, - const string &platform_name, - cl_device_id device_id, - cl_device_type device_type, - const string &device_name, - const string &hardware_id, - const string &device_extensions) - : platform_id(platform_id), - platform_name(platform_name), - device_id(device_id), - device_type(device_type), - device_name(device_name), - hardware_id(hardware_id), - device_extensions(device_extensions) - { - } - cl_platform_id platform_id; - string platform_name; - cl_device_id device_id; - cl_device_type device_type; - string device_name; - string hardware_id; - string 
device_extensions; -}; - -/* Contains all static OpenCL helper functions. */ -class OpenCLInfo { - public: - static cl_device_type device_type(); - static bool use_debug(); - static bool device_supported(const string &platform_name, const cl_device_id device_id); - static bool platform_version_check(cl_platform_id platform, string *error = NULL); - static bool device_version_check(cl_device_id device, string *error = NULL); - static bool get_device_version(cl_device_id device, - int *r_major, - int *r_minor, - string *error = NULL); - static string get_hardware_id(const string &platform_name, cl_device_id device_id); - static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices); - - /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */ - - /* Platform information. */ - static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL); - static cl_uint get_num_platforms(); - - static bool get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error = NULL); - static vector<cl_platform_id> get_platforms(); - - static bool get_platform_name(cl_platform_id platform_id, string *platform_name); - static string get_platform_name(cl_platform_id platform_id); - - static bool get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - cl_uint *num_devices, - cl_int *error = NULL); - static cl_uint get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type); - - static bool get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - vector<cl_device_id> *device_ids, - cl_int *error = NULL); - static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type); - - /* Device information. 
*/ - static bool get_device_name(cl_device_id device_id, string *device_name, cl_int *error = NULL); - - static string get_device_name(cl_device_id device_id); - - static bool get_device_extensions(cl_device_id device_id, - string *device_extensions, - cl_int *error = NULL); - - static string get_device_extensions(cl_device_id device_id); - - static bool get_device_type(cl_device_id device_id, - cl_device_type *device_type, - cl_int *error = NULL); - static cl_device_type get_device_type(cl_device_id device_id); - - static bool get_driver_version(cl_device_id device_id, - int *major, - int *minor, - cl_int *error = NULL); - - static int mem_sub_ptr_alignment(cl_device_id device_id); - - /* Get somewhat more readable device name. - * Main difference is AMD OpenCL here which only gives code name - * for the regular device name. This will give more sane device - * name using some extensions. - */ - static string get_readable_device_name(cl_device_id device_id); -}; - -/* Thread safe cache for contexts and programs. - */ -class OpenCLCache { - struct Slot { - struct ProgramEntry { - ProgramEntry(); - ProgramEntry(const ProgramEntry &rhs); - ~ProgramEntry(); - cl_program program; - thread_mutex *mutex; - }; - - Slot(); - Slot(const Slot &rhs); - ~Slot(); - - thread_mutex *context_mutex; - cl_context context; - typedef map<ustring, ProgramEntry> EntryMap; - EntryMap programs; - }; - - /* key is combination of platform ID and device ID */ - typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair; - - /* map of Slot objects */ - typedef map<PlatformDevicePair, Slot> CacheMap; - CacheMap cache; - - /* MD5 hash of the kernel source. */ - string kernel_md5; - - thread_mutex cache_lock; - thread_mutex kernel_md5_lock; - - /* lazy instantiate */ - static OpenCLCache &global_instance(); - - public: - enum ProgramName { - OCL_DEV_BASE_PROGRAM, - OCL_DEV_MEGAKERNEL_PROGRAM, - }; - - /* Lookup context in the cache. 
If this returns NULL, slot_locker - * will be holding a lock for the cache. slot_locker should refer to a - * default constructed thread_scoped_lock. */ - static cl_context get_context(cl_platform_id platform, - cl_device_id device, - thread_scoped_lock &slot_locker); - /* Same as above. */ - static cl_program get_program(cl_platform_id platform, - cl_device_id device, - ustring key, - thread_scoped_lock &slot_locker); - - /* Store context in the cache. You MUST have tried to get the item before storing to it. */ - static void store_context(cl_platform_id platform, - cl_device_id device, - cl_context context, - thread_scoped_lock &slot_locker); - /* Same as above. */ - static void store_program(cl_platform_id platform, - cl_device_id device, - cl_program program, - ustring key, - thread_scoped_lock &slot_locker); - - static string get_kernel_md5(); -}; - -# define opencl_device_assert(device, stmt) \ - { \ - cl_int err = stmt; \ -\ - if (err != CL_SUCCESS) { \ - string message = string_printf( \ - "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ - if ((device)->error_message() == "") { \ - (device)->set_error(message); \ - } \ - fprintf(stderr, "%s\n", message.c_str()); \ - } \ - } \ - (void)0 - -# define opencl_assert(stmt) \ - { \ - cl_int err = stmt; \ -\ - if (err != CL_SUCCESS) { \ - string message = string_printf( \ - "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ - if (error_msg == "") { \ - error_msg = message; \ - } \ - fprintf(stderr, "%s\n", message.c_str()); \ - } \ - } \ - (void)0 - -class OpenCLDevice : public Device { - public: - DedicatedTaskPool task_pool; - - /* Task pool for required kernels (base, AO kernels during foreground rendering) */ - TaskPool load_required_kernel_task_pool; - /* Task pool for optional kernels (feature kernels during foreground rendering) */ - TaskPool load_kernel_task_pool; - std::atomic<int> load_kernel_num_compiling; - - cl_context cxContext; - 
cl_command_queue cqCommandQueue; - cl_platform_id cpPlatform; - cl_device_id cdDevice; - cl_int ciErr; - int device_num; - - class OpenCLProgram { - public: - OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL) - { - } - OpenCLProgram(OpenCLDevice *device, - const string &program_name, - const string &kernel_name, - const string &kernel_build_options, - bool use_stdout = true); - ~OpenCLProgram(); - - void add_kernel(ustring name); - - /* Try to load the program from device cache or disk */ - bool load(); - /* Compile the kernel (first separate, fail-back to local). */ - void compile(); - /* Create the OpenCL kernels after loading or compiling */ - void create_kernels(); - - bool is_loaded() const - { - return loaded; - } - const string &get_log() const - { - return log; - } - void report_error(); - - /* Wait until this kernel is available to be used - * It will return true when the kernel is available. - * It will return false when the kernel is not available - * or could not be loaded. */ - bool wait_for_availability(); - - cl_kernel operator()(); - cl_kernel operator()(ustring name); - - void release(); - - private: - bool build_kernel(const string *debug_src); - /* Build the program by calling the own process. - * This is required for multithreaded OpenCL compilation, since most Frameworks serialize - * build calls internally if they come from the same process. - * If that is not supported, this function just returns false. - */ - bool compile_separate(const string &clbin); - /* Build the program by calling OpenCL directly. */ - bool compile_kernel(const string *debug_src); - /* Loading and saving the program from/to disk. 
*/ - bool load_binary(const string &clbin, const string *debug_src = NULL); - bool save_binary(const string &clbin); - - void add_log(const string &msg, bool is_debug); - void add_error(const string &msg); - - bool loaded; - bool needs_compiling; - - cl_program program; - OpenCLDevice *device; - - /* Used for the OpenCLCache key. */ - string program_name; - - string kernel_file, kernel_build_options, device_md5; - - bool use_stdout; - string log, error_msg; - string compile_output; - - map<ustring, cl_kernel> kernels; - }; - - /* Container for all types of split programs. */ - class OpenCLSplitPrograms { - public: - OpenCLDevice *device; - OpenCLProgram program_split; - OpenCLProgram program_lamp_emission; - OpenCLProgram program_do_volume; - OpenCLProgram program_indirect_background; - OpenCLProgram program_shader_eval; - OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; - OpenCLProgram program_subsurface_scatter; - OpenCLProgram program_direct_lighting; - OpenCLProgram program_shadow_blocked_ao; - OpenCLProgram program_shadow_blocked_dl; - - OpenCLSplitPrograms(OpenCLDevice *device); - ~OpenCLSplitPrograms(); - - /* Load the kernels and put the created kernels in the given - * `programs` parameter. 
*/ - void load_kernels(vector<OpenCLProgram *> &programs, - const DeviceRequestedFeatures &requested_features); - }; - - DeviceSplitKernel *split_kernel; - - OpenCLProgram base_program; - OpenCLProgram bake_program; - OpenCLProgram displace_program; - OpenCLProgram background_program; - OpenCLProgram denoising_program; - - OpenCLSplitPrograms kernel_programs; - - typedef map<string, device_vector<uchar> *> ConstMemMap; - typedef map<string, device_ptr> MemMap; - - ConstMemMap const_mem_map; - MemMap mem_map; - - bool device_initialized; - string platform_name; - string device_name; - - bool opencl_error(cl_int err); - void opencl_error(const string &message); - void opencl_assert_err(cl_int err, const char *where); - - OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); - ~OpenCLDevice(); - - static void CL_CALLBACK context_notify_callback(const char *err_info, - const void * /*private_info*/, - size_t /*cb*/, - void *user_data); - - bool opencl_version_check(); - OpenCLSplitPrograms *get_split_programs(); - - string device_md5_hash(string kernel_custom_build_options = ""); - bool load_kernels(const DeviceRequestedFeatures &requested_features); - void load_required_kernels(const DeviceRequestedFeatures &requested_features); - - bool wait_for_availability(const DeviceRequestedFeatures &requested_features); - DeviceKernelStatus get_active_kernel_switch_state(); - - /* Get the name of the opencl program for the given kernel */ - const string get_opencl_program_name(const string &kernel_name); - /* Get the program file name to compile (*.cl) for the given kernel */ - const string get_opencl_program_filename(const string &kernel_name); - string get_build_options(const DeviceRequestedFeatures &requested_features, - const string &opencl_program_name); - /* Enable the default features to reduce recompilation events */ - void enable_default_features(DeviceRequestedFeatures &features); - - void mem_alloc(device_memory &mem); - void 
mem_copy_to(device_memory &mem); - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem); - void mem_zero(device_memory &mem); - void mem_free(device_memory &mem); - - int mem_sub_ptr_alignment(); - - void const_copy_to(const char *name, void *host, size_t size); - void global_alloc(device_memory &mem); - void global_free(device_memory &mem); - void tex_alloc(device_texture &mem); - void tex_free(device_texture &mem); - - size_t global_size_round_up(int group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, - size_t w, - size_t h, - bool x_workgroups = false, - size_t max_workgroup_size = -1); - void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); - void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); - - void film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half); - void shader(DeviceTask &task); - void update_adaptive(DeviceTask &task, RenderTile &tile, int sample); - void bake(DeviceTask &task, RenderTile &tile); - - void denoise(RenderTile &tile, DenoisingTask &denoising); - - int get_split_task_count(DeviceTask & /*task*/) - { - return 1; - } - - void task_add(DeviceTask &task) - { - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy); - }); - } - - void task_wait() - { - task_pool.wait(); - } - - void task_cancel() - { - task_pool.cancel(); - } - - void thread_run(DeviceTask &task); - - virtual BVHLayoutMask get_bvh_layout_mask() const - { - return BVH_LAYOUT_BVH2; - } - - virtual bool show_samples() const - { - return true; - } - - protected: - string kernel_build_options(const string *debug_src = NULL); - - void mem_zero_kernel(device_ptr ptr, size_t size); - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task); - bool denoising_construct_transform(DenoisingTask *task); - bool denoising_accumulate(device_ptr color_ptr, - device_ptr 
color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task); - bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task); - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task); - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task); - bool denoising_write_feature(int to_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task); - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task); - - device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int size); - void mem_free_sub_ptr(device_ptr ptr); - - class ArgumentWrapper { - public: - ArgumentWrapper() : size(0), pointer(NULL) - { - } - - ArgumentWrapper(device_memory &argument) - : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) - { - } - - template<typename T> - ArgumentWrapper(device_vector<T> &argument) - : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) - { - } - - template<typename T> - ArgumentWrapper(device_only_memory<T> &argument) - : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) - { - } - template<typename T> ArgumentWrapper(T &argument) : size(sizeof(argument)), pointer(&argument) - { - } - - ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), pointer(&int_value) - { - } - - ArgumentWrapper(float argument) - : size(sizeof(float)), float_value(argument), pointer(&float_value) - { - } - - size_t size; - int int_value; - float float_value; - void *pointer; - }; - - /* TODO(sergey): In the 
future we can use variadic templates, once - * C++0x is allowed. Should allow to clean this up a bit. - */ - int kernel_set_args(cl_kernel kernel, - int start_argument_index, - const ArgumentWrapper &arg1 = ArgumentWrapper(), - const ArgumentWrapper &arg2 = ArgumentWrapper(), - const ArgumentWrapper &arg3 = ArgumentWrapper(), - const ArgumentWrapper &arg4 = ArgumentWrapper(), - const ArgumentWrapper &arg5 = ArgumentWrapper(), - const ArgumentWrapper &arg6 = ArgumentWrapper(), - const ArgumentWrapper &arg7 = ArgumentWrapper(), - const ArgumentWrapper &arg8 = ArgumentWrapper(), - const ArgumentWrapper &arg9 = ArgumentWrapper(), - const ArgumentWrapper &arg10 = ArgumentWrapper(), - const ArgumentWrapper &arg11 = ArgumentWrapper(), - const ArgumentWrapper &arg12 = ArgumentWrapper(), - const ArgumentWrapper &arg13 = ArgumentWrapper(), - const ArgumentWrapper &arg14 = ArgumentWrapper(), - const ArgumentWrapper &arg15 = ArgumentWrapper(), - const ArgumentWrapper &arg16 = ArgumentWrapper(), - const ArgumentWrapper &arg17 = ArgumentWrapper(), - const ArgumentWrapper &arg18 = ArgumentWrapper(), - const ArgumentWrapper &arg19 = ArgumentWrapper(), - const ArgumentWrapper &arg20 = ArgumentWrapper(), - const ArgumentWrapper &arg21 = ArgumentWrapper(), - const ArgumentWrapper &arg22 = ArgumentWrapper(), - const ArgumentWrapper &arg23 = ArgumentWrapper(), - const ArgumentWrapper &arg24 = ArgumentWrapper(), - const ArgumentWrapper &arg25 = ArgumentWrapper(), - const ArgumentWrapper &arg26 = ArgumentWrapper(), - const ArgumentWrapper &arg27 = ArgumentWrapper(), - const ArgumentWrapper &arg28 = ArgumentWrapper(), - const ArgumentWrapper &arg29 = ArgumentWrapper(), - const ArgumentWrapper &arg30 = ArgumentWrapper(), - const ArgumentWrapper &arg31 = ArgumentWrapper(), - const ArgumentWrapper &arg32 = ArgumentWrapper(), - const ArgumentWrapper &arg33 = ArgumentWrapper()); - - void release_kernel_safe(cl_kernel kernel); - void release_mem_object_safe(cl_mem mem); - void 
release_program_safe(cl_program program); - - /* ** Those guys are for working around some compiler-specific bugs ** */ - - cl_program load_cached_kernel(ustring key, thread_scoped_lock &cache_locker); - - void store_cached_kernel(cl_program program, ustring key, thread_scoped_lock &cache_locker); - - private: - MemoryManager memory_manager; - friend class MemoryManager; - - static_assert_align(TextureInfo, 16); - device_vector<TextureInfo> texture_info; - - typedef map<string, device_memory *> TexturesMap; - TexturesMap textures; - - bool textures_need_update; - - protected: - void flush_texture_buffers(); - - friend class OpenCLSplitKernel; - friend class OpenCLSplitKernelFunction; -}; - -Device *opencl_create_split_device(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - bool background); - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp deleted file mode 100644 index 31a2265700c..00000000000 --- a/intern/cycles/device/opencl/device_opencl_impl.cpp +++ /dev/null @@ -1,2113 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/opencl/device_opencl.h" - -# include "kernel/kernel_types.h" -# include "kernel/split/kernel_split_data_types.h" - -# include "util/util_algorithm.h" -# include "util/util_debug.h" -# include "util/util_foreach.h" -# include "util/util_logging.h" -# include "util/util_md5.h" -# include "util/util_path.h" -# include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -struct texture_slot_t { - texture_slot_t(const string &name, int slot) : name(name), slot(slot) - { - } - string name; - int slot; -}; - -static const string NON_SPLIT_KERNELS = - "denoising " - "base " - "background " - "displace "; - -static const string SPLIT_BUNDLE_KERNELS = - "data_init " - "path_init " - "state_buffer_size " - "scene_intersect " - "queue_enqueue " - "shader_setup " - "shader_sort " - "enqueue_inactive " - "next_iteration_setup " - "indirect_subsurface " - "buffer_update " - "adaptive_stopping " - "adaptive_filter_x " - "adaptive_filter_y " - "adaptive_adjust_samples"; - -const string OpenCLDevice::get_opencl_program_name(const string &kernel_name) -{ - if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) { - return kernel_name; - } - else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { - return "split_bundle"; - } - else { - return "split_" + kernel_name; - } -} - -const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name) -{ - if (kernel_name == "denoising") { - return "filter.cl"; - } - else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { - return "kernel_split_bundle.cl"; - } - else { - return "kernel_" + kernel_name + ".cl"; - } -} - -/* Enable features that we always want to compile to reduce recompilation events */ -void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features) -{ - features.use_transparent = true; - features.use_shadow_tricks = true; - features.use_principled = true; - features.use_denoising = true; - - if (!background) { - 
features.max_nodes_group = NODE_GROUP_LEVEL_MAX; - features.nodes_features = NODE_FEATURE_ALL; - features.use_hair = true; - features.use_subsurface = true; - features.use_camera_motion = false; - features.use_object_motion = false; - } -} - -string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features, - const string &opencl_program_name) -{ - /* first check for non-split kernel programs */ - if (opencl_program_name == "base" || opencl_program_name == "denoising") { - return ""; - } - else if (opencl_program_name == "bake") { - /* Note: get_build_options for bake is only requested when baking is enabled. - * displace and background are always requested. - * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_denoising = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_hair = true; - features.use_subsurface = true; - features.max_nodes_group = NODE_GROUP_LEVEL_MAX; - features.nodes_features = NODE_FEATURE_ALL; - features.use_integrator_branched = false; - return features.get_build_options(); - } - else if (opencl_program_name == "displace") { - /* As displacement does not use any nodes from the Shading group (eg BSDF). - * We disable all features that are related to shading. 
*/ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_denoising = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_baking = false; - features.use_transparent = false; - features.use_shadow_tricks = false; - features.use_subsurface = false; - features.use_volume = false; - features.nodes_features &= ~NODE_FEATURE_VOLUME; - features.use_denoising = false; - features.use_principled = false; - features.use_integrator_branched = false; - return features.get_build_options(); - } - else if (opencl_program_name == "background") { - /* Background uses Background shading - * It is save to disable shadow features, subsurface and volumetric. */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_baking = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_transparent = false; - features.use_shadow_tricks = false; - features.use_denoising = false; - /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node. - * Perhaps we should remove them in UI as it does not make any sense when - * rendering background. */ - features.nodes_features &= ~NODE_FEATURE_VOLUME; - features.use_subsurface = false; - features.use_volume = false; - features.use_shader_raytrace = false; - features.use_patch_evaluation = false; - features.use_integrator_branched = false; - return features.get_build_options(); - } - - string build_options = "-D__SPLIT_KERNEL__ "; - /* Set compute device build option. 
*/ - cl_device_type device_type; - OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr); - assert(this->ciErr == CL_SUCCESS); - if (device_type == CL_DEVICE_TYPE_GPU) { - build_options += "-D__COMPUTE_DEVICE_GPU__ "; - } - - DeviceRequestedFeatures nofeatures; - enable_default_features(nofeatures); - - /* Add program specific optimized compile directives */ - if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) { - build_options += nofeatures.get_build_options(); - } - else { - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - - /* Always turn off baking at this point. Baking is only useful when building the bake kernel. - * this also makes sure that the kernels that are build during baking can be reused - * when not doing any baking. */ - features.use_baking = false; - - /* Do not vary on shaders when program doesn't do any shading. - * We have bundled them in a single program. */ - if (opencl_program_name == "split_bundle") { - features.max_nodes_group = 0; - features.nodes_features = 0; - features.use_shader_raytrace = false; - } - - /* No specific settings, just add the regular ones */ - build_options += features.get_build_options(); - } - - return build_options; -} - -OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_) -{ - device = device_; -} - -OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms() -{ - program_split.release(); - program_lamp_emission.release(); - program_do_volume.release(); - program_indirect_background.release(); - program_shader_eval.release(); - program_holdout_emission_blurring_pathtermination_ao.release(); - program_subsurface_scatter.release(); - program_direct_lighting.release(); - program_shadow_blocked_ao.release(); - program_shadow_blocked_dl.release(); -} - -void OpenCLDevice::OpenCLSplitPrograms::load_kernels( - vector<OpenCLProgram *> &programs, const DeviceRequestedFeatures &requested_features) -{ - if 
(!requested_features.use_baking) { -# define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \ - program_split.add_kernel(ustring("path_trace_" #kernel_name)); -# define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \ - const string program_name_##kernel_name = "split_" #kernel_name; \ - program_##kernel_name = OpenCLDevice::OpenCLProgram( \ - device, \ - program_name_##kernel_name, \ - "kernel_" #kernel_name ".cl", \ - device->get_build_options(requested_features, program_name_##kernel_name)); \ - program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \ - programs.push_back(&program_##kernel_name); - - /* Ordered with most complex kernels first, to reduce overall compile time. */ - ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter); - ADD_SPLIT_KERNEL_PROGRAM(direct_lighting); - ADD_SPLIT_KERNEL_PROGRAM(indirect_background); - if (requested_features.use_volume) { - ADD_SPLIT_KERNEL_PROGRAM(do_volume); - } - ADD_SPLIT_KERNEL_PROGRAM(shader_eval); - ADD_SPLIT_KERNEL_PROGRAM(lamp_emission); - ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao); - ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl); - ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao); - - /* Quick kernels bundled in a single program to reduce overhead of starting - * Blender processes. 
*/ - program_split = OpenCLDevice::OpenCLProgram( - device, - "split_bundle", - "kernel_split_bundle.cl", - device->get_build_options(requested_features, "split_bundle")); - - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples); - programs.push_back(&program_split); - -# undef ADD_SPLIT_KERNEL_PROGRAM -# undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM - } -} - -namespace { - -/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to - * fetch its size. 
- */ -typedef struct KernelGlobalsDummy { - ccl_constant KernelData *data; - ccl_global char *buffers[8]; - -# define KERNEL_TEX(type, name) TextureInfo name; -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX - SplitData split_data; - SplitParams split_param_data; -} KernelGlobalsDummy; - -} // namespace - -struct CachedSplitMemory { - int id; - device_memory *split_data; - device_memory *ray_state; - device_memory *queue_index; - device_memory *use_queues_flag; - device_memory *work_pools; - device_ptr *buffer; -}; - -class OpenCLSplitKernelFunction : public SplitKernelFunction { - public: - OpenCLDevice *device; - OpenCLDevice::OpenCLProgram program; - CachedSplitMemory &cached_memory; - int cached_id; - - OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory) - : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1) - { - } - - ~OpenCLSplitKernelFunction() - { - program.release(); - } - - virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) - { - if (cached_id != cached_memory.id) { - cl_uint start_arg_index = device->kernel_set_args( - program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state); - - device->set_kernel_arg_buffers(program(), &start_arg_index); - - start_arg_index += device->kernel_set_args(program(), - start_arg_index, - *cached_memory.queue_index, - *cached_memory.use_queues_flag, - *cached_memory.work_pools, - *cached_memory.buffer); - - cached_id = cached_memory.id; - } - - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - program(), - 2, - NULL, - dim.global_size, - dim.local_size, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - if (device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return false; - } - - return true; - } -}; - -class 
OpenCLSplitKernel : public DeviceSplitKernel { - OpenCLDevice *device; - CachedSplitMemory cached_memory; - - public: - explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device) - { - } - - virtual SplitKernelFunction *get_split_kernel_function( - const string &kernel_name, const DeviceRequestedFeatures &requested_features) - { - OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory); - - const string program_name = device->get_opencl_program_name(kernel_name); - kernel->program = OpenCLDevice::OpenCLProgram( - device, - program_name, - device->get_opencl_program_filename(kernel_name), - device->get_build_options(requested_features, program_name)); - - kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); - kernel->program.load(); - - if (!kernel->program.is_loaded()) { - delete kernel; - return NULL; - } - - return kernel; - } - - virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads) - { - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); - - uint threads = num_threads; - OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); - cl_kernel kernel_state_buffer_size = programs->program_split( - ustring("path_trace_state_buffer_size")); - device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer); - - size_t global_size = 64; - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - kernel_state_buffer_size, - 1, - NULL, - &global_size, - NULL, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); - - if (device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - 
device->opencl_error(message); - return 0; - } - - return size; - } - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs) - { - cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; - - /* Set the range of samples to be processed for every ray in - * path-regeneration logic. - */ - cl_int start_sample = rtile.start_sample; - cl_int end_sample = rtile.start_sample + rtile.num_samples; - - OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); - cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init")); - - cl_uint start_arg_index = device->kernel_set_args(kernel_data_init, - 0, - kernel_globals, - kernel_data, - split_data, - num_global_elements, - ray_state); - - device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index); - - start_arg_index += device->kernel_set_args(kernel_data_init, - start_arg_index, - start_sample, - end_sample, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - queue_index, - dQueue_size, - use_queues_flag, - work_pool_wgs, - rtile.num_samples, - rtile.buffer); - - /* Enqueue ckPathTraceKernel_data_init kernel. 
*/ - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - kernel_data_init, - 2, - NULL, - dim.global_size, - dim.local_size, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - if (device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return false; - } - - cached_memory.split_data = &split_data; - cached_memory.ray_state = &ray_state; - cached_memory.queue_index = &queue_index; - cached_memory.use_queues_flag = &use_queues_flag; - cached_memory.work_pools = &work_pool_wgs; - cached_memory.buffer = &rtile.buffer; - cached_memory.id++; - - return true; - } - - virtual int2 split_kernel_local_size() - { - return make_int2(64, 1); - } - - virtual int2 split_kernel_global_size(device_memory &kg, - device_memory &data, - DeviceTask & /*task*/) - { - cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); - /* Use small global size on CPU devices as it seems to be much faster. */ - if (type == CL_DEVICE_TYPE_CPU) { - VLOG(1) << "Global size: (64, 64)."; - return make_int2(64, 64); - } - - cl_ulong max_buffer_size; - clGetDeviceInfo( - device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - - if (DebugFlags().opencl.mem_limit) { - max_buffer_size = min(max_buffer_size, - cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); - } - - VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size) - << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; - - /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. 
*/ - max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024); - - size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size); - int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), - (int)sqrt(num_elements)); - - if (device->info.description.find("Intel") != string::npos) { - global_size = make_int2(min(512, global_size.x), min(512, global_size.y)); - } - - VLOG(1) << "Global size: " << global_size << "."; - return global_size; - } -}; - -bool OpenCLDevice::opencl_error(cl_int err) -{ - if (err != CL_SUCCESS) { - string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err)); - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - return true; - } - - return false; -} - -void OpenCLDevice::opencl_error(const string &message) -{ - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); -} - -void OpenCLDevice::opencl_assert_err(cl_int err, const char *where) -{ - if (err != CL_SUCCESS) { - string message = string_printf( - "OpenCL error (%d): %s in %s", err, clewErrorString(err), where); - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); -# ifndef NDEBUG - abort(); -# endif - } -} - -OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) - : Device(info, stats, profiler, background), - load_kernel_num_compiling(0), - kernel_programs(this), - memory_manager(this), - texture_info(this, "__texture_info", MEM_GLOBAL) -{ - cpPlatform = NULL; - cdDevice = NULL; - cxContext = NULL; - cqCommandQueue = NULL; - device_initialized = false; - textures_need_update = true; - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if (usable_devices.size() == 0) { - opencl_error("OpenCL: no devices found."); - return; - } - assert(info.num < usable_devices.size()); - OpenCLPlatformDevice 
&platform_device = usable_devices[info.num]; - device_num = info.num; - cpPlatform = platform_device.platform_id; - cdDevice = platform_device.device_id; - platform_name = platform_device.platform_name; - device_name = platform_device.device_name; - VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device " - << device_name << "."; - - { - /* try to use cached context */ - thread_scoped_lock cache_locker; - cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker); - - if (cxContext == NULL) { - /* create context properties array to specify platform */ - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0}; - - /* create context */ - cxContext = clCreateContext( - context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr); - - if (opencl_error(ciErr)) { - opencl_error("OpenCL: clCreateContext failed"); - return; - } - - /* cache it */ - OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker); - } - } - - cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); - if (opencl_error(ciErr)) { - opencl_error("OpenCL: Error creating command queue"); - return; - } - - /* Allocate this right away so that texture_info - * is placed at offset 0 in the device memory buffers. 
*/ - texture_info.resize(1); - memory_manager.alloc("texture_info", texture_info); - - device_initialized = true; - - split_kernel = new OpenCLSplitKernel(this); -} - -OpenCLDevice::~OpenCLDevice() -{ - task_pool.cancel(); - load_required_kernel_task_pool.cancel(); - load_kernel_task_pool.cancel(); - - memory_manager.free(); - - ConstMemMap::iterator mt; - for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { - delete mt->second; - } - - base_program.release(); - bake_program.release(); - displace_program.release(); - background_program.release(); - denoising_program.release(); - - if (cqCommandQueue) - clReleaseCommandQueue(cqCommandQueue); - if (cxContext) - clReleaseContext(cxContext); - - delete split_kernel; -} - -void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info, - const void * /*private_info*/, - size_t /*cb*/, - void *user_data) -{ - string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); - fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); -} - -bool OpenCLDevice::opencl_version_check() -{ - string error; - if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) { - opencl_error(error); - return false; - } - if (!OpenCLInfo::device_version_check(cdDevice, &error)) { - opencl_error(error); - return false; - } - return true; -} - -string OpenCLDevice::device_md5_hash(string kernel_custom_build_options) -{ - MD5Hash md5; - char version[256], driver[256], name[256], vendor[256]; - - clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL); - clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL); - - md5.append((uint8_t *)vendor, strlen(vendor)); - md5.append((uint8_t *)version, strlen(version)); - md5.append((uint8_t *)name, strlen(name)); - md5.append((uint8_t *)driver, 
strlen(driver)); - - string options = kernel_build_options(); - options += kernel_custom_build_options; - md5.append((uint8_t *)options.c_str(), options.size()); - - return md5.get_hex(); -} - -bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features) -{ - VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << "."; - /* Verify if device was initialized. */ - if (!device_initialized) { - fprintf(stderr, "OpenCL: failed to initialize device.\n"); - return false; - } - - /* Verify we have right opencl version. */ - if (!opencl_version_check()) - return false; - - load_required_kernels(requested_features); - - vector<OpenCLProgram *> programs; - kernel_programs.load_kernels(programs, requested_features); - - if (!requested_features.use_baking && requested_features.use_denoising) { - denoising_program = OpenCLProgram( - this, "denoising", "filter.cl", get_build_options(requested_features, "denoising")); - denoising_program.add_kernel(ustring("filter_divide_shadow")); - denoising_program.add_kernel(ustring("filter_get_feature")); - denoising_program.add_kernel(ustring("filter_write_feature")); - denoising_program.add_kernel(ustring("filter_detect_outliers")); - denoising_program.add_kernel(ustring("filter_combine_halves")); - denoising_program.add_kernel(ustring("filter_construct_transform")); - denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); - denoising_program.add_kernel(ustring("filter_nlm_blur")); - denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); - denoising_program.add_kernel(ustring("filter_nlm_update_output")); - denoising_program.add_kernel(ustring("filter_nlm_normalize")); - denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); - denoising_program.add_kernel(ustring("filter_finalize")); - programs.push_back(&denoising_program); - } - - load_required_kernel_task_pool.wait_work(); - - /* Parallel compilation of Cycles kernels, this launches 
multiple - * processes to workaround OpenCL frameworks serializing the calls - * internally within a single process. */ - foreach (OpenCLProgram *program, programs) { - if (!program->load()) { - load_kernel_num_compiling++; - load_kernel_task_pool.push([=] { - program->compile(); - load_kernel_num_compiling--; - }); - } - } - return true; -} - -void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features) -{ - vector<OpenCLProgram *> programs; - base_program = OpenCLProgram( - this, "base", "kernel_base.cl", get_build_options(requested_features, "base")); - base_program.add_kernel(ustring("convert_to_byte")); - base_program.add_kernel(ustring("convert_to_half_float")); - base_program.add_kernel(ustring("zero_buffer")); - programs.push_back(&base_program); - - if (requested_features.use_true_displacement) { - displace_program = OpenCLProgram( - this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace")); - displace_program.add_kernel(ustring("displace")); - programs.push_back(&displace_program); - } - - if (requested_features.use_background_light) { - background_program = OpenCLProgram(this, - "background", - "kernel_background.cl", - get_build_options(requested_features, "background")); - background_program.add_kernel(ustring("background")); - programs.push_back(&background_program); - } - - if (requested_features.use_baking) { - bake_program = OpenCLProgram( - this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake")); - bake_program.add_kernel(ustring("bake")); - programs.push_back(&bake_program); - } - - foreach (OpenCLProgram *program, programs) { - if (!program->load()) { - load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); - } - } -} - -bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features) -{ - if (requested_features.use_baking) { - /* For baking, kernels have already been loaded in load_required_kernels(). 
*/ - return true; - } - - load_kernel_task_pool.wait_work(); - return split_kernel->load_kernels(requested_features); -} - -OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs() -{ - return &kernel_programs; -} - -DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state() -{ - return DEVICE_KERNEL_USING_FEATURE_KERNEL; -} - -void OpenCLDevice::mem_alloc(device_memory &mem) -{ - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - size_t size = mem.memory_size(); - - /* check there is enough memory available for the allocation */ - cl_ulong max_alloc_size = 0; - clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); - - if (DebugFlags().opencl.mem_limit) { - max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); - } - - if (size > max_alloc_size) { - string error = "Scene too complex to fit in available memory."; - if (mem.name != NULL) { - error += string_printf(" (allocating buffer %s failed.)", mem.name); - } - set_error(error); - - return; - } - - cl_mem_flags mem_flag; - void *mem_ptr = NULL; - - if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - /* Zero-size allocation might be invoked by render, but not really - * supported by OpenCL. Using NULL as device pointer also doesn't really - * work for some reason, so for the time being we'll use special case - * will null_mem buffer. 
- */ - if (size != 0) { - mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer"); - } - else { - mem.device_pointer = 0; - } - - stats.mem_alloc(size); - mem.device_size = size; -} - -void OpenCLDevice::mem_copy_to(device_memory &mem) -{ - if (mem.type == MEM_GLOBAL) { - global_free(mem); - global_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - tex_alloc((device_texture &)mem); - } - else { - if (!mem.device_pointer) { - mem_alloc(mem); - } - - /* this is blocking */ - size_t size = mem.memory_size(); - if (size != 0) { - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - size, - mem.host_pointer, - 0, - NULL, - NULL)); - } - } -} - -void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) -{ - size_t offset = elem * y * w; - size_t size = elem * w * h; - assert(size != 0); - opencl_assert(clEnqueueReadBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - offset, - size, - (uchar *)mem.host_pointer + offset, - 0, - NULL, - NULL)); -} - -void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size) -{ - base_program.wait_for_availability(); - cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - - size_t global_size[] = {1024, 1024}; - size_t num_threads = global_size[0] * global_size[1]; - - cl_mem d_buffer = CL_MEM_PTR(mem); - cl_ulong d_offset = 0; - cl_ulong d_size = 0; - - while (d_offset < size) { - d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset); - - kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); - - ciErr = clEnqueueNDRangeKernel( - cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL); - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); - - d_offset += d_size; - } -} - -void OpenCLDevice::mem_zero(device_memory &mem) -{ - if (!mem.device_pointer) { - mem_alloc(mem); 
- } - - if (mem.device_pointer) { - if (base_program.is_loaded()) { - mem_zero_kernel(mem.device_pointer, mem.memory_size()); - } - - if (mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } - - if (!base_program.is_loaded()) { - void *zero = mem.host_pointer; - - if (!mem.host_pointer) { - zero = util_aligned_malloc(mem.memory_size(), 16); - memset(zero, 0, mem.memory_size()); - } - - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - mem.memory_size(), - zero, - 0, - NULL, - NULL)); - - if (!mem.host_pointer) { - util_aligned_free(zero); - } - } - } -} - -void OpenCLDevice::mem_free(device_memory &mem) -{ - if (mem.type == MEM_GLOBAL) { - global_free(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - } - else { - if (mem.device_pointer) { - if (mem.device_pointer != 0) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); - } - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } -} - -int OpenCLDevice::mem_sub_ptr_alignment() -{ - return OpenCLInfo::mem_sub_ptr_alignment(cdDevice); -} - -device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size) -{ - cl_mem_flags mem_flag; - if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - cl_buffer_region info; - info.origin = mem.memory_elements_size(offset); - info.size = mem.memory_elements_size(size); - - device_ptr sub_buf = (device_ptr)clCreateSubBuffer( - CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr); - opencl_assert_err(ciErr, "clCreateSubBuffer"); - return sub_buf; -} - -void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer) -{ - if (device_pointer != 0) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); - } -} - -void OpenCLDevice::const_copy_to(const char 
*name, void *host, size_t size) -{ - ConstMemMap::iterator i = const_mem_map.find(name); - device_vector<uchar> *data; - - if (i == const_mem_map.end()) { - data = new device_vector<uchar>(this, name, MEM_READ_ONLY); - data->alloc(size); - const_mem_map.insert(ConstMemMap::value_type(name, data)); - } - else { - data = i->second; - } - - memcpy(data->data(), host, size); - data->copy_to_device(); -} - -void OpenCLDevice::global_alloc(device_memory &mem) -{ - VLOG(1) << "Global memory allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - memory_manager.alloc(mem.name, mem); - /* Set the pointer to non-null to keep code that inspects its value from thinking its - * unallocated. */ - mem.device_pointer = 1; - textures[mem.name] = &mem; - textures_need_update = true; -} - -void OpenCLDevice::global_free(device_memory &mem) -{ - if (mem.device_pointer) { - mem.device_pointer = 0; - - if (memory_manager.free(mem)) { - textures_need_update = true; - } - - foreach (TexturesMap::value_type &value, textures) { - if (value.second == &mem) { - textures.erase(value.first); - break; - } - } - } -} - -void OpenCLDevice::tex_alloc(device_texture &mem) -{ - VLOG(1) << "Texture allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - memory_manager.alloc(mem.name, mem); - /* Set the pointer to non-null to keep code that inspects its value from thinking its - * unallocated. */ - mem.device_pointer = 1; - textures[mem.name] = &mem; - textures_need_update = true; -} - -void OpenCLDevice::tex_free(device_texture &mem) -{ - global_free(mem); -} - -size_t OpenCLDevice::global_size_round_up(int group_size, int global_size) -{ - int r = global_size % group_size; - return global_size + ((r == 0) ? 
0 : group_size - r); -} - -void OpenCLDevice::enqueue_kernel( - cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) -{ - size_t workgroup_size, max_work_items[3]; - - clGetKernelWorkGroupInfo( - kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); - clGetDeviceInfo( - cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL); - - if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { - workgroup_size = max_workgroup_size; - } - - /* Try to divide evenly over 2 dimensions. */ - size_t local_size[2]; - if (x_workgroups) { - local_size[0] = workgroup_size; - local_size[1] = 1; - } - else { - size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); - local_size[0] = local_size[1] = sqrt_workgroup_size; - } - - /* Some implementations have max size 1 on 2nd dimension. */ - if (local_size[1] > max_work_items[1]) { - local_size[0] = workgroup_size / max_work_items[1]; - local_size[1] = max_work_items[1]; - } - - size_t global_size[2] = {global_size_round_up(local_size[0], w), - global_size_round_up(local_size[1], h)}; - - /* Vertical size of 1 is coming from bake/shade kernels where we should - * not round anything up because otherwise we'll either be doing too - * much work per pixel (if we don't check global ID on Y axis) or will - * be checking for global ID to always have Y of 0. 
- */ - if (h == 1) { - global_size[h] = 1; - } - - /* run kernel */ - opencl_assert( - clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); - opencl_assert(clFlush(cqCommandQueue)); -} - -void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name) -{ - cl_mem ptr; - - MemMap::iterator i = mem_map.find(name); - if (i != mem_map.end()) { - ptr = CL_MEM_PTR(i->second); - } - else { - ptr = 0; - } - - opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr)); -} - -void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) -{ - flush_texture_buffers(); - - memory_manager.set_kernel_arg_buffers(kernel, narg); -} - -void OpenCLDevice::flush_texture_buffers() -{ - if (!textures_need_update) { - return; - } - textures_need_update = false; - - /* Setup slots for textures. */ - int num_slots = 0; - - vector<texture_slot_t> texture_slots; - -# define KERNEL_TEX(type, name) \ - if (textures.find(#name) != textures.end()) { \ - texture_slots.push_back(texture_slot_t(#name, num_slots)); \ - } \ - num_slots++; -# include "kernel/kernel_textures.h" - - int num_data_slots = num_slots; - - foreach (TexturesMap::value_type &tex, textures) { - string name = tex.first; - device_memory *mem = tex.second; - - if (mem->type == MEM_TEXTURE) { - const uint id = ((device_texture *)mem)->slot; - texture_slots.push_back(texture_slot_t(name, num_data_slots + id)); - num_slots = max(num_slots, num_data_slots + id + 1); - } - } - - /* Realloc texture descriptors buffer. 
*/ - memory_manager.free(texture_info); - texture_info.resize(num_slots); - memory_manager.alloc("texture_info", texture_info); - - /* Fill in descriptors */ - foreach (texture_slot_t &slot, texture_slots) { - device_memory *mem = textures[slot.name]; - TextureInfo &info = texture_info[slot.slot]; - - MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); - - if (mem->type == MEM_TEXTURE) { - info = ((device_texture *)mem)->info; - } - else { - memset(&info, 0, sizeof(TextureInfo)); - } - - info.data = desc.offset; - info.cl_buffer = desc.device_buffer; - } - - /* Force write of descriptors. */ - memory_manager.free(texture_info); - memory_manager.alloc("texture_info", texture_info); -} - -void OpenCLDevice::thread_run(DeviceTask &task) -{ - flush_texture_buffers(); - - if (task.type == DeviceTask::RENDER) { - RenderTile tile; - DenoisingTask denoising(this, task); - - /* Allocate buffer for kernel globals */ - device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); - kgbuffer.alloc_to_device(1); - - /* Keep rendering tiles until done. */ - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - assert(tile.task == RenderTile::PATH_TRACE); - scoped_timer timer(&tile.buffers->render_time); - - split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]); - - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. 
- */ - clFinish(cqCommandQueue); - } - else if (tile.task == RenderTile::BAKE) { - bake(task, tile); - } - else if (tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - denoise(tile, denoising); - task.update_progress(&tile, tile.w * tile.h); - } - - task.release_tile(tile); - } - - kgbuffer.free(); - } - else if (task.type == DeviceTask::SHADER) { - shader(task); - } - else if (task.type == DeviceTask::FILM_CONVERT) { - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - } - else if (task.type == DeviceTask::DENOISE_BUFFER) { - RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.sample = task.sample + task.num_samples; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - DenoisingTask denoising(this, task); - denoise(tile, denoising); - task.update_progress(&tile, tile.w * tile.h); - } -} - -void OpenCLDevice::film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half) -{ - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half); - cl_mem d_buffer = CL_MEM_PTR(buffer); - cl_int d_x = task.x; - cl_int d_y = task.y; - cl_int d_w = task.w; - cl_int d_h = task.h; - cl_float d_sample_scale = 1.0f / (task.sample + 1); - cl_int d_offset = task.offset; - cl_int d_stride = task.stride; - - cl_kernel ckFilmConvertKernel = (rgba_byte) ? 
base_program(ustring("convert_to_byte")) : - base_program(ustring("convert_to_half_float")); - - cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer); - - set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); - - start_arg_index += kernel_set_args(ckFilmConvertKernel, - start_arg_index, - d_sample_scale, - d_x, - d_y, - d_w, - d_h, - d_offset, - d_stride); - - enqueue_kernel(ckFilmConvertKernel, d_w, d_h); -} - -bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task) -{ - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0; - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts); - device_sub_ptr blurDifference( - task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts); - device_sub_ptr weightAccum( - task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride); - cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - cl_mem image_mem = CL_MEM_PTR(image_ptr); - cl_mem guide_mem = CL_MEM_PTR(guide_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem out_mem = CL_MEM_PTR(out_ptr); - cl_mem scale_mem = NULL; - - mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride); - mem_zero_kernel(out_ptr, sizeof(float) * pass_stride); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel 
ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); - cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); - - kernel_set_args(ckNLMCalcDifference, - 0, - guide_mem, - variance_mem, - scale_mem, - difference_mem, - w, - h, - stride, - pass_stride, - r, - channel_offset, - 0, - a, - k_2); - kernel_set_args( - ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f); - kernel_set_args( - ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f); - kernel_set_args(ckNLMUpdateOutput, - 0, - blurDifference_mem, - image_mem, - out_mem, - weightAccum_mem, - w, - h, - stride, - pass_stride, - channel_offset, - r, - f); - - enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true); - - kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride); - enqueue_kernel(ckNLMNormalize, w, h); - - return true; -} - -bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task) -{ - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - char use_time = task->buffer.use_time ? 
1 : 0; - - cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); - - int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem); - cl_mem buffers[9]; - for (int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]); - } - kernel_set_args(ckFilterConstructTransform, - arg_ofs, - transform_mem, - rank_mem, - task->filter_area, - task->rect, - task->buffer.pass_stride, - task->buffer.frame_stride, - use_time, - task->radius, - task->pca_threshold); - - enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256); - - return true; -} - -bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) -{ - cl_mem color_mem = CL_MEM_PTR(color_ptr); - cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); - cl_mem scale_mem = CL_MEM_PTR(scale_ptr); - - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - char use_time = 
task->buffer.use_time ? 1 : 0; - - int r = task->radius; - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts); - device_sub_ptr blurDifference( - task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - kernel_set_args(ckNLMCalcDifference, - 0, - color_mem, - color_variance_mem, - scale_mem, - difference_mem, - w, - h, - stride, - pass_stride, - r, - pass_stride, - frame_offset, - 1.0f, - task->nlm_k_2); - kernel_set_args( - ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4); - kernel_set_args( - ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4); - kernel_set_args(ckNLMConstructGramian, - 0, - t, - blurDifference_mem, - buffer_mem, - transform_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->reconstruction_state.filter_window, - w, - h, - stride, - pass_stride, - r, - 4, - frame_offset, - use_time); - - enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, true, 256); - - return true; -} - -bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task) -{ - cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); - - cl_mem output_mem = CL_MEM_PTR(output_ptr); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - - 
kernel_set_args(ckFinalize, - 0, - output_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->filter_area, - task->reconstruction_state.buffer_params, - task->render_buffer.samples); - enqueue_kernel(ckFinalize, w, h); - - return true; -} - -bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task) -{ - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); - - kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r); - enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task) -{ - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); - cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); - cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); - - int arg_ofs = kernel_set_args( - ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem); - cl_mem buffers[9]; - for (int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]); - } - kernel_set_args(ckFilterDivideShadow, - arg_ofs, - a_mem, - b_mem, - sample_variance_mem, - sv_variance_mem, - buffer_variance_mem, - task->rect, - task->render_buffer.pass_stride, - 
task->render_buffer.offset); - enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -bool OpenCLDevice::denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) -{ - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); - - int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem); - cl_mem buffers[9]; - for (int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]); - } - kernel_set_args(ckFilterGetFeature, - arg_ofs, - mean_offset, - variance_offset, - mean_mem, - variance_mem, - scale, - task->rect, - task->render_buffer.pass_stride, - task->render_buffer.offset); - enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -bool OpenCLDevice::denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) -{ - cl_mem from_mem = CL_MEM_PTR(from_ptr); - cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); - - cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); - - kernel_set_args(ckFilterWriteFeature, - 0, - task->render_buffer.samples, - task->reconstruction_state.buffer_params, - task->filter_area, - from_mem, - buffer_mem, - out_offset, - task->rect); - enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w); - - return true; -} - -bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) -{ - cl_mem image_mem = 
CL_MEM_PTR(image_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem depth_mem = CL_MEM_PTR(depth_ptr); - cl_mem output_mem = CL_MEM_PTR(output_ptr); - - cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); - - kernel_set_args(ckFilterDetectOutliers, - 0, - image_mem, - variance_mem, - depth_mem, - output_mem, - task->rect, - task->buffer.pass_stride); - enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising) -{ - denoising.functions.construct_transform = function_bind( - &OpenCLDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind( - &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind( - &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind( - &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind( - &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind( - &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind( - &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind( - &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(rtile); -} - -void OpenCLDevice::shader(DeviceTask 
&task) -{ - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_input = CL_MEM_PTR(task.shader_input); - cl_mem d_output = CL_MEM_PTR(task.shader_output); - cl_int d_shader_eval_type = task.shader_eval_type; - cl_int d_shader_filter = task.shader_filter; - cl_int d_shader_x = task.shader_x; - cl_int d_shader_w = task.shader_w; - cl_int d_offset = task.offset; - - OpenCLDevice::OpenCLProgram *program = &background_program; - if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { - program = &displace_program; - } - program->wait_for_availability(); - cl_kernel kernel = (*program)(); - - cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output); - - set_kernel_arg_buffers(kernel, &start_arg_index); - - start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type); - if (task.shader_eval_type >= SHADER_EVAL_BAKE) { - start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter); - } - start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset); - - for (int sample = 0; sample < task.num_samples; sample++) { - - if (task.get_cancel()) - break; - - kernel_set_args(kernel, start_arg_index, sample); - - enqueue_kernel(kernel, task.shader_w, 1); - - clFinish(cqCommandQueue); - - task.update_progress(NULL); - } -} - -void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile) -{ - scoped_timer timer(&rtile.buffers->render_time); - - /* Cast arguments to cl types. 
*/ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); - cl_int d_x = rtile.x; - cl_int d_y = rtile.y; - cl_int d_w = rtile.w; - cl_int d_h = rtile.h; - cl_int d_offset = rtile.offset; - cl_int d_stride = rtile.stride; - - bake_program.wait_for_availability(); - cl_kernel kernel = bake_program(); - - cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer); - - set_kernel_arg_buffers(kernel, &start_arg_index); - - start_arg_index += kernel_set_args( - kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride); - - int start_sample = rtile.start_sample; - int end_sample = rtile.start_sample + rtile.num_samples; - - for (int sample = start_sample; sample < end_sample; sample++) { - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - - kernel_set_args(kernel, start_arg_index, sample); - - enqueue_kernel(kernel, d_w, d_h); - clFinish(cqCommandQueue); - - rtile.sample = sample + 1; - - task.update_progress(&rtile, rtile.w * rtile.h); - } -} - -static bool kernel_build_opencl_2(cl_device_id cdDevice) -{ - /* Build with OpenCL 2.0 if available, this improves performance - * with AMD OpenCL drivers on Windows and Linux (legacy drivers). - * Note that OpenCL selects the highest 1.x version by default, - * only for 2.0 do we need the explicit compiler flag. */ - int version_major, version_minor; - if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) { - if (version_major >= 2) { - /* This appears to trigger a driver bug in Radeon RX cards with certain - * driver version, so don't use OpenCL 2.0 for those. 
*/ - string device_name = OpenCLInfo::get_readable_device_name(cdDevice); - if (string_startswith(device_name, "Radeon RX 4") || - string_startswith(device_name, "Radeon (TM) RX 4") || - string_startswith(device_name, "Radeon RX 5") || - string_startswith(device_name, "Radeon (TM) RX 5")) { - char version[256] = ""; - int driver_major, driver_minor; - clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); - if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) { - return !(driver_major == 3075 && driver_minor <= 12); - } - } - - return true; - } - } - - return false; -} - -string OpenCLDevice::kernel_build_options(const string *debug_src) -{ - string build_options = "-cl-no-signed-zeros -cl-mad-enable "; - - if (kernel_build_opencl_2(cdDevice)) { - build_options += "-cl-std=CL2.0 "; - } - - if (platform_name == "NVIDIA CUDA") { - build_options += - "-D__KERNEL_OPENCL_NVIDIA__ " - "-cl-nv-maxrregcount=32 " - "-cl-nv-verbose "; - - uint compute_capability_major, compute_capability_minor; - clGetDeviceInfo(cdDevice, - CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, - sizeof(cl_uint), - &compute_capability_major, - NULL); - clGetDeviceInfo(cdDevice, - CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, - sizeof(cl_uint), - &compute_capability_minor, - NULL); - - build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ", - compute_capability_major * 100 + compute_capability_minor * 10); - } - - else if (platform_name == "Apple") - build_options += "-D__KERNEL_OPENCL_APPLE__ "; - - else if (platform_name == "AMD Accelerated Parallel Processing") - build_options += "-D__KERNEL_OPENCL_AMD__ "; - - else if (platform_name == "Intel(R) OpenCL") { - build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ "; - - /* Options for gdb source level kernel debugging. - * this segfaults on linux currently. 
- */ - if (OpenCLInfo::use_debug() && debug_src) - build_options += "-g -s \"" + *debug_src + "\" "; - } - - if (info.has_half_images) { - build_options += "-D__KERNEL_CL_KHR_FP16__ "; - } - - if (OpenCLInfo::use_debug()) { - build_options += "-D__KERNEL_OPENCL_DEBUG__ "; - } - -# ifdef WITH_NANOVDB - if (info.has_nanovdb) { - build_options += "-DWITH_NANOVDB "; - } -# endif - - return build_options; -} - -/* TODO(sergey): In the future we can use variadic templates, once - * C++0x is allowed. Should allow to clean this up a bit. - */ -int OpenCLDevice::kernel_set_args(cl_kernel kernel, - int start_argument_index, - const ArgumentWrapper &arg1, - const ArgumentWrapper &arg2, - const ArgumentWrapper &arg3, - const ArgumentWrapper &arg4, - const ArgumentWrapper &arg5, - const ArgumentWrapper &arg6, - const ArgumentWrapper &arg7, - const ArgumentWrapper &arg8, - const ArgumentWrapper &arg9, - const ArgumentWrapper &arg10, - const ArgumentWrapper &arg11, - const ArgumentWrapper &arg12, - const ArgumentWrapper &arg13, - const ArgumentWrapper &arg14, - const ArgumentWrapper &arg15, - const ArgumentWrapper &arg16, - const ArgumentWrapper &arg17, - const ArgumentWrapper &arg18, - const ArgumentWrapper &arg19, - const ArgumentWrapper &arg20, - const ArgumentWrapper &arg21, - const ArgumentWrapper &arg22, - const ArgumentWrapper &arg23, - const ArgumentWrapper &arg24, - const ArgumentWrapper &arg25, - const ArgumentWrapper &arg26, - const ArgumentWrapper &arg27, - const ArgumentWrapper &arg28, - const ArgumentWrapper &arg29, - const ArgumentWrapper &arg30, - const ArgumentWrapper &arg31, - const ArgumentWrapper &arg32, - const ArgumentWrapper &arg33) -{ - int current_arg_index = 0; -# define FAKE_VARARG_HANDLE_ARG(arg) \ - do { \ - if (arg.pointer != NULL) { \ - opencl_assert(clSetKernelArg( \ - kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \ - ++current_arg_index; \ - } \ - else { \ - return current_arg_index; \ - } \ - } while (false) - 
FAKE_VARARG_HANDLE_ARG(arg1); - FAKE_VARARG_HANDLE_ARG(arg2); - FAKE_VARARG_HANDLE_ARG(arg3); - FAKE_VARARG_HANDLE_ARG(arg4); - FAKE_VARARG_HANDLE_ARG(arg5); - FAKE_VARARG_HANDLE_ARG(arg6); - FAKE_VARARG_HANDLE_ARG(arg7); - FAKE_VARARG_HANDLE_ARG(arg8); - FAKE_VARARG_HANDLE_ARG(arg9); - FAKE_VARARG_HANDLE_ARG(arg10); - FAKE_VARARG_HANDLE_ARG(arg11); - FAKE_VARARG_HANDLE_ARG(arg12); - FAKE_VARARG_HANDLE_ARG(arg13); - FAKE_VARARG_HANDLE_ARG(arg14); - FAKE_VARARG_HANDLE_ARG(arg15); - FAKE_VARARG_HANDLE_ARG(arg16); - FAKE_VARARG_HANDLE_ARG(arg17); - FAKE_VARARG_HANDLE_ARG(arg18); - FAKE_VARARG_HANDLE_ARG(arg19); - FAKE_VARARG_HANDLE_ARG(arg20); - FAKE_VARARG_HANDLE_ARG(arg21); - FAKE_VARARG_HANDLE_ARG(arg22); - FAKE_VARARG_HANDLE_ARG(arg23); - FAKE_VARARG_HANDLE_ARG(arg24); - FAKE_VARARG_HANDLE_ARG(arg25); - FAKE_VARARG_HANDLE_ARG(arg26); - FAKE_VARARG_HANDLE_ARG(arg27); - FAKE_VARARG_HANDLE_ARG(arg28); - FAKE_VARARG_HANDLE_ARG(arg29); - FAKE_VARARG_HANDLE_ARG(arg30); - FAKE_VARARG_HANDLE_ARG(arg31); - FAKE_VARARG_HANDLE_ARG(arg32); - FAKE_VARARG_HANDLE_ARG(arg33); -# undef FAKE_VARARG_HANDLE_ARG - return current_arg_index; -} - -void OpenCLDevice::release_kernel_safe(cl_kernel kernel) -{ - if (kernel) { - clReleaseKernel(kernel); - } -} - -void OpenCLDevice::release_mem_object_safe(cl_mem mem) -{ - if (mem != NULL) { - clReleaseMemObject(mem); - } -} - -void OpenCLDevice::release_program_safe(cl_program program) -{ - if (program) { - clReleaseProgram(program); - } -} - -/* ** Those guys are for working around some compiler-specific bugs ** */ - -cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker) -{ - return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker); -} - -void OpenCLDevice::store_cached_kernel(cl_program program, - ustring key, - thread_scoped_lock &cache_locker) -{ - OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker); -} - -Device *opencl_create_split_device(DeviceInfo &info, - 
Stats &stats, - Profiler &profiler, - bool background) -{ - return new OpenCLDevice(info, stats, profiler, background); -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp deleted file mode 100644 index 4330e07cb37..00000000000 --- a/intern/cycles/device/opencl/memory_manager.cpp +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef WITH_OPENCL - -# include "util/util_foreach.h" - -# include "device/opencl/device_opencl.h" -# include "device/opencl/memory_manager.h" - -CCL_NAMESPACE_BEGIN - -void MemoryManager::DeviceBuffer::add_allocation(Allocation &allocation) -{ - allocations.push_back(&allocation); -} - -void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device) -{ - bool need_realloc = false; - - /* Calculate total size and remove any freed. */ - size_t total_size = 0; - - for (int i = allocations.size() - 1; i >= 0; i--) { - Allocation *allocation = allocations[i]; - - /* Remove allocations that have been freed. */ - if (!allocation->mem || allocation->mem->memory_size() == 0) { - allocation->device_buffer = NULL; - allocation->size = 0; - - allocations.erase(allocations.begin() + i); - - need_realloc = true; - - continue; - } - - /* Get actual size for allocation. 
*/ - size_t alloc_size = align_up(allocation->mem->memory_size(), 16); - - if (allocation->size != alloc_size) { - /* Allocation is either new or resized. */ - allocation->size = alloc_size; - allocation->needs_copy_to_device = true; - - need_realloc = true; - } - - total_size += alloc_size; - } - - /* Always allocate non-empty buffer, NULL pointers cause problems with some drivers. */ - total_size = std::max(total_size, (size_t)16); - - if (need_realloc) { - cl_ulong max_buffer_size; - clGetDeviceInfo( - device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - - if (total_size > max_buffer_size) { - device->set_error("Scene too complex to fit in available memory."); - return; - } - - device_only_memory<uchar> *new_buffer = new device_only_memory<uchar>(device, - "memory manager buffer"); - - new_buffer->alloc_to_device(total_size); - - size_t offset = 0; - - foreach (Allocation *allocation, allocations) { - if (allocation->needs_copy_to_device) { - /* Copy from host to device. */ - opencl_device_assert(device, - clEnqueueWriteBuffer(device->cqCommandQueue, - CL_MEM_PTR(new_buffer->device_pointer), - CL_FALSE, - offset, - allocation->mem->memory_size(), - allocation->mem->host_pointer, - 0, - NULL, - NULL)); - - allocation->needs_copy_to_device = false; - } - else { - /* Fast copy from memory already on device. */ - opencl_device_assert(device, - clEnqueueCopyBuffer(device->cqCommandQueue, - CL_MEM_PTR(buffer->device_pointer), - CL_MEM_PTR(new_buffer->device_pointer), - allocation->desc.offset, - offset, - allocation->mem->memory_size(), - 0, - NULL, - NULL)); - } - - allocation->desc.offset = offset; - offset += allocation->size; - } - - delete buffer; - - buffer = new_buffer; - } - else { - assert(total_size == buffer->data_size); - - size_t offset = 0; - - foreach (Allocation *allocation, allocations) { - if (allocation->needs_copy_to_device) { - /* Copy from host to device. 
*/ - opencl_device_assert(device, - clEnqueueWriteBuffer(device->cqCommandQueue, - CL_MEM_PTR(buffer->device_pointer), - CL_FALSE, - offset, - allocation->mem->memory_size(), - allocation->mem->host_pointer, - 0, - NULL, - NULL)); - - allocation->needs_copy_to_device = false; - } - - offset += allocation->size; - } - } - - /* Not really necessary, but seems to improve responsiveness for some reason. */ - clFinish(device->cqCommandQueue); -} - -void MemoryManager::DeviceBuffer::free(OpenCLDevice *) -{ - buffer->free(); -} - -MemoryManager::DeviceBuffer *MemoryManager::smallest_device_buffer() -{ - DeviceBuffer *smallest = device_buffers; - - foreach (DeviceBuffer &device_buffer, device_buffers) { - if (device_buffer.size < smallest->size) { - smallest = &device_buffer; - } - } - - return smallest; -} - -MemoryManager::MemoryManager(OpenCLDevice *device) : device(device), need_update(false) -{ - foreach (DeviceBuffer &device_buffer, device_buffers) { - device_buffer.buffer = new device_only_memory<uchar>(device, "memory manager buffer"); - } -} - -void MemoryManager::free() -{ - foreach (DeviceBuffer &device_buffer, device_buffers) { - device_buffer.free(device); - } -} - -void MemoryManager::alloc(const char *name, device_memory &mem) -{ - Allocation &allocation = allocations[name]; - - allocation.mem = &mem; - allocation.needs_copy_to_device = true; - - if (!allocation.device_buffer) { - DeviceBuffer *device_buffer = smallest_device_buffer(); - allocation.device_buffer = device_buffer; - - allocation.desc.device_buffer = device_buffer - device_buffers; - - device_buffer->add_allocation(allocation); - - device_buffer->size += mem.memory_size(); - } - - need_update = true; -} - -bool MemoryManager::free(device_memory &mem) -{ - foreach (AllocationsMap::value_type &value, allocations) { - Allocation &allocation = value.second; - if (allocation.mem == &mem) { - - allocation.device_buffer->size -= mem.memory_size(); - - allocation.mem = NULL; - 
allocation.needs_copy_to_device = false; - - need_update = true; - return true; - } - } - - return false; -} - -MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name) -{ - update_device_memory(); - - Allocation &allocation = allocations[name]; - return allocation.desc; -} - -void MemoryManager::update_device_memory() -{ - if (!need_update) { - return; - } - - need_update = false; - - foreach (DeviceBuffer &device_buffer, device_buffers) { - device_buffer.update_device_memory(device); - } -} - -void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) -{ - update_device_memory(); - - foreach (DeviceBuffer &device_buffer, device_buffers) { - if (device_buffer.buffer->device_pointer) { - device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer); - } - else { - device->kernel_set_args(kernel, (*narg)++); - } - } -} - -CCL_NAMESPACE_END - -#endif /* WITH_OPENCL */ diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h deleted file mode 100644 index 23624f837a6..00000000000 --- a/intern/cycles/device/opencl/memory_manager.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "device/device.h" - -#include "util/util_map.h" -#include "util/util_string.h" -#include "util/util_vector.h" - -#include "clew.h" - -CCL_NAMESPACE_BEGIN - -class OpenCLDevice; - -class MemoryManager { - public: - static const int NUM_DEVICE_BUFFERS = 8; - - struct BufferDescriptor { - uint device_buffer; - cl_ulong offset; - }; - - private: - struct DeviceBuffer; - - struct Allocation { - device_memory *mem; - - DeviceBuffer *device_buffer; - size_t size; /* Size of actual allocation, may be larger than requested. */ - - BufferDescriptor desc; - - bool needs_copy_to_device; - - Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false) - { - } - }; - - struct DeviceBuffer { - device_only_memory<uchar> *buffer; - vector<Allocation *> allocations; - size_t size; /* Size of all allocations. */ - - DeviceBuffer() : buffer(NULL), size(0) - { - } - - ~DeviceBuffer() - { - delete buffer; - buffer = NULL; - } - - void add_allocation(Allocation &allocation); - - void update_device_memory(OpenCLDevice *device); - - void free(OpenCLDevice *device); - }; - - OpenCLDevice *device; - - DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS]; - - typedef unordered_map<string, Allocation> AllocationsMap; - AllocationsMap allocations; - - bool need_update; - - DeviceBuffer *smallest_device_buffer(); - - public: - MemoryManager(OpenCLDevice *device); - - void free(); /* Free all memory. 
*/ - - void alloc(const char *name, device_memory &mem); - bool free(device_memory &mem); - - BufferDescriptor get_descriptor(string name); - - void update_device_memory(); - void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); -}; - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp deleted file mode 100644 index 3929cf77f15..00000000000 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ /dev/null @@ -1,1326 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/device_intern.h" -# include "device/opencl/device_opencl.h" - -# include "util/util_debug.h" -# include "util/util_logging.h" -# include "util/util_md5.h" -# include "util/util_path.h" -# include "util/util_semaphore.h" -# include "util/util_system.h" -# include "util/util_time.h" - -using std::cerr; -using std::endl; - -CCL_NAMESPACE_BEGIN - -OpenCLCache::Slot::ProgramEntry::ProgramEntry() : program(NULL), mutex(NULL) -{ -} - -OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry &rhs) - : program(rhs.program), mutex(NULL) -{ -} - -OpenCLCache::Slot::ProgramEntry::~ProgramEntry() -{ - delete mutex; -} - -OpenCLCache::Slot::Slot() : context_mutex(NULL), context(NULL) -{ -} - -OpenCLCache::Slot::Slot(const Slot &rhs) - : context_mutex(NULL), context(NULL), programs(rhs.programs) -{ -} - -OpenCLCache::Slot::~Slot() -{ - delete context_mutex; -} - -OpenCLCache &OpenCLCache::global_instance() -{ - static OpenCLCache instance; - return instance; -} - -cl_context OpenCLCache::get_context(cl_platform_id platform, - cl_device_id device, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - - pair<CacheMap::iterator, bool> ins = self.cache.insert( - CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); - - Slot &slot = ins.first->second; - - /* create slot lock only while holding cache lock */ - if (!slot.context_mutex) - slot.context_mutex = new thread_mutex; - - /* need to unlock cache before locking slot, to allow store to complete */ - cache_lock.unlock(); - - /* lock the slot */ - slot_locker = thread_scoped_lock(*slot.context_mutex); - - /* If the thing isn't cached */ - if (slot.context == NULL) { - /* return with the caller's lock holder holding the slot lock */ - return NULL; - } - - /* the item was already cached, release the slot lock */ - slot_locker.unlock(); - - cl_int ciErr = 
clRetainContext(slot.context); - assert(ciErr == CL_SUCCESS); - (void)ciErr; - - return slot.context; -} - -cl_program OpenCLCache::get_program(cl_platform_id platform, - cl_device_id device, - ustring key, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - - pair<CacheMap::iterator, bool> ins = self.cache.insert( - CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); - - Slot &slot = ins.first->second; - - pair<Slot::EntryMap::iterator, bool> ins2 = slot.programs.insert( - Slot::EntryMap::value_type(key, Slot::ProgramEntry())); - - Slot::ProgramEntry &entry = ins2.first->second; - - /* create slot lock only while holding cache lock */ - if (!entry.mutex) - entry.mutex = new thread_mutex; - - /* need to unlock cache before locking slot, to allow store to complete */ - cache_lock.unlock(); - - /* lock the slot */ - slot_locker = thread_scoped_lock(*entry.mutex); - - /* If the thing isn't cached */ - if (entry.program == NULL) { - /* return with the caller's lock holder holding the slot lock */ - return NULL; - } - - /* the item was already cached, release the slot lock */ - slot_locker.unlock(); - - cl_int ciErr = clRetainProgram(entry.program); - assert(ciErr == CL_SUCCESS); - (void)ciErr; - - return entry.program; -} - -void OpenCLCache::store_context(cl_platform_id platform, - cl_device_id device, - cl_context context, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - assert(device != NULL); - assert(context != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); - cache_lock.unlock(); - - Slot &slot = i->second; - - /* sanity check */ - assert(i != self.cache.end()); - assert(slot.context == NULL); - - slot.context = context; - - /* unlock the slot */ - slot_locker.unlock(); - - /* increment 
reference count in OpenCL. - * The caller is going to release the object when done with it. */ - cl_int ciErr = clRetainContext(context); - assert(ciErr == CL_SUCCESS); - (void)ciErr; -} - -void OpenCLCache::store_program(cl_platform_id platform, - cl_device_id device, - cl_program program, - ustring key, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - assert(device != NULL); - assert(program != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - - CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); - assert(i != self.cache.end()); - Slot &slot = i->second; - - Slot::EntryMap::iterator i2 = slot.programs.find(key); - assert(i2 != slot.programs.end()); - Slot::ProgramEntry &entry = i2->second; - - assert(entry.program == NULL); - - cache_lock.unlock(); - - entry.program = program; - - /* unlock the slot */ - slot_locker.unlock(); - - /* Increment reference count in OpenCL. - * The caller is going to release the object when done with it. - */ - cl_int ciErr = clRetainProgram(program); - assert(ciErr == CL_SUCCESS); - (void)ciErr; -} - -string OpenCLCache::get_kernel_md5() -{ - OpenCLCache &self = global_instance(); - thread_scoped_lock lock(self.kernel_md5_lock); - - if (self.kernel_md5.empty()) { - self.kernel_md5 = path_files_md5_hash(path_get("source")); - } - return self.kernel_md5; -} - -static string get_program_source(const string &kernel_file) -{ - string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; - /* We compile kernels consisting of many files. unfortunately OpenCL - * kernel caches do not seem to recognize changes in included files. - * so we force recompile on changes by adding the md5 hash of all files. 
- */ - source = path_source_replace_includes(source, path_get("source")); - source += "\n// " + util_md5_string(source) + "\n"; - return source; -} - -OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device, - const string &program_name, - const string &kernel_file, - const string &kernel_build_options, - bool use_stdout) - : device(device), - program_name(program_name), - kernel_file(kernel_file), - kernel_build_options(kernel_build_options), - use_stdout(use_stdout) -{ - loaded = false; - needs_compiling = true; - program = NULL; -} - -OpenCLDevice::OpenCLProgram::~OpenCLProgram() -{ - release(); -} - -void OpenCLDevice::OpenCLProgram::release() -{ - for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); - ++kernel) { - if (kernel->second) { - clReleaseKernel(kernel->second); - kernel->second = NULL; - } - } - if (program) { - clReleaseProgram(program); - program = NULL; - } -} - -void OpenCLDevice::OpenCLProgram::add_log(const string &msg, bool debug) -{ - if (!use_stdout) { - log += msg + "\n"; - } - else if (!debug) { - printf("%s\n", msg.c_str()); - fflush(stdout); - } - else { - VLOG(2) << msg; - } -} - -void OpenCLDevice::OpenCLProgram::add_error(const string &msg) -{ - if (use_stdout) { - fprintf(stderr, "%s\n", msg.c_str()); - } - if (error_msg == "") { - error_msg += "\n"; - } - error_msg += msg; -} - -void OpenCLDevice::OpenCLProgram::add_kernel(ustring name) -{ - if (!kernels.count(name)) { - kernels[name] = NULL; - } -} - -bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src) -{ - string build_options; - build_options = device->kernel_build_options(debug_src) + kernel_build_options; - - VLOG(1) << "Build options passed to clBuildProgram: '" << build_options << "'."; - cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); - - /* show warnings even if build is successful */ - size_t ret_val_size = 0; - - clGetProgramBuildInfo(program, device->cdDevice, 
CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); - - if (ciErr != CL_SUCCESS) { - add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) + - ", errors in console."); - } - - if (ret_val_size > 1) { - vector<char> build_log(ret_val_size + 1); - clGetProgramBuildInfo( - program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL); - - build_log[ret_val_size] = '\0'; - /* Skip meaningless empty output from the NVidia compiler. */ - if (!(ret_val_size == 2 && build_log[0] == '\n')) { - add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]), - ciErr == CL_SUCCESS); - } - } - - return (ciErr == CL_SUCCESS); -} - -bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src) -{ - string source = get_program_source(kernel_file); - - if (debug_src) { - path_write_text(*debug_src, source); - } - - size_t source_len = source.size(); - const char *source_str = source.c_str(); - cl_int ciErr; - - program = clCreateProgramWithSource(device->cxContext, 1, &source_str, &source_len, &ciErr); - - if (ciErr != CL_SUCCESS) { - add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr)); - return false; - } - - double starttime = time_dt(); - add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); - add_log(string("Build flags: ") + kernel_build_options, true); - - if (!build_kernel(debug_src)) - return false; - - double elapsed = time_dt() - starttime; - add_log( - string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), - false); - - return true; -} - -static void escape_python_string(string &str) -{ - /* Escape string to be passed as a Python raw string with '' quotes'. 
*/ - string_replace(str, "'", "\'"); -} - -static int opencl_compile_process_limit() -{ - /* Limit number of concurrent processes compiling, with a heuristic based - * on total physical RAM and estimate of memory usage needed when compiling - * with all Cycles features enabled. - * - * This is somewhat arbitrary as we don't know the actual available RAM or - * how much the kernel compilation will needed depending on the features, but - * better than not limiting at all. */ - static const int64_t GB = 1024LL * 1024LL * 1024LL; - static const int64_t process_memory = 2 * GB; - static const int64_t base_memory = 2 * GB; - static const int64_t system_memory = system_physical_ram(); - static const int64_t process_limit = (system_memory - base_memory) / process_memory; - - return max((int)process_limit, 1); -} - -bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin) -{ - /* Construct arguments. */ - vector<string> args; - args.push_back("--background"); - args.push_back("--factory-startup"); - args.push_back("--python-expr"); - - int device_platform_id = device->device_num; - string device_name = device->device_name; - string platform_name = device->platform_name; - string build_options = device->kernel_build_options(NULL) + kernel_build_options; - string kernel_file_escaped = kernel_file; - string clbin_escaped = clbin; - - escape_python_string(device_name); - escape_python_string(platform_name); - escape_python_string(build_options); - escape_python_string(kernel_file_escaped); - escape_python_string(clbin_escaped); - - args.push_back(string_printf( - "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')", - device_platform_id, - device_name.c_str(), - platform_name.c_str(), - build_options.c_str(), - kernel_file_escaped.c_str(), - clbin_escaped.c_str())); - - /* Limit number of concurrent processes compiling. 
*/ - static thread_counting_semaphore semaphore(opencl_compile_process_limit()); - semaphore.acquire(); - - /* Compile. */ - const double starttime = time_dt(); - add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); - add_log(string("Build flags: ") + kernel_build_options, true); - const bool success = system_call_self(args); - const double elapsed = time_dt() - starttime; - - semaphore.release(); - - if (!success || !path_exists(clbin)) { - return false; - } - - add_log( - string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), - false); - - return load_binary(clbin); -} - -/* Compile opencl kernel. This method is called from the _cycles Python - * module compile kernels. Parameters must match function above. */ -bool device_opencl_compile_kernel(const vector<string> ¶meters) -{ - int device_platform_id = std::stoi(parameters[0]); - const string &device_name = parameters[1]; - const string &platform_name = parameters[2]; - const string &build_options = parameters[3]; - const string &kernel_file = parameters[4]; - const string &binary_path = parameters[5]; - - if (clewInit() != CLEW_SUCCESS) { - return false; - } - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if (device_platform_id >= usable_devices.size()) { - return false; - } - - OpenCLPlatformDevice &platform_device = usable_devices[device_platform_id]; - if (platform_device.platform_name != platform_name || - platform_device.device_name != device_name) { - return false; - } - - cl_platform_id platform = platform_device.platform_id; - cl_device_id device = platform_device.device_id; - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0, 0}; - - cl_int err; - cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err); - if (err != CL_SUCCESS) { - return false; - } - - string source = 
get_program_source(kernel_file); - size_t source_len = source.size(); - const char *source_str = source.c_str(); - cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err); - bool result = false; - - if (err == CL_SUCCESS) { - err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); - - if (err == CL_SUCCESS) { - size_t size = 0; - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - if (size > 0) { - vector<uint8_t> binary(size); - uint8_t *bytes = &binary[0]; - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL); - result = path_write_binary(binary_path, binary); - } - } - clReleaseProgram(program); - } - - clReleaseContext(context); - - return result; -} - -bool OpenCLDevice::OpenCLProgram::load_binary(const string &clbin, const string *debug_src) -{ - /* read binary into memory */ - vector<uint8_t> binary; - - if (!path_read_binary(clbin, binary)) { - add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str())); - return false; - } - - /* create program */ - cl_int status, ciErr; - size_t size = binary.size(); - const uint8_t *bytes = &binary[0]; - - program = clCreateProgramWithBinary( - device->cxContext, 1, &device->cdDevice, &size, &bytes, &status, &ciErr); - - if (status != CL_SUCCESS || ciErr != CL_SUCCESS) { - add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " + - clewErrorString(status) + " " + clewErrorString(ciErr)); - return false; - } - - if (!build_kernel(debug_src)) - return false; - - return true; -} - -bool OpenCLDevice::OpenCLProgram::save_binary(const string &clbin) -{ - size_t size = 0; - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - - if (!size) - return false; - - vector<uint8_t> binary(size); - uint8_t *bytes = &binary[0]; - - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL); - - return path_write_binary(clbin, 
binary); -} - -bool OpenCLDevice::OpenCLProgram::load() -{ - loaded = false; - string device_md5 = device->device_md5_hash(kernel_build_options); - - /* Try to use cached kernel. */ - thread_scoped_lock cache_locker; - ustring cache_key(program_name + device_md5); - program = device->load_cached_kernel(cache_key, cache_locker); - if (!program) { - add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - - /* need to create source to get md5 */ - string source = get_program_source(kernel_file); - - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + - util_md5_string(source); - basename = path_cache_get(path_join("kernels", basename)); - string clbin = basename + ".clbin"; - - /* If binary kernel exists already, try use it. */ - if (path_exists(clbin) && load_binary(clbin)) { - /* Kernel loaded from binary, nothing to do. */ - add_log(string("Loaded program from ") + clbin + ".", true); - - /* Cache the program. */ - device->store_cached_kernel(program, cache_key, cache_locker); - } - else { - add_log(string("OpenCL program ") + program_name + " not found on disk.", true); - cache_locker.unlock(); - } - } - - if (program) { - create_kernels(); - loaded = true; - needs_compiling = false; - } - - return loaded; -} - -void OpenCLDevice::OpenCLProgram::compile() -{ - assert(device); - - string device_md5 = device->device_md5_hash(kernel_build_options); - - /* Try to use cached kernel. 
*/ - thread_scoped_lock cache_locker; - ustring cache_key(program_name + device_md5); - program = device->load_cached_kernel(cache_key, cache_locker); - - if (!program) { - - add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - - /* need to create source to get md5 */ - string source = get_program_source(kernel_file); - - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + - util_md5_string(source); - basename = path_cache_get(path_join("kernels", basename)); - string clbin = basename + ".clbin"; - - /* path to preprocessed source for debugging */ - string clsrc, *debug_src = NULL; - - if (OpenCLInfo::use_debug()) { - clsrc = basename + ".cl"; - debug_src = &clsrc; - } - - if (DebugFlags().running_inside_blender && compile_separate(clbin)) { - add_log(string("Built and loaded program from ") + clbin + ".", true); - loaded = true; - } - else { - if (DebugFlags().running_inside_blender) { - add_log(string("Separate-process building of ") + clbin + - " failed, will fall back to regular building.", - true); - } - - /* If does not exist or loading binary failed, compile kernel. */ - if (!compile_kernel(debug_src)) { - needs_compiling = false; - return; - } - - /* Save binary for reuse. */ - if (!save_binary(clbin)) { - add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true); - } - } - - /* Cache the program. 
*/ - device->store_cached_kernel(program, cache_key, cache_locker); - } - - create_kernels(); - needs_compiling = false; - loaded = true; -} - -void OpenCLDevice::OpenCLProgram::create_kernels() -{ - for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); - ++kernel) { - assert(kernel->second == NULL); - cl_int ciErr; - string name = "kernel_ocl_" + kernel->first.string(); - kernel->second = clCreateKernel(program, name.c_str(), &ciErr); - if (device->opencl_error(ciErr)) { - add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " + - clewErrorString(ciErr)); - return; - } - } -} - -bool OpenCLDevice::OpenCLProgram::wait_for_availability() -{ - add_log(string("Waiting for availability of ") + program_name + ".", true); - while (needs_compiling) { - time_sleep(0.1); - } - return loaded; -} - -void OpenCLDevice::OpenCLProgram::report_error() -{ - /* If loaded is true, there was no error. */ - if (loaded) - return; - /* if use_stdout is true, the error was already reported. 
*/ - if (use_stdout) - return; - - cerr << error_msg << endl; - if (!compile_output.empty()) { - cerr << "OpenCL kernel build output for " << program_name << ":" << endl; - cerr << compile_output << endl; - } -} - -cl_kernel OpenCLDevice::OpenCLProgram::operator()() -{ - assert(kernels.size() == 1); - return kernels.begin()->second; -} - -cl_kernel OpenCLDevice::OpenCLProgram::operator()(ustring name) -{ - assert(kernels.count(name)); - return kernels[name]; -} - -cl_device_type OpenCLInfo::device_type() -{ - switch (DebugFlags().opencl.device_type) { - case DebugFlags::OpenCL::DEVICE_NONE: - return 0; - case DebugFlags::OpenCL::DEVICE_ALL: - return CL_DEVICE_TYPE_ALL; - case DebugFlags::OpenCL::DEVICE_DEFAULT: - return CL_DEVICE_TYPE_DEFAULT; - case DebugFlags::OpenCL::DEVICE_CPU: - return CL_DEVICE_TYPE_CPU; - case DebugFlags::OpenCL::DEVICE_GPU: - return CL_DEVICE_TYPE_GPU; - case DebugFlags::OpenCL::DEVICE_ACCELERATOR: - return CL_DEVICE_TYPE_ACCELERATOR; - default: - return CL_DEVICE_TYPE_ALL; - } -} - -bool OpenCLInfo::use_debug() -{ - return DebugFlags().opencl.debug; -} - -bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_id device_id) -{ - cl_device_type device_type; - if (!get_device_type(device_id, &device_type)) { - return false; - } - string device_name; - if (!get_device_name(device_id, &device_name)) { - return false; - } - - int driver_major = 0; - int driver_minor = 0; - if (!get_driver_version(device_id, &driver_major, &driver_minor)) { - return false; - } - VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor; - - if (getenv("CYCLES_OPENCL_TEST")) { - return true; - } - - /* Allow Intel GPUs on Intel OpenCL platform. */ - if (platform_name.find("Intel") != string::npos) { - if (device_type != CL_DEVICE_TYPE_GPU) { - /* OpenCL on Intel CPU is not an officially supported configuration. - * Use hybrid CPU+GPU rendering to utilize both GPU and CPU. 
*/ - return false; - } - -# ifdef __APPLE__ - /* Apple uses own framework, which can also put Iris onto AMD frame-work. - * This isn't supported configuration. */ - return false; -# else - if (device_name.find("Iris") != string::npos || device_name.find("Xe") != string::npos) { - return true; - } -# endif - } - - if (platform_name == "AMD Accelerated Parallel Processing" && - device_type == CL_DEVICE_TYPE_GPU) { - if (driver_major < 2236) { - VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; - return false; - } - const char *blacklist[] = {/* GCN 1 */ - "Tahiti", - "Pitcairn", - "Capeverde", - "Oland", - "Hainan", - NULL}; - for (int i = 0; blacklist[i] != NULL; i++) { - if (device_name == blacklist[i]) { - VLOG(1) << "AMD device " << device_name << " not supported"; - return false; - } - } - return true; - } - if (platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { - return false; - } - return false; -} - -bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error) -{ - const int req_major = 1, req_minor = 1; - int major, minor; - char version[256]; - clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL); - if (sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) { - if (error != NULL) { - *error = string_printf("OpenCL: failed to parse platform version string (%s).", version); - } - return false; - } - if (!((major == req_major && minor >= req_minor) || (major > req_major))) { - if (error != NULL) { - *error = string_printf( - "OpenCL: platform version 1.1 or later required, found %d.%d", major, minor); - } - return false; - } - if (error != NULL) { - *error = ""; - } - return true; -} - -bool OpenCLInfo::get_device_version(cl_device_id device, int *r_major, int *r_minor, string *error) -{ - char version[256]; - clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL); - if (sscanf(version, "OpenCL C %d.%d", r_major, r_minor) < 2) { 
- if (error != NULL) { - *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version); - } - return false; - } - if (error != NULL) { - *error = ""; - } - return true; -} - -bool OpenCLInfo::device_version_check(cl_device_id device, string *error) -{ - const int req_major = 1, req_minor = 1; - int major, minor; - if (!get_device_version(device, &major, &minor, error)) { - return false; - } - - if (!((major == req_major && minor >= req_minor) || (major > req_major))) { - if (error != NULL) { - *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor); - } - return false; - } - if (error != NULL) { - *error = ""; - } - return true; -} - -string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id device_id) -{ - if (platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { - /* Use cl_amd_device_topology extension. */ - cl_char topology[24]; - if (clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS && - topology[0] == 1) { - return string_printf("%02x:%02x.%01x", - (unsigned int)topology[21], - (unsigned int)topology[22], - (unsigned int)topology[23]); - } - } - else if (platform_name == "NVIDIA CUDA") { - /* Use two undocumented options of the cl_nv_device_attribute_query extension. */ - cl_int bus_id, slot_id; - if (clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS && - clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) { - return string_printf("%02x:%02x.%01x", - (unsigned int)(bus_id), - (unsigned int)(slot_id >> 3), - (unsigned int)(slot_id & 0x7)); - } - } - /* No general way to get a hardware ID from OpenCL => give up. 
*/ - return ""; -} - -void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices) -{ - const cl_device_type device_type = OpenCLInfo::device_type(); - static bool first_time = true; -# define FIRST_VLOG(severity) \ - if (first_time) \ - VLOG(severity) - - usable_devices->clear(); - - if (device_type == 0) { - FIRST_VLOG(2) << "OpenCL devices are forced to be disabled."; - first_time = false; - return; - } - - cl_int error; - vector<cl_device_id> device_ids; - vector<cl_platform_id> platform_ids; - - /* Get platforms. */ - if (!get_platforms(&platform_ids, &error)) { - FIRST_VLOG(2) << "Error fetching platforms:" << string(clewErrorString(error)); - first_time = false; - return; - } - if (platform_ids.size() == 0) { - FIRST_VLOG(2) << "No OpenCL platforms were found."; - first_time = false; - return; - } - /* Devices are numbered consecutively across platforms. */ - for (int platform = 0; platform < platform_ids.size(); platform++) { - cl_platform_id platform_id = platform_ids[platform]; - string platform_name; - if (!get_platform_name(platform_id, &platform_name)) { - FIRST_VLOG(2) << "Failed to get platform name, ignoring."; - continue; - } - FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << "."; - if (!platform_version_check(platform_id)) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << " due to too old compiler version."; - continue; - } - if (!get_platform_devices(platform_id, device_type, &device_ids, &error)) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", failed to fetch of devices: " << string(clewErrorString(error)); - continue; - } - if (device_ids.size() == 0) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name << ", it has no devices."; - continue; - } - for (int num = 0; num < device_ids.size(); num++) { - const cl_device_id device_id = device_ids[num]; - string device_name; - if (!get_device_name(device_id, &device_name, &error)) { - FIRST_VLOG(2) << "Failed to fetch 
device name: " << string(clewErrorString(error)) - << ", ignoring."; - continue; - } - if (!device_version_check(device_id)) { - FIRST_VLOG(2) << "Ignoring device " << device_name << " due to old compiler version."; - continue; - } - if (device_supported(platform_name, device_id)) { - cl_device_type device_type; - if (!get_device_type(device_id, &device_type, &error)) { - FIRST_VLOG(2) << "Ignoring device " << device_name - << ", failed to fetch device type:" << string(clewErrorString(error)); - continue; - } - string readable_device_name = get_readable_device_name(device_id); - if (readable_device_name != device_name) { - FIRST_VLOG(2) << "Using more readable device name: " << readable_device_name; - } - FIRST_VLOG(2) << "Adding new device " << readable_device_name << "."; - string hardware_id = get_hardware_id(platform_name, device_id); - string device_extensions = get_device_extensions(device_id); - usable_devices->push_back(OpenCLPlatformDevice(platform_id, - platform_name, - device_id, - device_type, - readable_device_name, - hardware_id, - device_extensions)); - } - else { - FIRST_VLOG(2) << "Ignoring device " << device_name << ", not officially supported yet."; - } - } - } - first_time = false; -} - -bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error) -{ - /* Reset from possible previous state. */ - platform_ids->resize(0); - cl_uint num_platforms; - if (!get_num_platforms(&num_platforms, error)) { - return false; - } - /* Get actual platforms. 
*/ - cl_int err; - platform_ids->resize(num_platforms); - if ((err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -vector<cl_platform_id> OpenCLInfo::get_platforms() -{ - vector<cl_platform_id> platform_ids; - get_platforms(&platform_ids); - return platform_ids; -} - -bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error) -{ - cl_int err; - if ((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *num_platforms = 0; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -cl_uint OpenCLInfo::get_num_platforms() -{ - cl_uint num_platforms; - if (!get_num_platforms(&num_platforms)) { - return 0; - } - return num_platforms; -} - -bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string *platform_name) -{ - char buffer[256]; - if (clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(buffer), &buffer, NULL) != - CL_SUCCESS) { - *platform_name = ""; - return false; - } - *platform_name = buffer; - return true; -} - -string OpenCLInfo::get_platform_name(cl_platform_id platform_id) -{ - string platform_name; - if (!get_platform_name(platform_id, &platform_name)) { - return ""; - } - return platform_name; -} - -bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - cl_uint *num_devices, - cl_int *error) -{ - cl_int err; - if ((err = clGetDeviceIDs(platform_id, device_type, 0, NULL, num_devices)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *num_devices = 0; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type) -{ - cl_uint num_devices; - if (!get_num_platform_devices(platform_id, 
device_type, &num_devices)) { - return 0; - } - return num_devices; -} - -bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - vector<cl_device_id> *device_ids, - cl_int *error) -{ - /* Reset from possible previous state. */ - device_ids->resize(0); - /* Get number of devices to pre-allocate memory. */ - cl_uint num_devices; - if (!get_num_platform_devices(platform_id, device_type, &num_devices, error)) { - return false; - } - /* Get actual device list. */ - device_ids->resize(num_devices); - cl_int err; - if ((err = clGetDeviceIDs(platform_id, device_type, num_devices, &device_ids->at(0), NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type) -{ - vector<cl_device_id> devices; - get_platform_devices(platform_id, device_type, &devices); - return devices; -} - -bool OpenCLInfo::get_device_name(cl_device_id device_id, string *device_name, cl_int *error) -{ - char buffer[1024]; - cl_int err; - if ((err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), &buffer, NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_name = ""; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - *device_name = buffer; - return true; -} - -string OpenCLInfo::get_device_name(cl_device_id device_id) -{ - string device_name; - if (!get_device_name(device_id, &device_name)) { - return ""; - } - return device_name; -} - -bool OpenCLInfo::get_device_extensions(cl_device_id device_id, - string *device_extensions, - cl_int *error) -{ - size_t extension_length = 0; - cl_int err; - /* Determine the size of the extension string. 
*/ - if ((err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, 0, &extension_length)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_extensions = ""; - return false; - } - vector<char> buffer(extension_length); - if ((err = clGetDeviceInfo( - device_id, CL_DEVICE_EXTENSIONS, extension_length, buffer.data(), NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_extensions = ""; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - *device_extensions = string(buffer.data()); - return true; -} - -string OpenCLInfo::get_device_extensions(cl_device_id device_id) -{ - string device_extensions; - if (!get_device_extensions(device_id, &device_extensions)) { - return ""; - } - return device_extensions; -} - -bool OpenCLInfo::get_device_type(cl_device_id device_id, - cl_device_type *device_type, - cl_int *error) -{ - cl_int err; - if ((err = clGetDeviceInfo( - device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), device_type, NULL)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_type = 0; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id) -{ - cl_device_type device_type; - if (!get_device_type(device_id, &device_type)) { - return 0; - } - return device_type; -} - -string OpenCLInfo::get_readable_device_name(cl_device_id device_id) -{ - string name = ""; - char board_name[1024]; - size_t length = 0; - if (clGetDeviceInfo( - device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, &length) == - CL_SUCCESS) { - if (length != 0 && board_name[0] != '\0') { - name = board_name; - } - } - - /* Fallback to standard device name API. */ - if (name.empty()) { - name = get_device_name(device_id); - } - - /* Special exception for AMD Vega, need to be able to tell - * Vega 56 from 64 apart. 
- */ - if (name == "Radeon RX Vega") { - cl_int max_compute_units = 0; - if (clGetDeviceInfo(device_id, - CL_DEVICE_MAX_COMPUTE_UNITS, - sizeof(max_compute_units), - &max_compute_units, - NULL) == CL_SUCCESS) { - name += " " + to_string(max_compute_units); - } - } - - /* Distinguish from our native CPU device. */ - if (get_device_type(device_id) & CL_DEVICE_TYPE_CPU) { - name += " (OpenCL)"; - } - - return name; -} - -bool OpenCLInfo::get_driver_version(cl_device_id device_id, int *major, int *minor, cl_int *error) -{ - char buffer[1024]; - cl_int err; - if ((err = clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), &buffer, NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - if (sscanf(buffer, "%d.%d", major, minor) < 2) { - VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); - return false; - } - return true; -} - -int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id) -{ - int base_align_bits; - if (clGetDeviceInfo( - device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &base_align_bits, NULL) == - CL_SUCCESS) { - return base_align_bits / 8; - } - return 1; -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp new file mode 100644 index 00000000000..13f23bd229a --- /dev/null +++ b/intern/cycles/device/optix/device.cpp @@ -0,0 +1,105 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/optix/device.h" + +#include "device/cuda/device.h" +#include "device/optix/device_impl.h" +#include "util/util_logging.h" + +#ifdef WITH_OPTIX +# include <optix_function_table_definition.h> +#endif + +CCL_NAMESPACE_BEGIN + +bool device_optix_init() +{ +#ifdef WITH_OPTIX + if (g_optixFunctionTable.optixDeviceContextCreate != NULL) { + /* Already initialized function table. */ + return true; + } + + /* Need to initialize CUDA as well. */ + if (!device_cuda_init()) { + return false; + } + + const OptixResult result = optixInit(); + + if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { + VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. " + "Please update to the latest driver first!"; + return false; + } + else if (result != OPTIX_SUCCESS) { + VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result; + return false; + } + + /* Loaded OptiX successfully! */ + return true; +#else + return false; +#endif +} + +void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices) +{ +#ifdef WITH_OPTIX + devices.reserve(cuda_devices.size()); + + /* Simply add all supported CUDA devices as OptiX devices again. */ + for (DeviceInfo info : cuda_devices) { + assert(info.type == DEVICE_CUDA); + + int major; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num); + if (major < 5) { + /* Only Maxwell and up are supported by OptiX. 
*/ + continue; + } + + info.type = DEVICE_OPTIX; + info.id += "_OptiX"; + info.denoisers |= DENOISER_OPTIX; + + devices.push_back(info); + } +#else + (void)cuda_devices; + (void)devices; +#endif +} + +Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ +#ifdef WITH_OPTIX + return new OptiXDevice(info, stats, profiler); +#else + (void)info; + (void)stats; + (void)profiler; + + LOG(FATAL) << "Request to create OptiX device without compiled-in support. Should never happen."; + + return nullptr; +#endif +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/optix/device.h b/intern/cycles/device/optix/device.h new file mode 100644 index 00000000000..29fa729c2e4 --- /dev/null +++ b/intern/cycles/device/optix/device.h @@ -0,0 +1,35 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_optix_init(); + +Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp new file mode 100644 index 00000000000..b54d423a183 --- /dev/null +++ b/intern/cycles/device/optix/device_impl.cpp @@ -0,0 +1,1573 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_OPTIX + +# include "device/optix/device_impl.h" + +# include "bvh/bvh.h" +# include "bvh/bvh_optix.h" +# include "integrator/pass_accessor_gpu.h" +# include "render/buffers.h" +# include "render/hair.h" +# include "render/mesh.h" +# include "render/object.h" +# include "render/pass.h" +# include "render/scene.h" + +# include "util/util_debug.h" +# include "util/util_logging.h" +# include "util/util_md5.h" +# include "util/util_path.h" +# include "util/util_progress.h" +# include "util/util_time.h" + +# undef __KERNEL_CPU__ +# define __KERNEL_OPTIX__ +# include "kernel/device/optix/globals.h" + +CCL_NAMESPACE_BEGIN + +OptiXDevice::Denoiser::Denoiser(OptiXDevice *device) + : device(device), queue(device), state(device, "__denoiser_state") +{ +} + +OptiXDevice::Denoiser::~Denoiser() +{ + const CUDAContextScope scope(device); + if (optix_denoiser != nullptr) { + optixDenoiserDestroy(optix_denoiser); + } +} + +OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : CUDADevice(info, stats, profiler), + sbt_data(this, "__sbt", MEM_READ_ONLY), + launch_params(this, "__params"), + denoiser_(this) +{ + /* Make the CUDA context current. */ + if (!cuContext) { + /* Do not initialize if CUDA context creation failed already. */ + return; + } + const CUDAContextScope scope(this); + + /* Create OptiX context for this device. */ + OptixDeviceContextOptions options = {}; +# ifdef WITH_CYCLES_LOGGING + options.logCallbackLevel = 4; /* Fatal = 1, Error = 2, Warning = 3, Print = 4. 
*/ + options.logCallbackFunction = [](unsigned int level, const char *, const char *message, void *) { + switch (level) { + case 1: + LOG_IF(FATAL, VLOG_IS_ON(1)) << message; + break; + case 2: + LOG_IF(ERROR, VLOG_IS_ON(1)) << message; + break; + case 3: + LOG_IF(WARNING, VLOG_IS_ON(1)) << message; + break; + case 4: + LOG_IF(INFO, VLOG_IS_ON(1)) << message; + break; + } + }; +# endif + if (DebugFlags().optix.use_debug) { + options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL; + } + optix_assert(optixDeviceContextCreate(cuContext, &options, &context)); +# ifdef WITH_CYCLES_LOGGING + optix_assert(optixDeviceContextSetLogCallback( + context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel)); +# endif + + /* Fix weird compiler bug that assigns wrong size. */ + launch_params.data_elements = sizeof(KernelParamsOptiX); + + /* Allocate launch parameter buffer memory on device. */ + launch_params.alloc_to_device(1); +} + +OptiXDevice::~OptiXDevice() +{ + /* Make CUDA context current. */ + const CUDAContextScope scope(this); + + free_bvh_memory_delayed(); + + sbt_data.free(); + texture_info.free(); + launch_params.free(); + + /* Unload modules. */ + if (optix_module != NULL) { + optixModuleDestroy(optix_module); + } + for (unsigned int i = 0; i < 2; ++i) { + if (builtin_modules[i] != NULL) { + optixModuleDestroy(builtin_modules[i]); + } + } + for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { + if (pipelines[i] != NULL) { + optixPipelineDestroy(pipelines[i]); + } + } + + optixDeviceContextDestroy(context); +} + +unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create() +{ + return make_unique<OptiXDeviceQueue>(this); +} + +BVHLayoutMask OptiXDevice::get_bvh_layout_mask() const +{ + /* OptiX has its own internal acceleration structure format. 
*/ + return BVH_LAYOUT_OPTIX; +} + +string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features) +{ + string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features); + + /* Add OptiX SDK include directory to include paths. */ + const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR"); + if (optix_sdk_path) { + common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path); + } + + /* Specialization for shader raytracing. */ + if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + common_cflags += " --keep-device-functions"; + } + + return common_cflags; +} + +bool OptiXDevice::load_kernels(const uint kernel_features) +{ + if (have_error()) { + /* Abort early if context creation failed already. */ + return false; + } + + /* Load CUDA modules because we need some of the utility kernels. */ + if (!CUDADevice::load_kernels(kernel_features)) { + return false; + } + + /* Skip creating OptiX module if only doing denoising. */ + if (!(kernel_features & (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING))) { + return true; + } + + const CUDAContextScope scope(this); + + /* Unload existing OptiX module and pipelines first. */ + if (optix_module != NULL) { + optixModuleDestroy(optix_module); + optix_module = NULL; + } + for (unsigned int i = 0; i < 2; ++i) { + if (builtin_modules[i] != NULL) { + optixModuleDestroy(builtin_modules[i]); + builtin_modules[i] = NULL; + } + } + for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { + if (pipelines[i] != NULL) { + optixPipelineDestroy(pipelines[i]); + pipelines[i] = NULL; + } + } + + OptixModuleCompileOptions module_options = {}; + module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. 
*/ + + if (DebugFlags().optix.use_debug) { + module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; + } + else { + module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; + } + + module_options.boundValues = nullptr; + module_options.numBoundValues = 0; + + OptixPipelineCompileOptions pipeline_options = {}; + /* Default to no motion blur and two-level graph, since it is the fastest option. */ + pipeline_options.usesMotionBlur = false; + pipeline_options.traversableGraphFlags = + OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING; + pipeline_options.numPayloadValues = 6; + pipeline_options.numAttributeValues = 2; /* u, v */ + pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE; + pipeline_options.pipelineLaunchParamsVariableName = "__params"; /* See globals.h */ + + pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; + if (kernel_features & KERNEL_FEATURE_HAIR) { + if (kernel_features & KERNEL_FEATURE_HAIR_THICK) { + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE; + } + else + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM; + } + + /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds + * This is necessary since objects may be reported to have motion if the Vector pass is + * active, but may still need to be rendered without motion blur if that isn't active as well. */ + motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0; + + if (motion_blur) { + pipeline_options.usesMotionBlur = true; + /* Motion blur can insert motion transforms into the traversal graph. + * It is no longer a two-level graph then, so need to set flags to allow any configuration. 
*/ + pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY; + } + + { /* Load and compile PTX module with OptiX kernels. */ + string ptx_data, ptx_filename = path_get((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? + "lib/kernel_optix_shader_raytrace.ptx" : + "lib/kernel_optix.ptx"); + if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { + if (!getenv("OPTIX_ROOT_DIR")) { + set_error( + "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to " + "the Optix SDK to be able to compile Optix kernels on demand)."); + return false; + } + ptx_filename = compile_kernel( + kernel_features, + (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? "kernel_shader_raytrace" : "kernel", + "optix", + true); + } + if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) { + set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str())); + return false; + } + + const OptixResult result = optixModuleCreateFromPTX(context, + &module_options, + &pipeline_options, + ptx_data.data(), + ptx_data.size(), + nullptr, + 0, + &optix_module); + if (result != OPTIX_SUCCESS) { + set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)", + ptx_filename.c_str(), + optixGetErrorName(result))); + return false; + } + } + + /* Create program groups. */ + OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; + OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {}; + OptixProgramGroupOptions group_options = {}; /* There are no options currently. 
*/ + group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_closest"; + group_descs[PG_RGEN_INTERSECT_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_shadow"; + group_descs[PG_RGEN_INTERSECT_SUBSURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_subsurface"; + group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_volume_stack"; + group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS; + group_descs[PG_MISS].miss.module = optix_module; + group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss"; + group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITD].hitgroup.moduleCH = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit"; + group_descs[PG_HITD].hitgroup.moduleAH = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test"; + group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITS].hitgroup.moduleAH = optix_module; + group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit"; + + if (kernel_features & KERNEL_FEATURE_HAIR) { + if (kernel_features & 
KERNEL_FEATURE_HAIR_THICK) { + /* Built-in thick curve intersection. */ + OptixBuiltinISOptions builtin_options = {}; + builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; + builtin_options.usesMotionBlur = false; + + optix_assert(optixBuiltinISModuleGet( + context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0])); + + group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0]; + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr; + group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0]; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr; + + if (motion_blur) { + builtin_options.usesMotionBlur = true; + + optix_assert(optixBuiltinISModuleGet( + context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1])); + + group_descs[PG_HITD_MOTION] = group_descs[PG_HITD]; + group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1]; + group_descs[PG_HITS_MOTION] = group_descs[PG_HITS]; + group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1]; + } + } + else { + /* Custom ribbon intersection. */ + group_descs[PG_HITD].hitgroup.moduleIS = optix_module; + group_descs[PG_HITS].hitgroup.moduleIS = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; + } + } + + if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) { + /* Add hit group for local intersections. */ + group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITL].hitgroup.moduleAH = optix_module; + group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit"; + } + + /* Shader raytracing replaces some functions with direct callables. 
*/ + if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_surface_raytrace"; + group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module; + group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao"; + group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module; + group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC = + "__direct_callable__svm_node_bevel"; + group_descs[PG_CALL_AO_PASS].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_AO_PASS].callables.moduleDC = optix_module; + group_descs[PG_CALL_AO_PASS].callables.entryFunctionNameDC = "__direct_callable__ao_pass"; + } + + optix_assert(optixProgramGroupCreate( + context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); + + /* Get program stack sizes. */ + OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {}; + /* Set up SBT, which in this case is used only to select between different programs. */ + sbt_data.alloc(NUM_PROGRAM_GROUPS); + memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS); + for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); + optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i])); + } + sbt_data.copy_to_device(); /* Upload SBT to device. */ + + /* Calculate maximum trace continuation stack size. */ + unsigned int trace_css = stack_size[PG_HITD].cssCH; + /* This is based on the maximum of closest-hit and any-hit/intersection programs. 
*/ + trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH); + trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH); + trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); + trace_css = std::max(trace_css, + stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH); + trace_css = std::max(trace_css, + stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH); + + OptixPipelineLinkOptions link_options = {}; + link_options.maxTraceDepth = 1; + + if (DebugFlags().optix.use_debug) { + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; + } + else { + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; + } + + if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + /* Create shader raytracing pipeline. */ + vector<OptixProgramGroup> pipeline_groups; + pipeline_groups.reserve(NUM_PROGRAM_GROUPS); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]); + pipeline_groups.push_back(groups[PG_MISS]); + pipeline_groups.push_back(groups[PG_HITD]); + pipeline_groups.push_back(groups[PG_HITS]); + pipeline_groups.push_back(groups[PG_HITL]); + if (motion_blur) { + pipeline_groups.push_back(groups[PG_HITD_MOTION]); + pipeline_groups.push_back(groups[PG_HITS_MOTION]); + } + pipeline_groups.push_back(groups[PG_CALL_SVM_AO]); + pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]); + + optix_assert(optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups.data(), + pipeline_groups.size(), + nullptr, + 0, + &pipelines[PIP_SHADE_RAYTRACE])); + + /* Combine ray generation and trace continuation stack size. */ + const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG + + link_options.maxTraceDepth * trace_css; + const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC, + stack_size[PG_CALL_SVM_BEVEL].dssDC); + + /* Set stack size depending on pipeline options. 
*/ + optix_assert(optixPipelineSetStackSize( + pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2)); + } + + { /* Create intersection-only pipeline. */ + vector<OptixProgramGroup> pipeline_groups; + pipeline_groups.reserve(NUM_PROGRAM_GROUPS); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]); + pipeline_groups.push_back(groups[PG_MISS]); + pipeline_groups.push_back(groups[PG_HITD]); + pipeline_groups.push_back(groups[PG_HITS]); + pipeline_groups.push_back(groups[PG_HITL]); + if (motion_blur) { + pipeline_groups.push_back(groups[PG_HITD_MOTION]); + pipeline_groups.push_back(groups[PG_HITS_MOTION]); + } + + optix_assert(optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups.data(), + pipeline_groups.size(), + nullptr, + 0, + &pipelines[PIP_INTERSECT])); + + /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */ + const unsigned int css = + std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG, + std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG, + std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG, + stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) + + link_options.maxTraceDepth * trace_css; + + optix_assert( + optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2)); + } + + /* Clean up program group objects. */ + for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + optixProgramGroupDestroy(groups[i]); + } + + return true; +} + +/* -------------------------------------------------------------------- + * Buffer denoising. 
+ */ + +class OptiXDevice::DenoiseContext { + public: + explicit DenoiseContext(OptiXDevice *device, const DeviceDenoiseTask &task) + : denoise_params(task.params), + render_buffers(task.render_buffers), + buffer_params(task.buffer_params), + guiding_buffer(device, "denoiser guiding passes buffer"), + num_samples(task.num_samples) + { + num_input_passes = 1; + if (denoise_params.use_pass_albedo) { + num_input_passes += 1; + use_pass_albedo = true; + pass_denoising_albedo = buffer_params.get_pass_offset(PASS_DENOISING_ALBEDO); + if (denoise_params.use_pass_normal) { + num_input_passes += 1; + use_pass_normal = true; + pass_denoising_normal = buffer_params.get_pass_offset(PASS_DENOISING_NORMAL); + } + } + + const int num_guiding_passes = num_input_passes - 1; + + if (num_guiding_passes) { + if (task.allow_inplace_modification) { + guiding_params.device_pointer = render_buffers->buffer.device_pointer; + + guiding_params.pass_albedo = pass_denoising_albedo; + guiding_params.pass_normal = pass_denoising_normal; + + guiding_params.stride = buffer_params.stride; + guiding_params.pass_stride = buffer_params.pass_stride; + } + else { + guiding_params.pass_stride = 0; + if (use_pass_albedo) { + guiding_params.pass_albedo = guiding_params.pass_stride; + guiding_params.pass_stride += 3; + } + if (use_pass_normal) { + guiding_params.pass_normal = guiding_params.pass_stride; + guiding_params.pass_stride += 3; + } + + guiding_params.stride = buffer_params.width; + + guiding_buffer.alloc_to_device(buffer_params.width * buffer_params.height * + guiding_params.pass_stride); + guiding_params.device_pointer = guiding_buffer.device_pointer; + } + } + + pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT); + } + + const DenoiseParams &denoise_params; + + RenderBuffers *render_buffers = nullptr; + const BufferParams &buffer_params; + + /* Device-side storage of the guiding passes. 
*/ + device_only_memory<float> guiding_buffer; + + struct { + device_ptr device_pointer = 0; + + /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */ + int pass_albedo = PASS_UNUSED; + int pass_normal = PASS_UNUSED; + + int stride = -1; + int pass_stride = -1; + } guiding_params; + + /* Number of input passes. Including the color and extra auxiliary passes. */ + int num_input_passes = 0; + bool use_pass_albedo = false; + bool use_pass_normal = false; + + int num_samples = 0; + + int pass_sample_count = PASS_UNUSED; + + /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */ + int pass_denoising_albedo = PASS_UNUSED; + int pass_denoising_normal = PASS_UNUSED; + + /* For passes which don't need albedo channel for denoising we replace the actual albedo with + * the (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with + * the fake values and denoising of passes which do need albedo can no longer happen. */ + bool albedo_replaced_with_fake = false; +}; + +class OptiXDevice::DenoisePass { + public: + DenoisePass(const PassType type, const BufferParams &buffer_params) : type(type) + { + noisy_offset = buffer_params.get_pass_offset(type, PassMode::NOISY); + denoised_offset = buffer_params.get_pass_offset(type, PassMode::DENOISED); + + const PassInfo pass_info = Pass::get_info(type); + num_components = pass_info.num_components; + use_compositing = pass_info.use_compositing; + use_denoising_albedo = pass_info.use_denoising_albedo; + } + + PassType type; + + int noisy_offset; + int denoised_offset; + + int num_components; + bool use_compositing; + bool use_denoising_albedo; +}; + +bool OptiXDevice::denoise_buffer(const DeviceDenoiseTask &task) +{ + const CUDAContextScope scope(this); + + DenoiseContext context(this, task); + + if (!denoise_ensure(context)) { + return false; + } + + if (!denoise_filter_guiding_preprocess(context)) { + LOG(ERROR) << "Error preprocessing guiding passes."; + 
return false; + } + + /* Passes which will use real albedo when it is available. */ + denoise_pass(context, PASS_COMBINED); + denoise_pass(context, PASS_SHADOW_CATCHER_MATTE); + + /* Passes which do not need albedo and hence if real is present it needs to become fake. */ + denoise_pass(context, PASS_SHADOW_CATCHER); + + return true; +} + +DeviceQueue *OptiXDevice::get_denoise_queue() +{ + return &denoiser_.queue; +} + +bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer), + const_cast<int *>(&context.guiding_params.pass_stride), + const_cast<int *>(&context.guiding_params.pass_albedo), + const_cast<int *>(&context.guiding_params.pass_normal), + &context.render_buffers->buffer.device_pointer, + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&buffer_params.pass_stride), + const_cast<int *>(&context.pass_sample_count), + const_cast<int *>(&context.pass_denoising_albedo), + const_cast<int *>(&context.pass_denoising_normal), + const_cast<int *>(&buffer_params.full_x), + const_cast<int *>(&buffer_params.full_y), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.height), + const_cast<int *>(&context.num_samples)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args); +} + +bool OptiXDevice::denoise_filter_guiding_set_fake_albedo(DenoiseContext &context) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer), + const_cast<int *>(&context.guiding_params.pass_stride), + const_cast<int *>(&context.guiding_params.pass_albedo), + const_cast<int 
*>(&buffer_params.width), + const_cast<int *>(&buffer_params.height)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args); +} + +void OptiXDevice::denoise_pass(DenoiseContext &context, PassType pass_type) +{ + const BufferParams &buffer_params = context.buffer_params; + + const DenoisePass pass(pass_type, buffer_params); + + if (pass.noisy_offset == PASS_UNUSED) { + return; + } + if (pass.denoised_offset == PASS_UNUSED) { + LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type); + return; + } + + if (pass.use_denoising_albedo) { + if (context.albedo_replaced_with_fake) { + LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set."; + return; + } + } + else if (!context.albedo_replaced_with_fake) { + context.albedo_replaced_with_fake = true; + if (!denoise_filter_guiding_set_fake_albedo(context)) { + LOG(ERROR) << "Error replacing real albedo with the fake one."; + return; + } + } + + /* Read and preprocess noisy color input pass. */ + denoise_color_read(context, pass); + if (!denoise_filter_color_preprocess(context, pass)) { + LOG(ERROR) << "Error connverting denoising passes to RGB buffer."; + return; + } + + if (!denoise_run(context, pass)) { + LOG(ERROR) << "Error running OptiX denoiser."; + return; + } + + /* Store result in the combined pass of the render buffer. + * + * This will scale the denoiser result up to match the number of, possibly per-pixel, samples. 
*/ + if (!denoise_filter_color_postprocess(context, pass)) { + LOG(ERROR) << "Error copying denoiser result to the denoised pass."; + return; + } + + denoiser_.queue.synchronize(); +} + +void OptiXDevice::denoise_color_read(DenoiseContext &context, const DenoisePass &pass) +{ + PassAccessor::PassAccessInfo pass_access_info; + pass_access_info.type = pass.type; + pass_access_info.mode = PassMode::NOISY; + pass_access_info.offset = pass.noisy_offset; + + /* Denoiser operates on passes which are used to calculate the approximation, and is never used + * on the approximation. The latter is not even possible because OptiX does not support + * denoising of semi-transparent pixels. */ + pass_access_info.use_approximate_shadow_catcher = false; + pass_access_info.use_approximate_shadow_catcher_background = false; + pass_access_info.show_active_pixels = false; + + /* TODO(sergey): Consider adding support of actual exposure, to avoid clamping in extreme cases. + */ + const PassAccessorGPU pass_accessor( + &denoiser_.queue, pass_access_info, 1.0f, context.num_samples); + + PassAccessor::Destination destination(pass_access_info.type); + destination.d_pixels = context.render_buffers->buffer.device_pointer + + pass.denoised_offset * sizeof(float); + destination.num_components = 3; + destination.pixel_stride = context.buffer_params.pass_stride; + + pass_accessor.get_render_tile_pixels(context.render_buffers, context.buffer_params, destination); +} + +bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {&context.render_buffers->buffer.device_pointer, + const_cast<int *>(&buffer_params.full_x), + const_cast<int *>(&buffer_params.full_y), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.height), + const_cast<int *>(&buffer_params.offset), + const_cast<int 
*>(&buffer_params.stride), + const_cast<int *>(&buffer_params.pass_stride), + const_cast<int *>(&pass.denoised_offset)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args); +} + +bool OptiXDevice::denoise_filter_color_postprocess(DenoiseContext &context, + const DenoisePass &pass) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {&context.render_buffers->buffer.device_pointer, + const_cast<int *>(&buffer_params.full_x), + const_cast<int *>(&buffer_params.full_y), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.height), + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&buffer_params.pass_stride), + const_cast<int *>(&context.num_samples), + const_cast<int *>(&pass.noisy_offset), + const_cast<int *>(&pass.denoised_offset), + const_cast<int *>(&context.pass_sample_count), + const_cast<int *>(&pass.num_components), + const_cast<bool *>(&pass.use_compositing)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args); +} + +bool OptiXDevice::denoise_ensure(DenoiseContext &context) +{ + if (!denoise_create_if_needed(context)) { + LOG(ERROR) << "OptiX denoiser creation has failed."; + return false; + } + + if (!denoise_configure_if_needed(context)) { + LOG(ERROR) << "OptiX denoiser configuration has failed."; + return false; + } + + return true; +} + +bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context) +{ + const bool recreate_denoiser = (denoiser_.optix_denoiser == nullptr) || + (denoiser_.use_pass_albedo != context.use_pass_albedo) || + (denoiser_.use_pass_normal != context.use_pass_normal); + if (!recreate_denoiser) { + return true; + } + + /* Destroy existing handle before creating new one. 
*/ + if (denoiser_.optix_denoiser) { + optixDenoiserDestroy(denoiser_.optix_denoiser); + } + + /* Create OptiX denoiser handle on demand when it is first used. */ + OptixDenoiserOptions denoiser_options = {}; + denoiser_options.guideAlbedo = context.use_pass_albedo; + denoiser_options.guideNormal = context.use_pass_normal; + const OptixResult result = optixDenoiserCreate( + this->context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser_.optix_denoiser); + + if (result != OPTIX_SUCCESS) { + set_error("Failed to create OptiX denoiser"); + return false; + } + + /* OptiX denoiser handle was created with the requested number of input passes. */ + denoiser_.use_pass_albedo = context.use_pass_albedo; + denoiser_.use_pass_normal = context.use_pass_normal; + + /* OptiX denoiser has been created, but it needs configuration. */ + denoiser_.is_configured = false; + + return true; +} + +bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context) +{ + if (denoiser_.is_configured && (denoiser_.configured_size.x == context.buffer_params.width && + denoiser_.configured_size.y == context.buffer_params.height)) { + return true; + } + + const BufferParams &buffer_params = context.buffer_params; + + OptixDenoiserSizes sizes = {}; + optix_assert(optixDenoiserComputeMemoryResources( + denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes)); + + denoiser_.scratch_size = sizes.withOverlapScratchSizeInBytes; + denoiser_.scratch_offset = sizes.stateSizeInBytes; + + /* Allocate denoiser state if tile size has changed since last setup. */ + denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size); + + /* Initialize denoiser state for the current tile size. 
*/ + const OptixResult result = optixDenoiserSetup(denoiser_.optix_denoiser, + denoiser_.queue.stream(), + buffer_params.width, + buffer_params.height, + denoiser_.state.device_pointer, + denoiser_.scratch_offset, + denoiser_.state.device_pointer + + denoiser_.scratch_offset, + denoiser_.scratch_size); + if (result != OPTIX_SUCCESS) { + set_error("Failed to set up OptiX denoiser"); + return false; + } + + denoiser_.is_configured = true; + denoiser_.configured_size.x = buffer_params.width; + denoiser_.configured_size.y = buffer_params.height; + + return true; +} + +bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass) +{ + const BufferParams &buffer_params = context.buffer_params; + const int width = buffer_params.width; + const int height = buffer_params.height; + + /* Set up input and output layer information. */ + OptixImage2D color_layer = {0}; + OptixImage2D albedo_layer = {0}; + OptixImage2D normal_layer = {0}; + + OptixImage2D output_layer = {0}; + + /* Color pass. */ + { + const int pass_denoised = pass.denoised_offset; + const int64_t pass_stride_in_bytes = context.buffer_params.pass_stride * sizeof(float); + + color_layer.data = context.render_buffers->buffer.device_pointer + + pass_denoised * sizeof(float); + color_layer.width = width; + color_layer.height = height; + color_layer.rowStrideInBytes = pass_stride_in_bytes * context.buffer_params.stride; + color_layer.pixelStrideInBytes = pass_stride_in_bytes; + color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; + } + + device_vector<float> fake_albedo(this, "fake_albedo", MEM_READ_WRITE); + + /* Optional albedo and color passes. 
*/ + if (context.num_input_passes > 1) { + const device_ptr d_guiding_buffer = context.guiding_params.device_pointer; + const int64_t pixel_stride_in_bytes = context.guiding_params.pass_stride * sizeof(float); + const int64_t row_stride_in_bytes = context.guiding_params.stride * pixel_stride_in_bytes; + + if (context.use_pass_albedo) { + albedo_layer.data = d_guiding_buffer + context.guiding_params.pass_albedo * sizeof(float); + albedo_layer.width = width; + albedo_layer.height = height; + albedo_layer.rowStrideInBytes = row_stride_in_bytes; + albedo_layer.pixelStrideInBytes = pixel_stride_in_bytes; + albedo_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; + } + + if (context.use_pass_normal) { + normal_layer.data = d_guiding_buffer + context.guiding_params.pass_normal * sizeof(float); + normal_layer.width = width; + normal_layer.height = height; + normal_layer.rowStrideInBytes = row_stride_in_bytes; + normal_layer.pixelStrideInBytes = pixel_stride_in_bytes; + normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; + } + } + + /* Denoise in-place of the noisy input in the render buffers. */ + output_layer = color_layer; + + /* Finally run denoising. */ + OptixDenoiserParams params = {}; /* All parameters are disabled/zero. 
*/ + OptixDenoiserLayer image_layers = {}; + image_layers.input = color_layer; + image_layers.output = output_layer; + + OptixDenoiserGuideLayer guide_layers = {}; + guide_layers.albedo = albedo_layer; + guide_layers.normal = normal_layer; + + optix_assert(optixDenoiserInvoke(denoiser_.optix_denoiser, + denoiser_.queue.stream(), + ¶ms, + denoiser_.state.device_pointer, + denoiser_.scratch_offset, + &guide_layers, + &image_layers, + 1, + 0, + 0, + denoiser_.state.device_pointer + denoiser_.scratch_offset, + denoiser_.scratch_size)); + + return true; +} + +bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh, + OptixBuildOperation operation, + const OptixBuildInput &build_input, + uint16_t num_motion_steps) +{ + const CUDAContextScope scope(this); + + const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC); + + /* Compute memory usage. */ + OptixAccelBufferSizes sizes = {}; + OptixAccelBuildOptions options = {}; + options.operation = operation; + if (use_fast_trace_bvh) { + VLOG(2) << "Using fast to trace OptiX BVH"; + options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION; + } + else { + VLOG(2) << "Using fast to update OptiX BVH"; + options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE; + } + + options.motionOptions.numKeys = num_motion_steps; + options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH; + options.motionOptions.timeBegin = 0.0f; + options.motionOptions.timeEnd = 1.0f; + + optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); + + /* Allocate required output buffers. */ + device_only_memory<char> temp_mem(this, "optix temp as build mem"); + temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8); + if (!temp_mem.device_pointer) { + /* Make sure temporary memory allocation succeeded. 
*/ + return false; + } + + device_only_memory<char> &out_data = bvh->as_data; + if (operation == OPTIX_BUILD_OPERATION_BUILD) { + assert(out_data.device == this); + out_data.alloc_to_device(sizes.outputSizeInBytes); + if (!out_data.device_pointer) { + return false; + } + } + else { + assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes); + } + + /* Finally build the acceleration structure. */ + OptixAccelEmitDesc compacted_size_prop = {}; + compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE; + /* A tiny space was allocated for this property at the end of the temporary buffer above. + * Make sure this pointer is 8-byte aligned. */ + compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8); + + OptixTraversableHandle out_handle = 0; + optix_assert(optixAccelBuild(context, + NULL, + &options, + &build_input, + 1, + temp_mem.device_pointer, + sizes.tempSizeInBytes, + out_data.device_pointer, + sizes.outputSizeInBytes, + &out_handle, + use_fast_trace_bvh ? &compacted_size_prop : NULL, + use_fast_trace_bvh ? 1 : 0)); + bvh->traversable_handle = static_cast<uint64_t>(out_handle); + + /* Wait for all operations to finish. */ + cuda_assert(cuStreamSynchronize(NULL)); + + /* Compact acceleration structure to save memory (do not do this in viewport for faster builds). + */ + if (use_fast_trace_bvh) { + uint64_t compacted_size = sizes.outputSizeInBytes; + cuda_assert(cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size))); + + /* Temporary memory is no longer needed, so free it now to make space. */ + temp_mem.free(); + + /* There is no point compacting if the size does not change. */ + if (compacted_size < sizes.outputSizeInBytes) { + device_only_memory<char> compacted_data(this, "optix compacted as"); + compacted_data.alloc_to_device(compacted_size); + if (!compacted_data.device_pointer) + /* Do not compact if memory allocation for compacted acceleration structure fails. 
+ * Can just use the uncompacted one then, so succeed here regardless. */ + return !have_error(); + + optix_assert(optixAccelCompact( + context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle)); + bvh->traversable_handle = static_cast<uint64_t>(out_handle); + + /* Wait for compaction to finish. */ + cuda_assert(cuStreamSynchronize(NULL)); + + std::swap(out_data.device_size, compacted_data.device_size); + std::swap(out_data.device_pointer, compacted_data.device_pointer); + } + } + + return !have_error(); +} + +void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) +{ + const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC); + + free_bvh_memory_delayed(); + + BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); + + progress.set_substatus("Building OptiX acceleration structure"); + + if (!bvh->params.top_level) { + assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1); + + /* Refit is only possible in viewport for now (because AS is built with + * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */ + OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD; + if (refit && !use_fast_trace_bvh) { + assert(bvh_optix->traversable_handle != 0); + operation = OPTIX_BUILD_OPERATION_UPDATE; + } + else { + bvh_optix->as_data.free(); + bvh_optix->traversable_handle = 0; + } + + /* Build bottom level acceleration structures (BLAS). */ + Geometry *const geom = bvh->geometry[0]; + if (geom->geometry_type == Geometry::HAIR) { + /* Build BLAS for curve primitives. 
*/ + Hair *const hair = static_cast<Hair *const>(geom); + if (hair->num_curves() == 0) { + return; + } + + const size_t num_segments = hair->num_segments(); + + size_t num_motion_steps = 1; + Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if (motion_blur && hair->get_use_motion_blur() && motion_keys) { + num_motion_steps = hair->get_motion_steps(); + } + + device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY); + device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); + device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); + /* Four control points for each curve segment. */ + const size_t num_vertices = num_segments * 4; + if (hair->curve_shape == CURVE_THICK) { + index_data.alloc(num_segments); + vertex_data.alloc(num_vertices * num_motion_steps); + } + else + aabb_data.alloc(num_segments * num_motion_steps); + + /* Get AABBs for each motion step. */ + for (size_t step = 0; step < num_motion_steps; ++step) { + /* The center step for motion vertices is not stored in the attribute. */ + const float3 *keys = hair->get_curve_keys().data(); + size_t center_step = (num_motion_steps - 1) / 2; + if (step != center_step) { + size_t attr_offset = (step > center_step) ? step - 1 : step; + /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). 
*/ + keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size(); + } + + for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) { + const Hair::Curve curve = hair->get_curve(j); + const array<float> &curve_radius = hair->get_curve_radius(); + + for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) { + if (hair->curve_shape == CURVE_THICK) { + int k0 = curve.first_key + segment; + int k1 = k0 + 1; + int ka = max(k0 - 1, curve.first_key); + int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1); + + const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x); + const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y); + const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z); + const float4 pw = make_float4( + curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]); + + /* Convert Catmull-Rom data to Bezier spline. */ + static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f; + static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f; + static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f; + static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f; + + index_data[i] = i * 4; + float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; + v[0] = make_float4( + dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw)); + v[1] = make_float4( + dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw)); + v[2] = make_float4( + dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw)); + v[3] = make_float4( + dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw)); + } + else { + BoundBox bounds = BoundBox::empty; + curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds); + + const size_t index = step * num_segments + i; + aabb_data[index].minX = bounds.min.x; + aabb_data[index].minY = bounds.min.y; + aabb_data[index].minZ = 
bounds.min.z; + aabb_data[index].maxX = bounds.max.x; + aabb_data[index].maxY = bounds.max.y; + aabb_data[index].maxZ = bounds.max.z; + } + } + } + } + + /* Upload AABB data to GPU. */ + aabb_data.copy_to_device(); + index_data.copy_to_device(); + vertex_data.copy_to_device(); + + vector<device_ptr> aabb_ptrs; + aabb_ptrs.reserve(num_motion_steps); + vector<device_ptr> width_ptrs; + vector<device_ptr> vertex_ptrs; + width_ptrs.reserve(num_motion_steps); + vertex_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb)); + const device_ptr base_ptr = vertex_data.device_pointer + + step * num_vertices * sizeof(float4); + width_ptrs.push_back(base_ptr + 3 * sizeof(float)); /* Offset by vertex size. */ + vertex_ptrs.push_back(base_ptr); + } + + /* Force a single any-hit call, so shadow record-all behavior works correctly. */ + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; + OptixBuildInput build_input = {}; + if (hair->curve_shape == CURVE_THICK) { + build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES; + build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; + build_input.curveArray.numPrimitives = num_segments; + build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); + build_input.curveArray.numVertices = num_vertices; + build_input.curveArray.vertexStrideInBytes = sizeof(float4); + build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data(); + build_input.curveArray.widthStrideInBytes = sizeof(float4); + build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer; + build_input.curveArray.indexStrideInBytes = sizeof(int); + build_input.curveArray.flag = build_flags; + build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset; + } + else { + /* Disable visibility test any-hit program, since it is already checked during + * intersection. 
Those trace calls that require anyhit can force it with a ray flag. */ + build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; + + build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; + build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); + build_input.customPrimitiveArray.numPrimitives = num_segments; + build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); + build_input.customPrimitiveArray.flags = &build_flags; + build_input.customPrimitiveArray.numSbtRecords = 1; + build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset; + } + + if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { + progress.set_error("Failed to build OptiX acceleration structure"); + } + } + else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) { + /* Build BLAS for triangle primitives. */ + Mesh *const mesh = static_cast<Mesh *const>(geom); + if (mesh->num_triangles() == 0) { + return; + } + + const size_t num_verts = mesh->get_verts().size(); + + size_t num_motion_steps = 1; + Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if (motion_blur && mesh->get_use_motion_blur() && motion_keys) { + num_motion_steps = mesh->get_motion_steps(); + } + + device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); + index_data.alloc(mesh->get_triangles().size()); + memcpy(index_data.data(), + mesh->get_triangles().data(), + mesh->get_triangles().size() * sizeof(int)); + device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); + vertex_data.alloc(num_verts * num_motion_steps); + + for (size_t step = 0; step < num_motion_steps; ++step) { + const float3 *verts = mesh->get_verts().data(); + + size_t center_step = (num_motion_steps - 1) / 2; + /* The center step for motion vertices is not stored in the attribute. */ + if (step != center_step) { + verts = motion_keys->data_float3() + (step > center_step ? 
step - 1 : step) * num_verts; + } + + memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3)); + } + + /* Upload triangle data to GPU. */ + index_data.copy_to_device(); + vertex_data.copy_to_device(); + + vector<device_ptr> vertex_ptrs; + vertex_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3)); + } + + /* Force a single any-hit call, so shadow record-all behavior works correctly. */ + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES; + build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); + build_input.triangleArray.numVertices = num_verts; + build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3; + build_input.triangleArray.vertexStrideInBytes = sizeof(float4); + build_input.triangleArray.indexBuffer = index_data.device_pointer; + build_input.triangleArray.numIndexTriplets = mesh->num_triangles(); + build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; + build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int); + build_input.triangleArray.flags = &build_flags; + /* The SBT does not store per primitive data since Cycles already allocates separate + * buffers for that purpose. OptiX does not allow this to be zero though, so just pass in + * one and rely on that having the same meaning in this case. 
*/ + build_input.triangleArray.numSbtRecords = 1; + build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset; + + if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { + progress.set_error("Failed to build OptiX acceleration structure"); + } + } + } + else { + unsigned int num_instances = 0; + unsigned int max_num_instances = 0xFFFFFFFF; + + bvh_optix->as_data.free(); + bvh_optix->traversable_handle = 0; + bvh_optix->motion_transform_data.free(); + + optixDeviceContextGetProperty(context, + OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID, + &max_num_instances, + sizeof(max_num_instances)); + /* Do not count first bit, which is used to distinguish instanced and non-instanced objects. */ + max_num_instances >>= 1; + if (bvh->objects.size() > max_num_instances) { + progress.set_error( + "Failed to build OptiX acceleration structure because there are too many instances"); + return; + } + + /* Fill instance descriptions. */ + device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY); + instances.alloc(bvh->objects.size()); + + /* Calculate total motion transform size and allocate memory for them. */ + size_t motion_transform_offset = 0; + if (motion_blur) { + size_t total_motion_transform_size = 0; + for (Object *const ob : bvh->objects) { + if (ob->is_traceable() && ob->use_motion()) { + total_motion_transform_size = align_up(total_motion_transform_size, + OPTIX_TRANSFORM_BYTE_ALIGNMENT); + const size_t motion_keys = max(ob->get_motion().size(), 2) - 2; + total_motion_transform_size = total_motion_transform_size + + sizeof(OptixSRTMotionTransform) + + motion_keys * sizeof(OptixSRTData); + } + } + + assert(bvh_optix->motion_transform_data.device == this); + bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size); + } + + for (Object *ob : bvh->objects) { + /* Skip non-traceable objects. 
*/ + if (!ob->is_traceable()) { + continue; + } + + BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh); + OptixTraversableHandle handle = blas->traversable_handle; + + OptixInstance &instance = instances[num_instances++]; + memset(&instance, 0, sizeof(instance)); + + /* Clear transform to identity matrix. */ + instance.transform[0] = 1.0f; + instance.transform[5] = 1.0f; + instance.transform[10] = 1.0f; + + /* Set user instance ID to object index (but leave low bit blank). */ + instance.instanceId = ob->get_device_index() << 1; + + /* Have to have at least one bit in the mask, or else instance would always be culled. */ + instance.visibilityMask = 1; + + if (ob->get_geometry()->has_volume) { + /* Volumes have a special bit set in the visibility mask so a trace can mask only volumes. + */ + instance.visibilityMask |= 2; + } + + if (ob->get_geometry()->geometry_type == Geometry::HAIR) { + /* Same applies to curves (so they can be skipped in local trace calls). */ + instance.visibilityMask |= 4; + + if (motion_blur && ob->get_geometry()->has_motion_blur() && + static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { + /* Select between motion blur and non-motion blur built-in intersection module. */ + instance.sbtOffset = PG_HITD_MOTION - PG_HITD; + } + } + + /* Insert motion traversable if object has motion. */ + if (motion_blur && ob->use_motion()) { + size_t motion_keys = max(ob->get_motion().size(), 2) - 2; + size_t motion_transform_size = sizeof(OptixSRTMotionTransform) + + motion_keys * sizeof(OptixSRTData); + + const CUDAContextScope scope(this); + + motion_transform_offset = align_up(motion_transform_offset, + OPTIX_TRANSFORM_BYTE_ALIGNMENT); + CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer + + motion_transform_offset; + motion_transform_offset += motion_transform_size; + + /* Allocate host side memory for motion transform and fill it with transform data. 
*/ + OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>( + new uint8_t[motion_transform_size]); + motion_transform.child = handle; + motion_transform.motionOptions.numKeys = ob->get_motion().size(); + motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE; + motion_transform.motionOptions.timeBegin = 0.0f; + motion_transform.motionOptions.timeEnd = 1.0f; + + OptixSRTData *const srt_data = motion_transform.srtData; + array<DecomposedTransform> decomp(ob->get_motion().size()); + transform_motion_decompose( + decomp.data(), ob->get_motion().data(), ob->get_motion().size()); + + for (size_t i = 0; i < ob->get_motion().size(); ++i) { + /* Scale. */ + srt_data[i].sx = decomp[i].y.w; /* scale.x.x */ + srt_data[i].sy = decomp[i].z.w; /* scale.y.y */ + srt_data[i].sz = decomp[i].w.w; /* scale.z.z */ + + /* Shear. */ + srt_data[i].a = decomp[i].z.x; /* scale.x.y */ + srt_data[i].b = decomp[i].z.y; /* scale.x.z */ + srt_data[i].c = decomp[i].w.x; /* scale.y.z */ + assert(decomp[i].z.z == 0.0f); /* scale.y.x */ + assert(decomp[i].w.y == 0.0f); /* scale.z.x */ + assert(decomp[i].w.z == 0.0f); /* scale.z.y */ + + /* Pivot point. */ + srt_data[i].pvx = 0.0f; + srt_data[i].pvy = 0.0f; + srt_data[i].pvz = 0.0f; + + /* Rotation. */ + srt_data[i].qx = decomp[i].x.x; + srt_data[i].qy = decomp[i].x.y; + srt_data[i].qz = decomp[i].x.z; + srt_data[i].qw = decomp[i].x.w; + + /* Translation. */ + srt_data[i].tx = decomp[i].y.x; + srt_data[i].ty = decomp[i].y.y; + srt_data[i].tz = decomp[i].y.z; + } + + /* Upload motion transform to GPU. */ + cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size); + delete[] reinterpret_cast<uint8_t *>(&motion_transform); + + /* Disable instance transform if object uses motion transform already. */ + instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; + + /* Get traversable handle to motion transform. 
*/ + optixConvertPointerToTraversableHandle(context, + motion_transform_gpu, + OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM, + &instance.traversableHandle); + } + else { + instance.traversableHandle = handle; + + if (ob->get_geometry()->is_instanced()) { + /* Set transform matrix. */ + memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform)); + } + else { + /* Disable instance transform if geometry already has it applied to vertex data. */ + instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; + /* Non-instanced objects read ID from 'prim_object', so distinguish + * them from instanced objects with the low bit set. */ + instance.instanceId |= 1; + } + } + } + + /* Upload instance descriptions. */ + instances.resize(num_instances); + instances.copy_to_device(); + + /* Build top-level acceleration structure (TLAS) */ + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES; + build_input.instanceArray.instances = instances.device_pointer; + build_input.instanceArray.numInstances = num_instances; + + if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) { + progress.set_error("Failed to build OptiX acceleration structure"); + } + tlas_handle = bvh_optix->traversable_handle; + } +} + +void OptiXDevice::release_optix_bvh(BVH *bvh) +{ + thread_scoped_lock lock(delayed_free_bvh_mutex); + /* Do delayed free of BVH memory, since geometry holding BVH might be deleted + * while GPU is still rendering. 
*/ + BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); + + delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data)); + delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data)); + bvh_optix->traversable_handle = 0; +} + +void OptiXDevice::free_bvh_memory_delayed() +{ + thread_scoped_lock lock(delayed_free_bvh_mutex); + delayed_free_bvh_memory.free_memory(); +} + +void OptiXDevice::const_copy_to(const char *name, void *host, size_t size) +{ + /* Set constant memory for CUDA module. */ + CUDADevice::const_copy_to(name, host, size); + + if (strcmp(name, "__data") == 0) { + assert(size <= sizeof(KernelData)); + + /* Update traversable handle (since it is different for each device on multi devices). */ + KernelData *const data = (KernelData *)host; + *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; + + update_launch_params(offsetof(KernelParamsOptiX, data), host, size); + return; + } + + /* Update data storage pointers in launch parameters. */ +# define KERNEL_TEX(data_type, tex_name) \ + if (strcmp(name, #tex_name) == 0) { \ + update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \ + return; \ + } + KERNEL_TEX(IntegratorStateGPU, __integrator_state) +# include "kernel/kernel_textures.h" +# undef KERNEL_TEX +} + +void OptiXDevice::update_launch_params(size_t offset, void *data, size_t data_size) +{ + const CUDAContextScope scope(this); + + cuda_assert(cuMemcpyHtoD(launch_params.device_pointer + offset, data, data_size)); +} + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h new file mode 100644 index 00000000000..91ef52e0a5a --- /dev/null +++ b/intern/cycles/device/optix/device_impl.h @@ -0,0 +1,186 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_OPTIX + +# include "device/cuda/device_impl.h" +# include "device/optix/queue.h" +# include "device/optix/util.h" +# include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +class BVHOptiX; +struct KernelParamsOptiX; + +/* List of OptiX program groups. */ +enum { + PG_RGEN_INTERSECT_CLOSEST, + PG_RGEN_INTERSECT_SHADOW, + PG_RGEN_INTERSECT_SUBSURFACE, + PG_RGEN_INTERSECT_VOLUME_STACK, + PG_RGEN_SHADE_SURFACE_RAYTRACE, + PG_MISS, + PG_HITD, /* Default hit group. */ + PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */ + PG_HITL, /* __BVH_LOCAL__ hit group (only used for triangles). */ + PG_HITD_MOTION, + PG_HITS_MOTION, + PG_CALL_SVM_AO, + PG_CALL_SVM_BEVEL, + PG_CALL_AO_PASS, + NUM_PROGRAM_GROUPS +}; + +static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS; +static const int NUM_MIS_PROGRAM_GROUPS = 1; +static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD; +static const int NUM_HIT_PROGRAM_GROUPS = 5; +static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO; +static const int NUM_CALLABLE_PROGRAM_GROUPS = 3; + +/* List of OptiX pipelines. */ +enum { PIP_SHADE_RAYTRACE, PIP_INTERSECT, NUM_PIPELINES }; + +/* A single shader binding table entry. 
*/ +struct SbtRecord { + char header[OPTIX_SBT_RECORD_HEADER_SIZE]; +}; + +class OptiXDevice : public CUDADevice { + public: + OptixDeviceContext context = NULL; + + OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */ + OptixModule builtin_modules[2] = {}; + OptixPipeline pipelines[NUM_PIPELINES] = {}; + + bool motion_blur = false; + device_vector<SbtRecord> sbt_data; + device_only_memory<KernelParamsOptiX> launch_params; + OptixTraversableHandle tlas_handle = 0; + + vector<device_only_memory<char>> delayed_free_bvh_memory; + thread_mutex delayed_free_bvh_mutex; + + class Denoiser { + public: + explicit Denoiser(OptiXDevice *device); + ~Denoiser(); + + OptiXDevice *device; + OptiXDeviceQueue queue; + + OptixDenoiser optix_denoiser = nullptr; + + /* Configuration size, as provided to `optixDenoiserSetup`. + * If the `optixDenoiserSetup()` was never used on the current `optix_denoiser` the + * `is_configured` will be false. */ + bool is_configured = false; + int2 configured_size = make_int2(0, 0); + + /* OptiX denoiser state and scratch buffers, stored in a single memory buffer. + * The memory layout goes as following: [denoiser state][scratch buffer]. 
 */ + device_only_memory<unsigned char> state; + size_t scratch_offset = 0; + size_t scratch_size = 0; + + bool use_pass_albedo = false; + bool use_pass_normal = false; + }; + Denoiser denoiser_; + + public: + OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); + ~OptiXDevice(); + + private: + BVHLayoutMask get_bvh_layout_mask() const override; + + string compile_kernel_get_common_cflags(const uint kernel_features) override; + + bool load_kernels(const uint kernel_features) override; + + bool build_optix_bvh(BVHOptiX *bvh, + OptixBuildOperation operation, + const OptixBuildInput &build_input, + uint16_t num_motion_steps); + + void build_bvh(BVH *bvh, Progress &progress, bool refit) override; + + void release_optix_bvh(BVH *bvh) override; + void free_bvh_memory_delayed(); + + void const_copy_to(const char *name, void *host, size_t size) override; + + void update_launch_params(size_t offset, void *data, size_t data_size); + + virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + + /* -------------------------------------------------------------------- + * Denoising. + */ + + class DenoiseContext; + class DenoisePass; + + virtual bool denoise_buffer(const DeviceDenoiseTask &task) override; + virtual DeviceQueue *get_denoise_queue() override; + + /* Read guiding passes from the render buffers, preprocess them in a way which is expected by + * OptiX and store in the guiding passes memory within the given context. + * + * Pre-processing of the guiding passes is to only happen once per context lifetime. Do not + * preprocess them for every pass which is being denoised. */ + bool denoise_filter_guiding_preprocess(DenoiseContext &context); + + /* Set fake albedo pixels in the albedo guiding pass storage. + * After this point only passes which do not need albedo for denoising can be processed. 
*/ + bool denoise_filter_guiding_set_fake_albedo(DenoiseContext &context); + + void denoise_pass(DenoiseContext &context, PassType pass_type); + + /* Read input color pass from the render buffer into the memory which corresponds to the noisy + * input within the given context. Pixels are scaled to the number of samples, but are not + * preprocessed yet. */ + void denoise_color_read(DenoiseContext &context, const DenoisePass &pass); + + /* Run corresponding filter kernels, preparing data for the denoiser or copying data from the + * denoiser result to the render buffer. */ + bool denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass); + bool denoise_filter_color_postprocess(DenoiseContext &context, const DenoisePass &pass); + + /* Make sure the OptiX denoiser is created and configured. */ + bool denoise_ensure(DenoiseContext &context); + + /* Create OptiX denoiser descriptor if needed. + * Will do nothing if the current OptiX descriptor is usable for the given parameters. + * If the OptiX denoiser descriptor did re-allocate here it is left unconfigured. */ + bool denoise_create_if_needed(DenoiseContext &context); + + /* Configure existing OptiX denoiser descriptor for the use for the given task. */ + bool denoise_configure_if_needed(DenoiseContext &context); + + /* Run configured denoiser. */ + bool denoise_run(DenoiseContext &context, const DenoisePass &pass); +}; + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp new file mode 100644 index 00000000000..458ed70baa8 --- /dev/null +++ b/intern/cycles/device/optix/queue.cpp @@ -0,0 +1,144 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_OPTIX + +# include "device/optix/queue.h" +# include "device/optix/device_impl.h" + +# include "util/util_time.h" + +# undef __KERNEL_CPU__ +# define __KERNEL_OPTIX__ +# include "kernel/device/optix/globals.h" + +CCL_NAMESPACE_BEGIN + +/* OptiXDeviceQueue */ + +OptiXDeviceQueue::OptiXDeviceQueue(OptiXDevice *device) : CUDADeviceQueue(device) +{ +} + +void OptiXDeviceQueue::init_execution() +{ + CUDADeviceQueue::init_execution(); +} + +static bool is_optix_specific_kernel(DeviceKernel kernel) +{ + return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK); +} + +bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[]) +{ + if (!is_optix_specific_kernel(kernel)) { + return CUDADeviceQueue::enqueue(kernel, work_size, args); + } + + if (cuda_device_->have_error()) { + return false; + } + + debug_enqueue(kernel, work_size); + + const CUDAContextScope scope(cuda_device_); + + OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_); + + const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer; + const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer; + + cuda_device_assert( + cuda_device_, + cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, path_index_array), + args[0], // &d_path_index + 
sizeof(device_ptr), + cuda_stream_)); + + if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + cuda_device_assert( + cuda_device_, + cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer), + args[1], // &d_render_buffer + sizeof(device_ptr), + cuda_stream_)); + } + + cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_)); + + OptixPipeline pipeline = nullptr; + OptixShaderBindingTable sbt_params = {}; + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SHADOW * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SUBSURFACE * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord); + break; + + default: + LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel) + << " is attempted to be enqueued."; + return false; + } + + sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord); + sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS; + sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord); + 
 sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS; + sbt_params.callablesRecordBase = sbt_data_ptr + CALLABLE_PROGRAM_GROUPS_BASE * sizeof(SbtRecord); + sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS; + sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); + + /* Launch the ray generation program. */ + optix_device_assert(optix_device, + optixLaunch(pipeline, + cuda_stream_, + launch_params_ptr, + optix_device->launch_params.data_elements, + &sbt_params, + work_size, + 1, + 1)); + + return !(optix_device->have_error()); +} + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/queue.h b/intern/cycles/device/optix/queue.h new file mode 100644 index 00000000000..0de422ccc71 --- /dev/null +++ b/intern/cycles/device/optix/queue.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_OPTIX + +# include "device/cuda/queue.h" + +CCL_NAMESPACE_BEGIN + +class OptiXDevice; + +/* OptiX implementation of the device queue, based on the CUDA queue. 
*/ +class OptiXDeviceQueue : public CUDADeviceQueue { + public: + OptiXDeviceQueue(OptiXDevice *device); + + virtual void init_execution() override; + + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/util.h b/intern/cycles/device/optix/util.h new file mode 100644 index 00000000000..34ae5bb5609 --- /dev/null +++ b/intern/cycles/device/optix/util.h @@ -0,0 +1,45 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_OPTIX + +# include "device/cuda/util.h" + +# ifdef WITH_CUDA_DYNLOAD +# include <cuew.h> +// Do not use CUDA SDK headers when using CUEW +# define OPTIX_DONT_INCLUDE_CUDA +# endif + +# include <optix_stubs.h> + +/* Utility for checking return values of OptiX function calls. */ +# define optix_device_assert(optix_device, stmt) \ + { \ + OptixResult result = stmt; \ + if (result != OPTIX_SUCCESS) { \ + const char *name = optixGetErrorName(result); \ + optix_device->set_error( \ + string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \ + } \ + } \ + (void)0 + +# define optix_assert(stmt) optix_device_assert(this, stmt) + +#endif /* WITH_OPTIX */ |