diff options
Diffstat (limited to 'intern/cycles/device/cpu/device_impl.cpp')
-rw-r--r-- | intern/cycles/device/cpu/device_impl.cpp | 481 |
1 files changed, 481 insertions, 0 deletions
diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp new file mode 100644 index 00000000000..3b0db6bdd0e --- /dev/null +++ b/intern/cycles/device/cpu/device_impl.cpp @@ -0,0 +1,481 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/cpu/device_impl.h" + +#include <stdlib.h> +#include <string.h> + +/* So ImathMath is included before our kernel_cpu_compat. */ +#ifdef WITH_OSL +/* So no context pollution happens from indirectly included windows.h */ +# include "util/util_windows.h" +# include <OSL/oslexec.h> +#endif + +#ifdef WITH_EMBREE +# include <embree3/rtcore.h> +#endif + +#include "device/cpu/kernel.h" +#include "device/cpu/kernel_thread_globals.h" + +#include "device/device.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" +#include "kernel/device/cpu/kernel.h" +#include "kernel/kernel_types.h" + +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" +// clang-format on + +#include "bvh/bvh_embree.h" + +#include "render/buffers.h" + +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_opengl.h" +#include "util/util_openimagedenoise.h" +#include "util/util_optimization.h" +#include "util/util_progress.h" +#include "util/util_system.h" +#include "util/util_task.h" +#include "util/util_thread.h" + +CCL_NAMESPACE_BEGIN + +CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) + : Device(info_, stats_, profiler_), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + /* Pick any kernel, all of them are supposed to have same level of microarchitecture + * optimization. */ + VLOG(1) << "Will be using " << kernels.integrator_init_from_camera.get_uarch_name() + << " kernels."; + + if (info.cpu_threads == 0) { + info.cpu_threads = TaskScheduler::num_threads(); + } + +#ifdef WITH_OSL + kernel_globals.osl = &osl_globals; +#endif +#ifdef WITH_EMBREE + embree_device = rtcNewDevice("verbose=0"); +#endif + need_texture_info = false; +} + +CPUDevice::~CPUDevice() +{ +#ifdef WITH_EMBREE + rtcReleaseDevice(embree_device); +#endif + + texture_info.free(); +} + +bool CPUDevice::show_samples() const +{ + return (info.cpu_threads == 1); +} + +BVHLayoutMask CPUDevice::get_bvh_layout_mask() const +{ + BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; +#ifdef WITH_EMBREE + bvh_layout_mask |= BVH_LAYOUT_EMBREE; +#endif /* WITH_EMBREE */ + return bvh_layout_mask; +} + +bool CPUDevice::load_texture_info() +{ + if (!need_texture_info) { + return false; + } + + texture_info.copy_to_device(); + need_texture_info = false; + + return true; +} + +void CPUDevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + if (mem.type == MEM_DEVICE_ONLY) { + assert(!mem.host_pointer); + size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; + void *data = util_aligned_malloc(mem.memory_size(), alignment); + mem.device_pointer = (device_ptr)data; + } + else { + mem.device_pointer = (device_ptr)mem.host_pointer; + } + + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + } +} + +void CPUDevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + /* copy is no-op */ + } +} + +void CPUDevice::mem_copy_from( + device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) +{ + /* no-op */ +} + +void CPUDevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + + if (mem.device_pointer) { + memset((void *)mem.device_pointer, 0, mem.memory_size()); + } +} + +void CPUDevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else if (mem.device_pointer) { + if (mem.type == MEM_DEVICE_ONLY) { + util_aligned_free((void *)mem.device_pointer); + } + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } +} + +device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) +{ + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); +} + +void CPUDevice::const_copy_to(const char *name, void *host, size_t size) +{ +#if WITH_EMBREE + if (strcmp(name, "__data") == 0) { + assert(size <= sizeof(KernelData)); + + // Update scene handle (since it is different for each device on multi devices) + KernelData *const data = (KernelData *)host; + data->bvh.scene = embree_scene; + } +#endif + kernel_const_copy(&kernel_globals, name, host, size); +} + +void CPUDevice::global_alloc(device_memory &mem) +{ + VLOG(1) << "Global memory allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size); + + mem.device_pointer = (device_ptr)mem.host_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); +} + +void CPUDevice::global_free(device_memory &mem) +{ + if (mem.device_pointer) { + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } +} + +void CPUDevice::tex_alloc(device_texture &mem) +{ + VLOG(1) << "Texture allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + mem.device_pointer = (device_ptr)mem.host_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount of re-allocations. */ + texture_info.resize(slot + 128); + } + + texture_info[slot] = mem.info; + texture_info[slot].data = (uint64_t)mem.host_pointer; + need_texture_info = true; +} + +void CPUDevice::tex_free(device_texture &mem) +{ + if (mem.device_pointer) { + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + need_texture_info = true; + } +} + +void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) +{ +#ifdef WITH_EMBREE + if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE || + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) { + BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh); + if (refit) { + bvh_embree->refit(progress); + } + else { + bvh_embree->build(progress, &stats, embree_device); + } + + if (bvh->params.top_level) { + embree_scene = bvh_embree->scene; + } + } + else +#endif + Device::build_bvh(bvh, progress, refit); +} + +#if 0 +void CPUDevice::render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) +{ + const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; + + scoped_timer timer(&tile.buffers->render_time); + + Coverage coverage(kg, tile); + if (use_coverage) { + coverage.init_path_trace(); + } + + float *render_buffer = (float *)tile.buffer; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + + /* Needed for Embree. */ + SIMD_SET_FLUSH_TO_ZERO; + + for (int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || TaskPool::canceled()) { + if (task.need_finish_queue == false) + break; + } + + if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) { + tile.stealing_state = RenderTile::WAS_STOLEN; + break; + } + + if (tile.task == RenderTile::PATH_TRACE) { + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + if (use_coverage) { + coverage.init_pixel(x, y); + } + kernels.path_trace(kg, render_buffer, sample, x, y, tile.offset, tile.stride); + } + } + } + else { + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + kernels.bake(kg, render_buffer, sample, x, y, tile.offset, tile.stride); + } + } + } + tile.sample = sample + 1; + + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) { + const bool stop = adaptive_sampling_filter(kg, tile, sample); + if (stop) { + const int num_progress_samples = end_sample - sample; + tile.sample = end_sample; + task.update_progress(&tile, tile.w * tile.h * num_progress_samples); + break; + } + } + + task.update_progress(&tile, tile.w * tile.h); + } + if (use_coverage) { + coverage.finalize(); + } + + if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) { + adaptive_sampling_post(tile, kg); + } +} + +void CPUDevice::thread_render(DeviceTask &task) +{ + if (TaskPool::canceled()) { + if (task.need_finish_queue == false) + return; + } + + /* allocate buffer for kernel globals */ + CPUKernelThreadGlobals kg(kernel_globals, get_cpu_osl_memory()); + + profiler.add_state(&kg.profiler); + + /* NLM denoiser. */ + DenoisingTask *denoising = NULL; + + /* OpenImageDenoise: we can only denoise with one thread at a time, so to + * avoid waiting with mutex locks in the denoiser, we let only a single + * thread acquire denoising tiles. */ + uint tile_types = task.tile_types; + bool hold_denoise_lock = false; + if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + if (!oidn_task_lock.try_lock()) { + tile_types &= ~RenderTile::DENOISE; + hold_denoise_lock = true; + } + } + + RenderTile tile; + while (task.acquire_tile(this, tile, tile_types)) { + if (tile.task == RenderTile::PATH_TRACE) { + render(task, tile, &kg); + } + else if (tile.task == RenderTile::BAKE) { + render(task, tile, &kg); + } + else if (tile.task == RenderTile::DENOISE) { + denoise_openimagedenoise(task, tile); + task.update_progress(&tile, tile.w * tile.h); + } + + task.release_tile(tile); + + if (TaskPool::canceled()) { + if (task.need_finish_queue == false) + break; + } + } + + if (hold_denoise_lock) { + oidn_task_lock.unlock(); + } + + profiler.remove_state(&kg.profiler); + + delete denoising; +} + +void CPUDevice::thread_denoise(DeviceTask &task) +{ + RenderTile tile; + tile.x = task.x; + tile.y = task.y; + tile.w = task.w; + tile.h = task.h; + tile.buffer = task.buffer; + tile.sample = task.sample + task.num_samples; + tile.num_samples = task.num_samples; + tile.start_sample = task.sample; + tile.offset = task.offset; + tile.stride = task.stride; + tile.buffers = task.buffers; + + denoise_openimagedenoise(task, tile); + + task.update_progress(&tile, tile.w * tile.h); +} +#endif + +const CPUKernels *CPUDevice::get_cpu_kernels() const +{ + return &kernels; +} + +void CPUDevice::get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> &kernel_thread_globals) +{ + /* Ensure latest texture info is loaded into kernel globals before returning. */ + load_texture_info(); + + kernel_thread_globals.clear(); + void *osl_memory = get_cpu_osl_memory(); + for (int i = 0; i < info.cpu_threads; i++) { + kernel_thread_globals.emplace_back(kernel_globals, osl_memory, profiler); + } +} + +void *CPUDevice::get_cpu_osl_memory() +{ +#ifdef WITH_OSL + return &osl_globals; +#else + return NULL; +#endif +} + +bool CPUDevice::load_kernels(const uint /*kernel_features*/) +{ + return true; +} + +CCL_NAMESPACE_END |