diff options
Diffstat (limited to 'intern/cycles/device/device_cpu.cpp')
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 846 |
1 files changed, 460 insertions, 386 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 84cce605182..1ecce8bd565 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -25,6 +25,7 @@ #endif #include "device/device.h" +#include "device/device_denoising.h" #include "device/device_intern.h" #include "device/device_split_kernel.h" @@ -34,6 +35,8 @@ #include "kernel/split/kernel_split_data.h" #include "kernel/kernel_globals.h" +#include "kernel/filter/filter.h" + #include "kernel/osl/osl_shader.h" #include "kernel/osl/osl_globals.h" @@ -53,91 +56,107 @@ CCL_NAMESPACE_BEGIN class CPUDevice; -class CPUSplitKernel : public DeviceSplitKernel { - CPUDevice *device; -public: - explicit CPUSplitKernel(CPUDevice *device); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data_, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs); +/* Has to be outside of the class to be shared across template instantiations. */ +static const char *logged_architecture = ""; - virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); -}; - -class CPUDevice : public Device -{ - static unordered_map<string, void*> kernel_functions; - - static void register_kernel_function(const char* name, void* func) +template<typename F> +class KernelFunctions { +public: + KernelFunctions() { - kernel_functions[name] = func; + kernel = (F)NULL; } - static const char* get_arch_name() + KernelFunctions(F kernel_default, + F kernel_sse2, + F kernel_sse3, + F kernel_sse41, + F kernel_avx, + F kernel_avx2) { + const char *architecture_name = "default"; + kernel = kernel_default; + + /* Silence potential warnings about unused variables + * when compiling without some architectures. */ + (void)kernel_sse2; + (void)kernel_sse3; + (void)kernel_sse41; + (void)kernel_avx; + (void)kernel_avx2; #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 if(system_cpu_support_avx2()) { - return "cpu_avx2"; + architecture_name = "AVX2"; + kernel = kernel_avx2; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { - return "cpu_avx"; + architecture_name = "AVX"; + kernel = kernel_avx; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { - return "cpu_sse41"; + architecture_name = "SSE4.1"; + kernel = kernel_sse41; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 if(system_cpu_support_sse3()) { - return "cpu_sse3"; + architecture_name = "SSE3"; + kernel = kernel_sse3; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 if(system_cpu_support_sse2()) { - return "cpu_sse2"; + architecture_name = "SSE2"; + kernel = kernel_sse2; } - else #endif - { - return "cpu"; + + if(strstr(architecture_name, logged_architecture) != 0) { + VLOG(1) << "Will be using " << architecture_name << " kernels."; + logged_architecture = architecture_name; } } - template<typename F> - static F get_kernel_function(string name) - { - name = string("kernel_") + get_arch_name() + "_" + name; - - unordered_map<string, void*>::iterator it = kernel_functions.find(name); + inline F operator()() const { + assert(kernel); + return kernel; + } +protected: + F kernel; +}; - if(it == kernel_functions.end()) { - assert(!"kernel function not found"); - return NULL; - } +class CPUSplitKernel : public DeviceSplitKernel { + CPUDevice *device; +public: + explicit CPUSplitKernel(CPUDevice *device); - return (F)it->second; - } + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs); - friend class CPUSplitKernel; + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); + virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); +}; +class CPUDevice : public Device +{ public: TaskPool task_pool; KernelGlobals kernel_globals; @@ -149,77 +168,89 @@ public: bool use_split_kernel; DeviceRequestedFeatures requested_features; - + + KernelFunctions<void(*)(KernelGlobals *, float *, unsigned int *, int, int, int, int, int)> path_trace_kernel; + KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel; + KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel; + KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel; + + KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel; + KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; + + KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; + KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; + + KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel; + KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; + + KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*, + ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int, + ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel; + unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels; + +#define KERNEL_FUNCTIONS(name) \ + KERNEL_NAME_EVAL(cpu, name), \ + KERNEL_NAME_EVAL(cpu_sse2, name), \ + KERNEL_NAME_EVAL(cpu_sse3, name), \ + KERNEL_NAME_EVAL(cpu_sse41, name), \ + KERNEL_NAME_EVAL(cpu_avx, name), \ + KERNEL_NAME_EVAL(cpu_avx2, name) + CPUDevice(DeviceInfo& info, Stats &stats, bool background) - : Device(info, stats, background) + : Device(info, stats, background), +#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name)) + REGISTER_KERNEL(path_trace), + REGISTER_KERNEL(convert_to_half_float), + REGISTER_KERNEL(convert_to_byte), + REGISTER_KERNEL(shader), + REGISTER_KERNEL(filter_divide_shadow), + REGISTER_KERNEL(filter_get_feature), + REGISTER_KERNEL(filter_combine_halves), + REGISTER_KERNEL(filter_nlm_calc_difference), + REGISTER_KERNEL(filter_nlm_blur), + REGISTER_KERNEL(filter_nlm_calc_weight), + REGISTER_KERNEL(filter_nlm_update_output), + REGISTER_KERNEL(filter_nlm_normalize), + REGISTER_KERNEL(filter_construct_transform), + REGISTER_KERNEL(filter_nlm_construct_gramian), + REGISTER_KERNEL(filter_finalize), + REGISTER_KERNEL(data_init) +#undef REGISTER_KERNEL { #ifdef WITH_OSL kernel_globals.osl = &osl_globals; #endif - - /* do now to avoid thread issues */ - system_cpu_support_sse2(); - system_cpu_support_sse3(); - system_cpu_support_sse41(); - system_cpu_support_avx(); - system_cpu_support_avx2(); - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - VLOG(1) << "Will be using AVX2 kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - VLOG(1) << "Will be using AVX kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - VLOG(1) << "Will be using SSE4.1 kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - VLOG(1) << "Will be using SSE3kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - VLOG(1) << "Will be using SSE2 kernels."; - } - else -#endif - { - VLOG(1) << "Will be using regular kernels."; - } - use_split_kernel = DebugFlags().cpu.split_kernel; if(use_split_kernel) { VLOG(1) << "Will be using split kernel."; } - kernel_cpu_register_functions(register_kernel_function); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - kernel_cpu_sse2_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - kernel_cpu_sse3_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - kernel_cpu_sse41_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - kernel_cpu_avx_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - kernel_cpu_avx2_register_functions(register_kernel_function); -#endif +#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name)) + REGISTER_SPLIT_KERNEL(path_init); + REGISTER_SPLIT_KERNEL(scene_intersect); + REGISTER_SPLIT_KERNEL(lamp_emission); + REGISTER_SPLIT_KERNEL(do_volume); + REGISTER_SPLIT_KERNEL(queue_enqueue); + REGISTER_SPLIT_KERNEL(indirect_background); + REGISTER_SPLIT_KERNEL(shader_setup); + REGISTER_SPLIT_KERNEL(shader_sort); + REGISTER_SPLIT_KERNEL(shader_eval); + REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao); + REGISTER_SPLIT_KERNEL(subsurface_scatter); + REGISTER_SPLIT_KERNEL(direct_lighting); + REGISTER_SPLIT_KERNEL(shadow_blocked_ao); + REGISTER_SPLIT_KERNEL(shadow_blocked_dl); + REGISTER_SPLIT_KERNEL(next_iteration_setup); + REGISTER_SPLIT_KERNEL(indirect_subsurface); + REGISTER_SPLIT_KERNEL(buffer_update); +#undef REGISTER_SPLIT_KERNEL +#undef KERNEL_FUNCTIONS } ~CPUDevice() @@ -273,13 +304,17 @@ public: if(!mem.data_pointer) { free((void*)mem.device_pointer); } - mem.device_pointer = 0; stats.mem_free(mem.device_size); mem.device_size = 0; } } + virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/) + { + return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); + } + void const_copy_to(const char *name, void *host, size_t size) { kernel_const_copy(&kernel_globals, name, host, size); @@ -326,13 +361,8 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) { - if(!use_split_kernel) { - thread_path_trace(*task); - } - else { - thread_path_trace_split(*task); - } + if(task->type == DeviceTask::RENDER) { + thread_render(*task); } else if(task->type == DeviceTask::FILM_CONVERT) thread_film_convert(*task); @@ -349,116 +379,319 @@ public: } }; - void thread_path_trace(DeviceTask& task) + bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) { - if(task_pool.canceled()) { - if(task.need_finish_queue == false) - return; + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY); + + TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer; + for(int i = 0; i < 9; i++) { + tiles->buffers[i] = buffers[i]; } - KernelGlobals kg = thread_kernel_globals_init(); - RenderTile tile; + return true; + } - void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int); + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, + DenoisingTask *task) + { + int4 rect = task->rect; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int w = align_up(rect.z-rect.x, 4); + int h = rect.w-rect.y; + + float *blurDifference = (float*) task->nlm_state.temporary_1_ptr; + float *difference = (float*) task->nlm_state.temporary_2_ptr; + float *weightAccum = (float*) task->nlm_state.temporary_3_ptr; + + memset(weightAccum, 0, sizeof(float)*w*h); + memset((float*) out_ptr, 0, sizeof(float)*w*h); + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, dy, + (float*) guide_ptr, + (float*) variance_ptr, + difference, + local_rect, + w, 0, + a, k_2); + + filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); + filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); + + filter_nlm_update_output_kernel()(dx, dy, + blurDifference, + (float*) image_ptr, + (float*) out_ptr, + weightAccum, + local_rect, + w, f); + } + + int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y}; + filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - path_trace_kernel = kernel_cpu_avx2_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - path_trace_kernel = kernel_cpu_avx_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - path_trace_kernel = kernel_cpu_sse41_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - path_trace_kernel = kernel_cpu_sse3_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - path_trace_kernel = kernel_cpu_sse2_path_trace; - } - else -#endif - { - path_trace_kernel = kernel_cpu_path_trace; + return true; + } + + bool denoising_construct_transform(DenoisingTask *task) + { + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, + x + task->filter_area.x, + y + task->filter_area.y, + y*task->filter_area.z + x, + (float*) task->storage.transform.device_pointer, + (int*) task->storage.rank.device_pointer, + &task->rect.x, + task->buffer.pass_stride, + task->radius, + task->pca_threshold); + } } + return true; + } - while(task.acquire_tile(this, tile)) { - float *render_buffer = (float*)tile.buffer; - uint *rng_state = (uint*)tile.rng_state; - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - for(int sample = start_sample; sample < end_sample; sample++) { - if(task.get_cancel() || task_pool.canceled()) { - if(task.need_finish_queue == false) - break; - } + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr guide_ptr, + device_ptr guide_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + float *difference = (float*) task->reconstruction_state.temporary_1_ptr; + float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr; + + int r = task->radius; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, dy, + (float*) guide_ptr, + (float*) guide_variance_ptr, + difference, + local_rect, + task->buffer.w, + task->buffer.pass_stride, + 1.0f, + task->nlm_k_2); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4); + filter_nlm_construct_gramian_kernel()(dx, dy, + blurDifference, + (float*) task->buffer.mem.device_pointer, + (float*) color_ptr, + (float*) color_variance_ptr, + (float*) task->storage.transform.device_pointer, + (int*) task->storage.rank.device_pointer, + (float*) task->storage.XtWX.device_pointer, + (float3*) task->storage.XtWY.device_pointer, + local_rect, + &task->reconstruction_state.filter_rect.x, + task->buffer.w, + task->buffer.h, + 4, + task->buffer.pass_stride); + } + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_finalize_kernel()(x, + y, + y*task->filter_area.z + x, + task->buffer.w, + task->buffer.h, + (float*) output_ptr, + (int*) task->storage.rank.device_pointer, + (float*) task->storage.XtWX.device_pointer, + (float3*) task->storage.XtWY.device_pointer, + &task->reconstruction_state.buffer_params.x, + task->render_buffer.samples); + } + } + return true; + } - for(int y = tile.y; y < tile.y + tile.h; y++) { - for(int x = tile.x; x < tile.x + tile.w; x++) { - path_trace_kernel(&kg, render_buffer, rng_state, - sample, x, y, tile.offset, tile.stride); - } - } + bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, + device_ptr mean_ptr, device_ptr variance_ptr, + int r, int4 rect, DenoisingTask *task) + { + (void) task; + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + filter_combine_halves_kernel()(x, y, + (float*) mean_ptr, + (float*) variance_ptr, + (float*) a_ptr, + (float*) b_ptr, + &rect.x, + r); + } + } + return true; + } - tile.sample = sample + 1; + bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, + device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_divide_shadow_kernel()(task->render_buffer.samples, + task->tiles, + x, y, + (float*) a_ptr, + (float*) b_ptr, + (float*) sample_variance_ptr, + (float*) sv_variance_ptr, + (float*) buffer_variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + use_split_kernel); + } + } + return true; + } - task.update_progress(&tile, tile.w*tile.h); + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_get_feature_kernel()(task->render_buffer.samples, + task->tiles, + mean_offset, + variance_offset, + x, y, + (float*) mean_ptr, + (float*) variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + use_split_kernel); } + } + return true; + } - task.release_tile(tile); + void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) + { + float *render_buffer = (float*)tile.buffer; + uint *rng_state = (uint*)tile.rng_state; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - if(task_pool.canceled()) { + for(int sample = start_sample; sample < end_sample; sample++) { + if(task.get_cancel() || task_pool.canceled()) { if(task.need_finish_queue == false) break; } + + for(int y = tile.y; y < tile.y + tile.h; y++) { + for(int x = tile.x; x < tile.x + tile.w; x++) { + path_trace_kernel()(kg, render_buffer, rng_state, + sample, x, y, tile.offset, tile.stride); + } + } + + tile.sample = sample + 1; + + task.update_progress(&tile, tile.w*tile.h); } + } + + void denoise(DeviceTask &task, RenderTile &tile) + { + tile.sample = tile.start_sample + tile.num_samples; + + DenoisingTask denoising(this); - thread_kernel_globals_free(&kg); + denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising); + + denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); + denoising.render_buffer.samples = tile.sample; + + RenderTile rtiles[9]; + rtiles[4] = tile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); + + task.update_progress(&tile, tile.w*tile.h); } - void thread_path_trace_split(DeviceTask& task) + void thread_render(DeviceTask& task) { if(task_pool.canceled()) { if(task.need_finish_queue == false) return; } - RenderTile tile; - - CPUSplitKernel split_kernel(this); - /* allocate buffer for kernel globals */ - device_memory kgbuffer; - kgbuffer.resize(sizeof(KernelGlobals)); + device_only_memory<KernelGlobals> kgbuffer; + kgbuffer.resize(1); mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE); KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init()); - requested_features.max_closure = MAX_CLOSURE; - if(!split_kernel.load_kernels(requested_features)) { - thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); - mem_free(kgbuffer); + CPUSplitKernel *split_kernel = NULL; + if(use_split_kernel) { + split_kernel = new CPUSplitKernel(this); + requested_features.max_closure = MAX_CLOSURE; + if(!split_kernel->load_kernels(requested_features)) { + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + mem_free(kgbuffer); - return; + delete split_kernel; + return; + } } + RenderTile tile; while(task.acquire_tile(this, tile)) { - device_memory data; - split_kernel.path_trace(&task, tile, kgbuffer, data); + if(tile.task == RenderTile::PATH_TRACE) { + if(use_split_kernel) { + device_memory data; + split_kernel->path_trace(&task, tile, kgbuffer, data); + } + else { + path_trace(task, tile, kg); + } + } + else if(tile.task == RenderTile::DENOISE) { + denoise(task, tile); + } task.release_tile(tile); @@ -470,6 +703,7 @@ public: thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); mem_free(kgbuffer); + delete split_kernel; } void thread_film_convert(DeviceTask& task) @@ -477,86 +711,16 @@ public: float sample_scale = 1.0f/(task.sample + 1); if(task.rgba_half) { - void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float; - } - else -#endif - { - convert_to_half_float_kernel = kernel_cpu_convert_to_half_float; - } - for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) - convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); + convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); } else { - void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte; - } - else -#endif - { - convert_to_byte_kernel = kernel_cpu_convert_to_byte; - } - for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) - convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); + convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); } } @@ -568,53 +732,17 @@ public: #ifdef WITH_OSL OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif - void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int); - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - shader_kernel = kernel_cpu_avx2_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - shader_kernel = kernel_cpu_avx_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - shader_kernel = kernel_cpu_sse41_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - shader_kernel = kernel_cpu_sse3_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - shader_kernel = kernel_cpu_sse2_shader; - } - else -#endif - { - shader_kernel = kernel_cpu_shader; - } - for(int sample = 0; sample < task.num_samples; sample++) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) - shader_kernel(&kg, - (uint4*)task.shader_input, - (float4*)task.shader_output, - (float*)task.shader_output_luma, - task.shader_eval_type, - task.shader_filter, - x, - task.offset, - sample); + shader_kernel()(&kg, + (uint4*)task.shader_input, + (float4*)task.shader_output, + (float*)task.shader_output_luma, + task.shader_eval_type, + task.shader_filter, + x, + task.offset, + sample); if(task.get_cancel() || task_pool.canceled()) break; @@ -751,58 +879,6 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, device_memory& use_queues_flags, device_memory& work_pool_wgs) { - typedef void(*data_init_t)(KernelGlobals *kg, - ccl_constant KernelData *data, - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - ccl_global uint *rng_state, - int start_sample, - int end_sample, - int sx, int sy, int sw, int sh, int offset, int stride, - ccl_global int *Queue_index, - int queuesize, - ccl_global char *use_queues_flag, - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, - ccl_global float *buffer); - - data_init_t data_init; - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - data_init = kernel_cpu_avx2_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - data_init = kernel_cpu_avx_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - data_init = kernel_cpu_sse41_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - data_init = kernel_cpu_sse3_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - data_init = kernel_cpu_sse2_data_init; - } - else -#endif - { - data_init = kernel_cpu_data_init; - } - KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); @@ -810,26 +886,26 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, for(int x = 0; x < dim.global_size[0]; x++) { kg->global_id = make_int2(x, y); - data_init((KernelGlobals*)kernel_globals.device_pointer, - (KernelData*)data.device_pointer, - (void*)split_data.device_pointer, - num_global_elements, - (char*)ray_state.device_pointer, - (uint*)rtile.rng_state, - rtile.start_sample, - rtile.start_sample + rtile.num_samples, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - (int*)queue_index.device_pointer, - dim.global_size[0] * dim.global_size[1], - (char*)use_queues_flags.device_pointer, - (uint*)work_pool_wgs.device_pointer, - rtile.num_samples, - (float*)rtile.buffer); + device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer, + (KernelData*)data.device_pointer, + (void*)split_data.device_pointer, + num_global_elements, + (char*)ray_state.device_pointer, + (uint*)rtile.rng_state, + rtile.start_sample, + rtile.start_sample + rtile.num_samples, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + (int*)queue_index.device_pointer, + dim.global_size[0] * dim.global_size[1], + (char*)use_queues_flags.device_pointer, + (uint*)work_pool_wgs.device_pointer, + rtile.num_samples, + (float*)rtile.buffer); } } @@ -840,7 +916,7 @@ SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_nam { CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); - kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name); + kernel->func = device->split_kernels[kernel_name](); if(!kernel->func) { delete kernel; return NULL; @@ -864,8 +940,6 @@ uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device return split_data_buffer_size(kg, num_threads); } -unordered_map<string, void*> CPUDevice::kernel_functions; - Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background) { return new CPUDevice(info, stats, background); |