Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/device/device_cpu.cpp')
-rw-r--r--intern/cycles/device/device_cpu.cpp846
1 files changed, 460 insertions, 386 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 84cce605182..1ecce8bd565 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -25,6 +25,7 @@
#endif
#include "device/device.h"
+#include "device/device_denoising.h"
#include "device/device_intern.h"
#include "device/device_split_kernel.h"
@@ -34,6 +35,8 @@
#include "kernel/split/kernel_split_data.h"
#include "kernel/kernel_globals.h"
+#include "kernel/filter/filter.h"
+
#include "kernel/osl/osl_shader.h"
#include "kernel/osl/osl_globals.h"
@@ -53,91 +56,107 @@ CCL_NAMESPACE_BEGIN
class CPUDevice;
-class CPUSplitKernel : public DeviceSplitKernel {
- CPUDevice *device;
-public:
- explicit CPUSplitKernel(CPUDevice *device);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
- RenderTile& rtile,
- int num_global_elements,
- device_memory& kernel_globals,
- device_memory& kernel_data_,
- device_memory& split_data,
- device_memory& ray_state,
- device_memory& queue_index,
- device_memory& use_queues_flag,
- device_memory& work_pool_wgs);
+/* Has to be outside of the class to be shared across template instantiations. */
+static const char *logged_architecture = "";
- virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
- virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
-};
-
-class CPUDevice : public Device
-{
- static unordered_map<string, void*> kernel_functions;
-
- static void register_kernel_function(const char* name, void* func)
+template<typename F>
+class KernelFunctions {
+public:
+ KernelFunctions()
{
- kernel_functions[name] = func;
+ kernel = (F)NULL;
}
- static const char* get_arch_name()
+ KernelFunctions(F kernel_default,
+ F kernel_sse2,
+ F kernel_sse3,
+ F kernel_sse41,
+ F kernel_avx,
+ F kernel_avx2)
{
+ const char *architecture_name = "default";
+ kernel = kernel_default;
+
+ /* Silence potential warnings about unused variables
+ * when compiling without some architectures. */
+ (void)kernel_sse2;
+ (void)kernel_sse3;
+ (void)kernel_sse41;
+ (void)kernel_avx;
+ (void)kernel_avx2;
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
if(system_cpu_support_avx2()) {
- return "cpu_avx2";
+ architecture_name = "AVX2";
+ kernel = kernel_avx2;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
- return "cpu_avx";
+ architecture_name = "AVX";
+ kernel = kernel_avx;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
- return "cpu_sse41";
+ architecture_name = "SSE4.1";
+ kernel = kernel_sse41;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
- return "cpu_sse3";
+ architecture_name = "SSE3";
+ kernel = kernel_sse3;
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
- return "cpu_sse2";
+ architecture_name = "SSE2";
+ kernel = kernel_sse2;
}
- else
#endif
- {
- return "cpu";
+
+ if(strstr(architecture_name, logged_architecture) != 0) {
+ VLOG(1) << "Will be using " << architecture_name << " kernels.";
+ logged_architecture = architecture_name;
}
}
- template<typename F>
- static F get_kernel_function(string name)
- {
- name = string("kernel_") + get_arch_name() + "_" + name;
-
- unordered_map<string, void*>::iterator it = kernel_functions.find(name);
+ inline F operator()() const {
+ assert(kernel);
+ return kernel;
+ }
+protected:
+ F kernel;
+};
- if(it == kernel_functions.end()) {
- assert(!"kernel function not found");
- return NULL;
- }
+class CPUSplitKernel : public DeviceSplitKernel {
+ CPUDevice *device;
+public:
+ explicit CPUSplitKernel(CPUDevice *device);
- return (F)it->second;
- }
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& kernel_globals,
+ device_memory& kernel_data_,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs);
- friend class CPUSplitKernel;
+ virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+ virtual int2 split_kernel_local_size();
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+ virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+};
+class CPUDevice : public Device
+{
public:
TaskPool task_pool;
KernelGlobals kernel_globals;
@@ -149,77 +168,89 @@ public:
bool use_split_kernel;
DeviceRequestedFeatures requested_features;
-
+
+ KernelFunctions<void(*)(KernelGlobals *, float *, unsigned int *, int, int, int, int, int)> path_trace_kernel;
+ KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel;
+ KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
+ KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
+
+ KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
+ KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
+
+ KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
+ KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
+
+ KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
+ KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
+
+ KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
+ ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int,
+ ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel;
+ unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels;
+
+#define KERNEL_FUNCTIONS(name) \
+ KERNEL_NAME_EVAL(cpu, name), \
+ KERNEL_NAME_EVAL(cpu_sse2, name), \
+ KERNEL_NAME_EVAL(cpu_sse3, name), \
+ KERNEL_NAME_EVAL(cpu_sse41, name), \
+ KERNEL_NAME_EVAL(cpu_avx, name), \
+ KERNEL_NAME_EVAL(cpu_avx2, name)
+
CPUDevice(DeviceInfo& info, Stats &stats, bool background)
- : Device(info, stats, background)
+ : Device(info, stats, background),
+#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name))
+ REGISTER_KERNEL(path_trace),
+ REGISTER_KERNEL(convert_to_half_float),
+ REGISTER_KERNEL(convert_to_byte),
+ REGISTER_KERNEL(shader),
+ REGISTER_KERNEL(filter_divide_shadow),
+ REGISTER_KERNEL(filter_get_feature),
+ REGISTER_KERNEL(filter_combine_halves),
+ REGISTER_KERNEL(filter_nlm_calc_difference),
+ REGISTER_KERNEL(filter_nlm_blur),
+ REGISTER_KERNEL(filter_nlm_calc_weight),
+ REGISTER_KERNEL(filter_nlm_update_output),
+ REGISTER_KERNEL(filter_nlm_normalize),
+ REGISTER_KERNEL(filter_construct_transform),
+ REGISTER_KERNEL(filter_nlm_construct_gramian),
+ REGISTER_KERNEL(filter_finalize),
+ REGISTER_KERNEL(data_init)
+#undef REGISTER_KERNEL
{
#ifdef WITH_OSL
kernel_globals.osl = &osl_globals;
#endif
-
- /* do now to avoid thread issues */
- system_cpu_support_sse2();
- system_cpu_support_sse3();
- system_cpu_support_sse41();
- system_cpu_support_avx();
- system_cpu_support_avx2();
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- VLOG(1) << "Will be using AVX2 kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- VLOG(1) << "Will be using AVX kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- VLOG(1) << "Will be using SSE4.1 kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- VLOG(1) << "Will be using SSE3kernels.";
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- VLOG(1) << "Will be using SSE2 kernels.";
- }
- else
-#endif
- {
- VLOG(1) << "Will be using regular kernels.";
- }
-
use_split_kernel = DebugFlags().cpu.split_kernel;
if(use_split_kernel) {
VLOG(1) << "Will be using split kernel.";
}
- kernel_cpu_register_functions(register_kernel_function);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- kernel_cpu_sse2_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- kernel_cpu_sse3_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- kernel_cpu_sse41_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- kernel_cpu_avx_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- kernel_cpu_avx2_register_functions(register_kernel_function);
-#endif
+#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name))
+ REGISTER_SPLIT_KERNEL(path_init);
+ REGISTER_SPLIT_KERNEL(scene_intersect);
+ REGISTER_SPLIT_KERNEL(lamp_emission);
+ REGISTER_SPLIT_KERNEL(do_volume);
+ REGISTER_SPLIT_KERNEL(queue_enqueue);
+ REGISTER_SPLIT_KERNEL(indirect_background);
+ REGISTER_SPLIT_KERNEL(shader_setup);
+ REGISTER_SPLIT_KERNEL(shader_sort);
+ REGISTER_SPLIT_KERNEL(shader_eval);
+ REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
+ REGISTER_SPLIT_KERNEL(subsurface_scatter);
+ REGISTER_SPLIT_KERNEL(direct_lighting);
+ REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
+ REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
+ REGISTER_SPLIT_KERNEL(next_iteration_setup);
+ REGISTER_SPLIT_KERNEL(indirect_subsurface);
+ REGISTER_SPLIT_KERNEL(buffer_update);
+#undef REGISTER_SPLIT_KERNEL
+#undef KERNEL_FUNCTIONS
}
~CPUDevice()
@@ -273,13 +304,17 @@ public:
if(!mem.data_pointer) {
free((void*)mem.device_pointer);
}
-
mem.device_pointer = 0;
stats.mem_free(mem.device_size);
mem.device_size = 0;
}
}
+ virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+ {
+ return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+ }
+
void const_copy_to(const char *name, void *host, size_t size)
{
kernel_const_copy(&kernel_globals, name, host, size);
@@ -326,13 +361,8 @@ public:
void thread_run(DeviceTask *task)
{
- if(task->type == DeviceTask::PATH_TRACE) {
- if(!use_split_kernel) {
- thread_path_trace(*task);
- }
- else {
- thread_path_trace_split(*task);
- }
+ if(task->type == DeviceTask::RENDER) {
+ thread_render(*task);
}
else if(task->type == DeviceTask::FILM_CONVERT)
thread_film_convert(*task);
@@ -349,116 +379,319 @@ public:
}
};
- void thread_path_trace(DeviceTask& task)
+ bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
{
- if(task_pool.canceled()) {
- if(task.need_finish_queue == false)
- return;
+ mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+
+ TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+ for(int i = 0; i < 9; i++) {
+ tiles->buffers[i] = buffers[i];
}
- KernelGlobals kg = thread_kernel_globals_init();
- RenderTile tile;
+ return true;
+ }
- void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
+ bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+ DenoisingTask *task)
+ {
+ int4 rect = task->rect;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ int w = align_up(rect.z-rect.x, 4);
+ int h = rect.w-rect.y;
+
+ float *blurDifference = (float*) task->nlm_state.temporary_1_ptr;
+ float *difference = (float*) task->nlm_state.temporary_2_ptr;
+ float *weightAccum = (float*) task->nlm_state.temporary_3_ptr;
+
+ memset(weightAccum, 0, sizeof(float)*w*h);
+ memset((float*) out_ptr, 0, sizeof(float)*w*h);
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
+ filter_nlm_calc_difference_kernel()(dx, dy,
+ (float*) guide_ptr,
+ (float*) variance_ptr,
+ difference,
+ local_rect,
+ w, 0,
+ a, k_2);
+
+ filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
+ filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
+ filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
+
+ filter_nlm_update_output_kernel()(dx, dy,
+ blurDifference,
+ (float*) image_ptr,
+ (float*) out_ptr,
+ weightAccum,
+ local_rect,
+ w, f);
+ }
+
+ int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
+ filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- path_trace_kernel = kernel_cpu_avx2_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- path_trace_kernel = kernel_cpu_avx_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- path_trace_kernel = kernel_cpu_sse41_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- path_trace_kernel = kernel_cpu_sse3_path_trace;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- path_trace_kernel = kernel_cpu_sse2_path_trace;
- }
- else
-#endif
- {
- path_trace_kernel = kernel_cpu_path_trace;
+ return true;
+ }
+
+ bool denoising_construct_transform(DenoisingTask *task)
+ {
+ for(int y = 0; y < task->filter_area.w; y++) {
+ for(int x = 0; x < task->filter_area.z; x++) {
+ filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
+ x + task->filter_area.x,
+ y + task->filter_area.y,
+ y*task->filter_area.z + x,
+ (float*) task->storage.transform.device_pointer,
+ (int*) task->storage.rank.device_pointer,
+ &task->rect.x,
+ task->buffer.pass_stride,
+ task->radius,
+ task->pca_threshold);
+ }
}
+ return true;
+ }
- while(task.acquire_tile(this, tile)) {
- float *render_buffer = (float*)tile.buffer;
- uint *rng_state = (uint*)tile.rng_state;
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
-
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task.get_cancel() || task_pool.canceled()) {
- if(task.need_finish_queue == false)
- break;
- }
+ bool denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr guide_ptr,
+ device_ptr guide_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ mem_zero(task->storage.XtWX);
+ mem_zero(task->storage.XtWY);
+
+ float *difference = (float*) task->reconstruction_state.temporary_1_ptr;
+ float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr;
+
+ int r = task->radius;
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy),
+ task->reconstruction_state.source_w - max(0, dx),
+ task->reconstruction_state.source_h - max(0, dy)};
+ filter_nlm_calc_difference_kernel()(dx, dy,
+ (float*) guide_ptr,
+ (float*) guide_variance_ptr,
+ difference,
+ local_rect,
+ task->buffer.w,
+ task->buffer.pass_stride,
+ 1.0f,
+ task->nlm_k_2);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
+ filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
+ filter_nlm_construct_gramian_kernel()(dx, dy,
+ blurDifference,
+ (float*) task->buffer.mem.device_pointer,
+ (float*) color_ptr,
+ (float*) color_variance_ptr,
+ (float*) task->storage.transform.device_pointer,
+ (int*) task->storage.rank.device_pointer,
+ (float*) task->storage.XtWX.device_pointer,
+ (float3*) task->storage.XtWY.device_pointer,
+ local_rect,
+ &task->reconstruction_state.filter_rect.x,
+ task->buffer.w,
+ task->buffer.h,
+ 4,
+ task->buffer.pass_stride);
+ }
+ for(int y = 0; y < task->filter_area.w; y++) {
+ for(int x = 0; x < task->filter_area.z; x++) {
+ filter_finalize_kernel()(x,
+ y,
+ y*task->filter_area.z + x,
+ task->buffer.w,
+ task->buffer.h,
+ (float*) output_ptr,
+ (int*) task->storage.rank.device_pointer,
+ (float*) task->storage.XtWX.device_pointer,
+ (float3*) task->storage.XtWY.device_pointer,
+ &task->reconstruction_state.buffer_params.x,
+ task->render_buffer.samples);
+ }
+ }
+ return true;
+ }
- for(int y = tile.y; y < tile.y + tile.h; y++) {
- for(int x = tile.x; x < tile.x + tile.w; x++) {
- path_trace_kernel(&kg, render_buffer, rng_state,
- sample, x, y, tile.offset, tile.stride);
- }
- }
+ bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr mean_ptr, device_ptr variance_ptr,
+ int r, int4 rect, DenoisingTask *task)
+ {
+ (void) task;
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ filter_combine_halves_kernel()(x, y,
+ (float*) mean_ptr,
+ (float*) variance_ptr,
+ (float*) a_ptr,
+ (float*) b_ptr,
+ &rect.x,
+ r);
+ }
+ }
+ return true;
+ }
- tile.sample = sample + 1;
+ bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr, DenoisingTask *task)
+ {
+ for(int y = task->rect.y; y < task->rect.w; y++) {
+ for(int x = task->rect.x; x < task->rect.z; x++) {
+ filter_divide_shadow_kernel()(task->render_buffer.samples,
+ task->tiles,
+ x, y,
+ (float*) a_ptr,
+ (float*) b_ptr,
+ (float*) sample_variance_ptr,
+ (float*) sv_variance_ptr,
+ (float*) buffer_variance_ptr,
+ &task->rect.x,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ use_split_kernel);
+ }
+ }
+ return true;
+ }
- task.update_progress(&tile, tile.w*tile.h);
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task)
+ {
+ for(int y = task->rect.y; y < task->rect.w; y++) {
+ for(int x = task->rect.x; x < task->rect.z; x++) {
+ filter_get_feature_kernel()(task->render_buffer.samples,
+ task->tiles,
+ mean_offset,
+ variance_offset,
+ x, y,
+ (float*) mean_ptr,
+ (float*) variance_ptr,
+ &task->rect.x,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ use_split_kernel);
}
+ }
+ return true;
+ }
- task.release_tile(tile);
+ void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
+ {
+ float *render_buffer = (float*)tile.buffer;
+ uint *rng_state = (uint*)tile.rng_state;
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
- if(task_pool.canceled()) {
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if(task.get_cancel() || task_pool.canceled()) {
if(task.need_finish_queue == false)
break;
}
+
+ for(int y = tile.y; y < tile.y + tile.h; y++) {
+ for(int x = tile.x; x < tile.x + tile.w; x++) {
+ path_trace_kernel()(kg, render_buffer, rng_state,
+ sample, x, y, tile.offset, tile.stride);
+ }
+ }
+
+ tile.sample = sample + 1;
+
+ task.update_progress(&tile, tile.w*tile.h);
}
+ }
+
+ void denoise(DeviceTask &task, RenderTile &tile)
+ {
+ tile.sample = tile.start_sample + tile.num_samples;
+
+ DenoisingTask denoising(this);
- thread_kernel_globals_free(&kg);
+ denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
+ denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising);
+
+ denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
+ denoising.render_buffer.samples = tile.sample;
+
+ RenderTile rtiles[9];
+ rtiles[4] = tile;
+ task.map_neighbor_tiles(rtiles, this);
+ denoising.tiles_from_rendertiles(rtiles);
+
+ denoising.init_from_devicetask(task);
+
+ denoising.run_denoising();
+
+ task.unmap_neighbor_tiles(rtiles, this);
+
+ task.update_progress(&tile, tile.w*tile.h);
}
- void thread_path_trace_split(DeviceTask& task)
+ void thread_render(DeviceTask& task)
{
if(task_pool.canceled()) {
if(task.need_finish_queue == false)
return;
}
- RenderTile tile;
-
- CPUSplitKernel split_kernel(this);
-
/* allocate buffer for kernel globals */
- device_memory kgbuffer;
- kgbuffer.resize(sizeof(KernelGlobals));
+ device_only_memory<KernelGlobals> kgbuffer;
+ kgbuffer.resize(1);
mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
- requested_features.max_closure = MAX_CLOSURE;
- if(!split_kernel.load_kernels(requested_features)) {
- thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
- mem_free(kgbuffer);
+ CPUSplitKernel *split_kernel = NULL;
+ if(use_split_kernel) {
+ split_kernel = new CPUSplitKernel(this);
+ requested_features.max_closure = MAX_CLOSURE;
+ if(!split_kernel->load_kernels(requested_features)) {
+ thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+ mem_free(kgbuffer);
- return;
+ delete split_kernel;
+ return;
+ }
}
+ RenderTile tile;
while(task.acquire_tile(this, tile)) {
- device_memory data;
- split_kernel.path_trace(&task, tile, kgbuffer, data);
+ if(tile.task == RenderTile::PATH_TRACE) {
+ if(use_split_kernel) {
+ device_memory data;
+ split_kernel->path_trace(&task, tile, kgbuffer, data);
+ }
+ else {
+ path_trace(task, tile, kg);
+ }
+ }
+ else if(tile.task == RenderTile::DENOISE) {
+ denoise(task, tile);
+ }
task.release_tile(tile);
@@ -470,6 +703,7 @@ public:
thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
mem_free(kgbuffer);
+ delete split_kernel;
}
void thread_film_convert(DeviceTask& task)
@@ -477,86 +711,16 @@ public:
float sample_scale = 1.0f/(task.sample + 1);
if(task.rgba_half) {
- void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
- }
- else
-#endif
- {
- convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
- }
-
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
- convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
- sample_scale, x, y, task.offset, task.stride);
+ convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+ sample_scale, x, y, task.offset, task.stride);
}
else {
- void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
- }
- else
-#endif
- {
- convert_to_byte_kernel = kernel_cpu_convert_to_byte;
- }
-
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
- convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
- sample_scale, x, y, task.offset, task.stride);
+ convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+ sample_scale, x, y, task.offset, task.stride);
}
}
@@ -568,53 +732,17 @@ public:
#ifdef WITH_OSL
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
- void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- shader_kernel = kernel_cpu_avx2_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- shader_kernel = kernel_cpu_avx_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- shader_kernel = kernel_cpu_sse41_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- shader_kernel = kernel_cpu_sse3_shader;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- shader_kernel = kernel_cpu_sse2_shader;
- }
- else
-#endif
- {
- shader_kernel = kernel_cpu_shader;
- }
-
for(int sample = 0; sample < task.num_samples; sample++) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
- shader_kernel(&kg,
- (uint4*)task.shader_input,
- (float4*)task.shader_output,
- (float*)task.shader_output_luma,
- task.shader_eval_type,
- task.shader_filter,
- x,
- task.offset,
- sample);
+ shader_kernel()(&kg,
+ (uint4*)task.shader_input,
+ (float4*)task.shader_output,
+ (float*)task.shader_output_luma,
+ task.shader_eval_type,
+ task.shader_filter,
+ x,
+ task.offset,
+ sample);
if(task.get_cancel() || task_pool.canceled())
break;
@@ -751,58 +879,6 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
device_memory& use_queues_flags,
device_memory& work_pool_wgs)
{
- typedef void(*data_init_t)(KernelGlobals *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- ccl_global uint *rng_state,
- int start_sample,
- int end_sample,
- int sx, int sy, int sw, int sh, int offset, int stride,
- ccl_global int *Queue_index,
- int queuesize,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
- ccl_global float *buffer);
-
- data_init_t data_init;
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(system_cpu_support_avx2()) {
- data_init = kernel_cpu_avx2_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(system_cpu_support_avx()) {
- data_init = kernel_cpu_avx_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(system_cpu_support_sse41()) {
- data_init = kernel_cpu_sse41_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(system_cpu_support_sse3()) {
- data_init = kernel_cpu_sse3_data_init;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(system_cpu_support_sse2()) {
- data_init = kernel_cpu_sse2_data_init;
- }
- else
-#endif
- {
- data_init = kernel_cpu_data_init;
- }
-
KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
@@ -810,26 +886,26 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
for(int x = 0; x < dim.global_size[0]; x++) {
kg->global_id = make_int2(x, y);
- data_init((KernelGlobals*)kernel_globals.device_pointer,
- (KernelData*)data.device_pointer,
- (void*)split_data.device_pointer,
- num_global_elements,
- (char*)ray_state.device_pointer,
- (uint*)rtile.rng_state,
- rtile.start_sample,
- rtile.start_sample + rtile.num_samples,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- (int*)queue_index.device_pointer,
- dim.global_size[0] * dim.global_size[1],
- (char*)use_queues_flags.device_pointer,
- (uint*)work_pool_wgs.device_pointer,
- rtile.num_samples,
- (float*)rtile.buffer);
+ device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer,
+ (KernelData*)data.device_pointer,
+ (void*)split_data.device_pointer,
+ num_global_elements,
+ (char*)ray_state.device_pointer,
+ (uint*)rtile.rng_state,
+ rtile.start_sample,
+ rtile.start_sample + rtile.num_samples,
+ rtile.x,
+ rtile.y,
+ rtile.w,
+ rtile.h,
+ rtile.offset,
+ rtile.stride,
+ (int*)queue_index.device_pointer,
+ dim.global_size[0] * dim.global_size[1],
+ (char*)use_queues_flags.device_pointer,
+ (uint*)work_pool_wgs.device_pointer,
+ rtile.num_samples,
+ (float*)rtile.buffer);
}
}
@@ -840,7 +916,7 @@ SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_nam
{
CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
- kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);
+ kernel->func = device->split_kernels[kernel_name]();
if(!kernel->func) {
delete kernel;
return NULL;
@@ -864,8 +940,6 @@ uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device
return split_data_buffer_size(kg, num_threads);
}
-unordered_map<string, void*> CPUDevice::kernel_functions;
-
Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
{
return new CPUDevice(info, stats, background);