diff options
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 11 | ||||
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 23 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.cpp | 10 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_base.cpp | 48 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_cpu.h | 139 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cpu/filter_cpu.h | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h | 3 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/filter.cu | 23 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/opencl/filter.cl | 23 | ||||
-rw-r--r-- | intern/cycles/util/util_math.h | 24 | ||||
-rw-r--r-- | intern/cycles/util/util_math_fast.h | 28 | ||||
-rw-r--r-- | intern/cycles/util/util_math_float4.h | 12 | ||||
-rw-r--r-- | intern/cycles/util/util_math_int4.h | 40 | ||||
-rw-r--r-- | intern/cycles/util/util_types_int4.h | 2 | ||||
-rw-r--r-- | intern/cycles/util/util_types_int4_impl.h | 10 | ||||
-rw-r--r-- | source/blender/bmesh/operators/bmo_subdivide.c | 39 | ||||
-rwxr-xr-x | tests/python/modules/render_report.py | 2 |
17 files changed, 297 insertions, 141 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 7c72ab1a009..7eb73dea3ef 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -184,11 +184,11 @@ public: KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; - KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; - KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; - KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; + KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel; @@ -499,6 +499,7 @@ public: filter_nlm_update_output_kernel()(dx, dy, blurDifference, (float*) image_ptr, + difference, (float*) out_ptr, weightAccum, local_rect, diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 2af8a9f48c8..22b1bc493c8 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1397,18 +1397,14 @@ public: int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; - int shift_stride = stride*h; + int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); - int mem_size = sizeof(float)*shift_stride*num_shifts; - - device_only_memory<uchar> temporary_mem(this, "Denoising temporary_mem"); - temporary_mem.alloc_to_device(2*mem_size); if(have_error()) return false; - CUdeviceptr difference = cuda_device_ptr(temporary_mem.device_pointer); - CUdeviceptr blurDifference = difference + mem_size; + CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; { CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; @@ -1426,9 +1422,9 @@ public: task->reconstruction_state.source_w * task->reconstruction_state.source_h, num_shifts); - void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f}; + void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; void *construct_gramian_args[] = {&blurDifference, &task->buffer.mem.device_pointer, &task->storage.transform.device_pointer, @@ -1437,9 +1433,8 @@ public: &task->storage.XtWY.device_pointer, &task->reconstruction_state.filter_window, &w, &h, &stride, - &shift_stride, &r, - &f, - &task->buffer.pass_stride}; + &pass_stride, &r, + &f}; CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); @@ -1448,8 +1443,6 @@ public: CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); } - temporary_mem.free(); - { CUfunction cuFinalize; cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 23c18fa15b2..78c65a3d22d 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -99,14 +99,18 @@ void DenoisingTask::setup_denoising_buffer() buffer.mem.alloc_to_device(mem_size, false); /* CPUs process shifts sequentially while GPUs process them in parallel. */ - int num_shifts = 1; + int num_layers; if(buffer.gpu_temporary_mem) { /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */ int max_radius = max(radius, 6); - num_shifts = (2*max_radius + 1) * (2*max_radius + 1); + int num_shifts = (2*max_radius + 1) * (2*max_radius + 1); + num_layers = 2*num_shifts + 1; + } + else { + num_layers = 3; } /* Allocate two layers per shift as well as one for the weight accumulation. */ - buffer.temporary_mem.alloc_to_device((2*num_shifts + 1) * buffer.pass_stride); + buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride); } void DenoisingTask::prefilter_shadowing() diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index cc887134bb0..ffd3b7d7625 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -865,38 +865,38 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; - int shift_stride = stride*h; - int num_shifts = (2*task->radius + 1)*(2*task->radius + 1); - int mem_size = sizeof(float)*shift_stride*num_shifts; + int r = task->radius; + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2*r+1)*(2*r+1); - cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); - cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); + device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); + device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); + cl_mem difference_mem = CL_MEM_PTR(*difference); + cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); kernel_set_args(ckNLMCalcDifference, 0, color_mem, color_variance_mem, - difference, + difference_mem, w, h, stride, - shift_stride, - task->radius, - task->buffer.pass_stride, + pass_stride, + r, + pass_stride, 1.0f, task->nlm_k_2); kernel_set_args(ckNLMBlur, 0, - difference, - blurDifference, + difference_mem, + blurDifference_mem, w, h, stride, - shift_stride, - task->radius, 4); + pass_stride, + r, 4); kernel_set_args(ckNLMCalcWeight, 0, - blurDifference, - difference, + blurDifference_mem, + difference_mem, w, h, stride, - shift_stride, - task->radius, 4); + pass_stride, + r, 4); kernel_set_args(ckNLMConstructGramian, 0, - blurDifference, + blurDifference_mem, buffer_mem, transform_mem, rank_mem, @@ -904,9 +904,8 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, XtWY_mem, task->reconstruction_state.filter_window, w, h, stride, - shift_stride, - task->radius, 4, - task->buffer.pass_stride); + pass_stride, + r, 4); enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); @@ -914,9 +913,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); - opencl_assert(clReleaseMemObject(difference)); - opencl_assert(clReleaseMemObject(blurDifference)); - kernel_set_args(ckFinalize, 0, output_mem, rank_mem, diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index e2da0fd872b..af73c0dadf2 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -16,6 +16,9 @@ CCL_NAMESPACE_BEGIN +#define load4_a(buf, ofs) (*((float4*) ((buf) + (ofs)))) +#define load4_u(buf, ofs) load_float4((buf)+(ofs)) + ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, const float *ccl_restrict weight_image, const float *ccl_restrict variance_image, @@ -26,20 +29,28 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float a, float k_2) { + /* Strides need to be aligned to 16 bytes. */ + kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0); + + int aligned_lowx = rect.x & (~3); + const int numChannels = (channel_offset > 0)? 3 : 1; + const float4 channel_fac = make_float4(1.0f / numChannels); + for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - float diff = 0.0f; - int numChannels = channel_offset? 3 : 1; - for(int c = 0; c < numChannels; c++) { - float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)]; - float pvar = variance_image[c*channel_offset + y*stride + x]; - float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)]; - diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); - } - if(numChannels > 1) { - diff *= 1.0f/numChannels; + int idx_p = y*stride + aligned_lowx; + int idx_q = (y+dy)*stride + aligned_lowx + dx; + for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { + float4 diff = make_float4(0.0f); + for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) { + /* idx_p is guaranteed to be aligned, but idx_q isn't. */ + float4 color_p = load4_a(weight_image, idx_p + chan_ofs); + float4 color_q = load4_u(weight_image, idx_q + chan_ofs); + float4 cdiff = color_p - color_q; + float4 var_p = load4_a(variance_image, idx_p + chan_ofs); + float4 var_q = load4_u(variance_image, idx_q + chan_ofs); + diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q)); } - difference_image[y*stride + x] = diff; + load4_a(difference_image, idx_p) = diff*channel_fac; } } } @@ -50,52 +61,77 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen int stride, int f) { - int aligned_lowx = rect.x / 4; - int aligned_highx = (rect.z + 3) / 4; + int aligned_lowx = round_down(rect.x, 4); for(int y = rect.y; y < rect.w; y++) { const int low = max(rect.y, y-f); const int high = min(rect.w, y+f+1); - for(int x = rect.x; x < rect.z; x++) { - out_image[y*stride + x] = 0.0f; + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) = make_float4(0.0f); } for(int y1 = low; y1 < high; y1++) { - float4* out_image4 = (float4*)(out_image + y*stride); - float4* difference_image4 = (float4*)(difference_image + y1*stride); - for(int x = aligned_lowx; x < aligned_highx; x++) { - out_image4[x] += difference_image4[x]; + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) += load4_a(difference_image, y1*stride + x); } } - for(int x = rect.x; x < rect.z; x++) { - out_image[y*stride + x] *= 1.0f/(high - low); + float fac = 1.0f/(high - low); + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) *= fac; } } } -ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image, - float *out_image, - int4 rect, - int stride, - int f) +ccl_device_inline void nlm_blur_horizontal(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int stride, + int f) { + int aligned_lowx = round_down(rect.x, 4); for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - out_image[y*stride + x] = 0.0f; + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) = make_float4(0.0f); } } + for(int dx = -f; dx <= f; dx++) { - int pos_dx = max(0, dx); - int neg_dx = min(0, dx); + aligned_lowx = round_down(rect.x - min(0, dx), 4); + int highx = rect.z - max(0, dx); + int4 lowx4 = make_int4(rect.x - min(0, dx)); + int4 highx4 = make_int4(rect.z - max(0, dx)); for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) { - out_image[y*stride + x] += difference_image[y*stride + x+dx]; + for(int x = aligned_lowx; x < highx; x += 4) { + int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); + int4 active = (x4 >= lowx4) & (x4 < highx4); + + float4 diff = load4_u(difference_image, y*stride + x + dx); + load4_a(out_image, y*stride + x) += mask(active, diff); } } } + + aligned_lowx = round_down(rect.x, 4); for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - const int low = max(rect.x, x-f); - const int high = min(rect.z, x+f+1); - out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f)); + for(int x = aligned_lowx; x < rect.z; x += 4) { + float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); + float4 low = max(make_float4(rect.x), x4 - make_float4(f)); + float4 high = min(make_float4(rect.z), x4 + make_float4(f+1)); + load4_a(out_image, y*stride + x) *= rcp(high - low); + } + } +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int stride, + int f) +{ + nlm_blur_horizontal(difference_image, out_image, rect, stride, f); + + int aligned_lowx = round_down(rect.x, 4); + for(int y = rect.y; y < rect.w; y++) { + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) = fast_expf4(-max(load4_a(out_image, y*stride + x), make_float4(0.0f))); } } } @@ -103,23 +139,29 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, const float *ccl_restrict difference_image, const float *ccl_restrict image, + float *temp_image, float *out_image, float *accum_image, int4 rect, int stride, int f) { + nlm_blur_horizontal(difference_image, temp_image, rect, stride, f); + + int aligned_lowx = round_down(rect.x, 4); for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - const int low = max(rect.x, x-f); - const int high = min(rect.z, x+f+1); - float sum = 0.0f; - for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*stride + x1]; - } - float weight = sum * (1.0f/(high - low)); - accum_image[y*stride + x] += weight; - out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)]; + for(int x = aligned_lowx; x < rect.z; x += 4) { + int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); + int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z)); + + int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); + + float4 weight = load4_a(temp_image, idx_p); + load4_a(accum_image, idx_p) += mask(active, weight); + + float4 val = load4_u(image, idx_q); + + load4_a(out_image, idx_p) += mask(active, weight*val); } } } @@ -177,4 +219,7 @@ ccl_device_inline void kernel_filter_nlm_normalize(float *out_image, } } +#undef load4_a +#undef load4_u + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h index b62aa9663ec..e036b53b810 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -95,6 +95,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, int dy, float *difference_image, float *image, + float *temp_image, float *out_image, float *accum_image, int* rect, diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h index 26777fdabb2..4c758711481 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -191,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, int dy, float *difference_image, float *image, + float *temp_image, float *out_image, float *accum_image, int *rect, @@ -200,7 +201,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); #else - kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f); + kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f); #endif } diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index 0561c40e6b1..b856cbde45c 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -140,7 +140,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int channel_offset, float a, @@ -148,7 +148,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, weight_image, variance_image, @@ -165,13 +165,13 @@ kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_blur(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -186,13 +186,13 @@ kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_weight(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -209,13 +209,13 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, difference_image + ofs, image, @@ -252,14 +252,13 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im int w, int h, int stride, - int shift_stride, + int pass_stride, int r, - int f, - int pass_stride) + int f) { int4 co, rect; int ofs; - if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { + if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w, difference_image + ofs, diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index 3c75754fb39..a550f97f4eb 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -132,7 +132,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int channel_offset, float a, @@ -140,7 +140,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, weight_image, variance_image, @@ -155,13 +155,13 @@ __kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict di int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_blur(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -174,13 +174,13 @@ __kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_rest int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_weight(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -195,13 +195,13 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, difference_image + ofs, image, @@ -234,14 +234,13 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc int w, int h, int stride, - int shift_stride, + int pass_stride, int r, - int f, - int pass_stride) + int f) { int4 co, rect; int ofs; - if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { + if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w, difference_image + ofs, diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 52aeb8d8599..eafae5f31c0 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -220,6 +220,30 @@ ccl_device_inline float __uint_as_float(uint i) u.i = i; return u.f; } + +ccl_device_inline int4 __float4_as_int4(float4 f) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_castps_si128(f.m128)); + #else + return make_int4(__float_as_int(f.x), + __float_as_int(f.y), + __float_as_int(f.z), + __float_as_int(f.w)); +#endif +} + +ccl_device_inline float4 __int4_as_float4(int4 i) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_castsi128_ps(i.m128)); +#else + return make_float4(__int_as_float(i.x), + __int_as_float(i.y), + __int_as_float(i.z), + __int_as_float(i.w)); +#endif +} #endif /* __KERNEL_OPENCL__ */ /* Versions of functions which are safe for fast math. */ diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h index d3960deb3b4..323d40058e5 100644 --- a/intern/cycles/util/util_math_fast.h +++ b/intern/cycles/util/util_math_fast.h @@ -58,6 +58,11 @@ ccl_device_inline float madd(const float a, const float b, const float c) return a * b + c; } +ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c) +{ + return a * b + c; +} + /* * FAST & APPROXIMATE MATH * @@ -438,6 +443,29 @@ ccl_device_inline float fast_expf(float x) return fast_exp2f(x / M_LN2_F); } +#ifndef __KERNEL_GPU__ +ccl_device float4 fast_exp2f4(float4 x) +{ + const float4 one = make_float4(1.0f); + const float4 limit = make_float4(126.0f); + x = clamp(x, -limit, limit); + int4 m = make_int4(x); + x = one - (one - (x - make_float4(m))); + float4 r = make_float4(1.33336498402e-3f); + r = madd4(x, r, make_float4(9.810352697968e-3f)); + r = madd4(x, r, make_float4(5.551834031939e-2f)); + r = madd4(x, r, make_float4(0.2401793301105f)); + r = madd4(x, r, make_float4(0.693144857883f)); + r = madd4(x, r, make_float4(1.0f)); + return __int4_as_float4(__float4_as_int4(r) + (m << 23)); +} + +ccl_device_inline float4 fast_expf4(float4 x) +{ + return fast_exp2f4(x / M_LN2_F); +} +#endif + ccl_device_inline float fast_exp10(float x) { /* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]: diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h index aa7e56fefe9..105547098b5 100644 --- a/intern/cycles/util/util_math_float4.h +++ b/intern/cycles/util/util_math_float4.h @@ -38,6 +38,7 @@ ccl_device_inline float4 operator+(const float4& a, const float4& b); ccl_device_inline float4 operator-(const float4& a, const float4& b); ccl_device_inline float4 operator+=(float4& a, const float4& b); ccl_device_inline float4 operator*=(float4& a, const float4& b); +ccl_device_inline float4 operator*=(float4& a, float f); ccl_device_inline float4 operator/=(float4& a, float f); ccl_device_inline int4 operator<(const float4& a, const float4& b); @@ -58,6 +59,7 @@ ccl_device_inline float4 normalize(const float4& a); ccl_device_inline float4 safe_normalize(const float4& a); ccl_device_inline float4 min(const float4& a, const float4& b); ccl_device_inline float4 max(const float4& a, const float4& b); +ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx); ccl_device_inline float4 fabs(const float4& a); #endif /* !__KERNEL_OPENCL__*/ @@ -168,6 +170,11 @@ ccl_device_inline float4 operator*=(float4& a, const float4& b) return a = a * b; } +ccl_device_inline float4 operator*=(float4& a, float f) +{ + return a = a * f; +} + ccl_device_inline float4 operator/=(float4& a, float f) { return a = a / f; @@ -333,6 +340,11 @@ ccl_device_inline float4 max(const float4& a, const float4& b) #endif } +ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx) +{ + return min(max(a, mn), mx); +} + ccl_device_inline float4 fabs(const float4& a) { #ifdef __KERNEL_SSE__ diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h index 79a8c0841e7..cde366b8c27 100644 --- a/intern/cycles/util/util_math_int4.h +++ b/intern/cycles/util/util_math_int4.h @@ -31,6 +31,10 @@ CCL_NAMESPACE_BEGIN ccl_device_inline int4 operator+(const int4& a, const int4& b); ccl_device_inline int4 operator+=(int4& a, const int4& b); ccl_device_inline int4 operator>>(const int4& a, int i); +ccl_device_inline int4 operator<<(const int4& a, int i); +ccl_device_inline int4 operator<(const int4& a, const int4& b); +ccl_device_inline int4 operator>=(const int4& a, const int4& b); +ccl_device_inline int4 operator&(const int4& a, const int4& b); ccl_device_inline int4 min(int4 a, int4 b); ccl_device_inline int4 max(int4 a, int4 b); ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx); @@ -65,6 +69,42 @@ ccl_device_inline int4 operator>>(const int4& a, int i) #endif } +ccl_device_inline int4 operator<<(const int4& a, int i) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_slli_epi32(a.m128, i)); +#else + return make_int4(a.x << i, a.y << i, a.z << i, a.w << i); +#endif +} + +ccl_device_inline int4 operator<(const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_cmplt_epi32(a.m128, b.m128)); +#else + return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); +#endif +} + +ccl_device_inline int4 operator>=(const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_xor_si128(_mm_set1_epi32(0xffffffff), _mm_cmplt_epi32(a.m128, b.m128))); +#else + return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); +#endif +} + +ccl_device_inline int4 operator&(const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_and_si128(a.m128, b.m128)); +#else + return make_int4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w); +#endif +} + ccl_device_inline int4 min(int4 a, int4 b) { #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) diff --git a/intern/cycles/util/util_types_int4.h b/intern/cycles/util/util_types_int4.h index cdd0ecbdae5..4ef162f1ac6 100644 --- a/intern/cycles/util/util_types_int4.h +++ b/intern/cycles/util/util_types_int4.h @@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN #ifndef __KERNEL_GPU__ struct float3; +struct float4; struct ccl_try_align(16) int4 { #ifdef __KERNEL_SSE__ @@ -53,6 +54,7 @@ struct ccl_try_align(16) int4 { ccl_device_inline int4 make_int4(int i); ccl_device_inline int4 make_int4(int x, int y, int z, int w); ccl_device_inline int4 make_int4(const float3& f); +ccl_device_inline int4 make_int4(const float4& f); ccl_device_inline void print_int4(const char *label, const int4& a); #endif /* __KERNEL_GPU__ */ diff --git a/intern/cycles/util/util_types_int4_impl.h b/intern/cycles/util/util_types_int4_impl.h index 07cdc88f2dc..a62561709de 100644 --- a/intern/cycles/util/util_types_int4_impl.h +++ b/intern/cycles/util/util_types_int4_impl.h @@ -104,6 +104,16 @@ ccl_device_inline int4 make_int4(const float3& f) return a; } +ccl_device_inline int4 make_int4(const float4& f) +{ +#ifdef __KERNEL_SSE__ + int4 a(_mm_cvtps_epi32(f.m128)); +#else + int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w}; +#endif + return a; +} + ccl_device_inline void print_int4(const char *label, const int4& a) { printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w); diff --git a/source/blender/bmesh/operators/bmo_subdivide.c b/source/blender/bmesh/operators/bmo_subdivide.c index c8ba2134a73..66730f8c837 100644 --- a/source/blender/bmesh/operators/bmo_subdivide.c +++ b/source/blender/bmesh/operators/bmo_subdivide.c @@ -182,15 +182,12 @@ static void interp_slerp_co_no_v3( /* calculate sphere 'center' */ { /* use point on plane to */ - float plane_a[4], plane_b[4], plane_c[4]; float no_mid[3], no_ortho[3]; /* pass this as an arg instead */ #if 0 float no_dir[3]; #endif - float v_a_no_ortho[3], v_b_no_ortho[3]; - add_v3_v3v3(no_mid, no_a, no_b); normalize_v3(no_mid); @@ -200,24 +197,28 @@ static void interp_slerp_co_no_v3( #endif /* axis of slerp */ + bool center_ok = false; cross_v3_v3v3(no_ortho, no_mid, no_dir); - normalize_v3(no_ortho); - - /* create planes */ - cross_v3_v3v3(v_a_no_ortho, no_ortho, no_a); - cross_v3_v3v3(v_b_no_ortho, no_ortho, no_b); - project_v3_plane(v_a_no_ortho, no_ortho, v_a_no_ortho); - project_v3_plane(v_b_no_ortho, no_ortho, v_b_no_ortho); - - plane_from_point_normal_v3(plane_a, co_a, v_a_no_ortho); - plane_from_point_normal_v3(plane_b, co_b, v_b_no_ortho); - plane_from_point_normal_v3(plane_c, co_b, no_ortho); - - /* find the sphere center from 3 planes */ - if (isect_plane_plane_plane_v3(plane_a, plane_b, plane_c, center)) { - /* pass */ + if (normalize_v3(no_ortho) != 0.0f) { + float plane_a[4], plane_b[4], plane_c[4]; + float v_a_no_ortho[3], v_b_no_ortho[3]; + + /* create planes */ + cross_v3_v3v3(v_a_no_ortho, no_ortho, no_a); + cross_v3_v3v3(v_b_no_ortho, no_ortho, no_b); + project_v3_plane(v_a_no_ortho, no_ortho, v_a_no_ortho); + project_v3_plane(v_b_no_ortho, no_ortho, v_b_no_ortho); + + plane_from_point_normal_v3(plane_a, co_a, v_a_no_ortho); + plane_from_point_normal_v3(plane_b, co_b, v_b_no_ortho); + plane_from_point_normal_v3(plane_c, co_b, no_ortho); + + /* find the sphere center from 3 planes */ + if (isect_plane_plane_plane_v3(plane_a, plane_b, plane_c, center)) { + center_ok = true; + } } - else { + if (center_ok == false) { mid_v3_v3v3(center, co_a, co_b); } } diff --git a/tests/python/modules/render_report.py b/tests/python/modules/render_report.py index 8deed25e854..9cff85f3dfa 100755 --- a/tests/python/modules/render_report.py +++ b/tests/python/modules/render_report.py @@ -374,7 +374,7 @@ class Report: testname = test_get_name(filepath) print_message(testname, 'SUCCESS', 'RUN') time_start = time.time() - tmp_filepath = os.path.join(self.output_dir, "tmp") + tmp_filepath = os.path.join(self.output_dir, "tmp_" + testname) error = render_cb(filepath, tmp_filepath) status = "FAIL" |