diff options
author | Sergey Sharybin <sergey.vfx@gmail.com> | 2017-05-19 13:33:28 +0300 |
---|---|---|
committer | Sergey Sharybin <sergey.vfx@gmail.com> | 2017-05-19 13:41:03 +0300 |
commit | 803337f3f64fed240e9adc6f286d5f9d13a5026a (patch) | |
tree | a262b427bd53873be9d18e952f193d66801edac8 /intern/cycles | |
parent | 8e655446d1ec667a08a6d351d1e452fc51f1428a (diff) |
\0;115;0cCycles: Cleanup, use ccl_restrict instead of ccl_restrict_ptr
There were following issues with ccl_restrict_ptr:
- We already had ccl_restrict for all platforms.
- It was secretly adding `const` qualifier to the declaration,
which is quite weird since non-const pointer can also be
declared as restricted.
- We never in Blender are using foo_ptr or FooPtr type definitions,
so not sure why we should introduce such a thing here.
- It is absolutely wrong from semantic point of view to put pointer
into the restrict macro -- const is a part of type, not part of
hint for compiler that some pointer is never aliased.
Diffstat (limited to 'intern/cycles')
-rw-r--r-- | intern/cycles/kernel/filter/filter_features.h | 26 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_features_sse.h | 14 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_cpu.h | 40 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_gpu.h | 20 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_prefilter.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_reconstruction.h | 4 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_transform.h | 4 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_transform_gpu.h | 4 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_transform_sse.h | 4 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_compat_cpu.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_compat_cuda.h | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_compat_opencl.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/filter.cu | 18 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/opencl/filter.cl | 22 | ||||
-rw-r--r-- | intern/cycles/util/util_math_matrix.h | 22 |
15 files changed, 113 insertions, 72 deletions
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h index 41998c792b6..53d703de143 100644 --- a/intern/cycles/kernel/filter/filter_features.h +++ b/intern/cycles/kernel/filter/filter_features.h @@ -28,7 +28,11 @@ pixel_buffer += buffer_w - (high.x - low.x); \ } -ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *features, float ccl_restrict_ptr mean, int pass_stride) +ccl_device_inline void filter_get_features(int2 pixel, + const ccl_global float *ccl_restrict buffer, + float *features, + const float *ccl_restrict mean, + int pass_stride) { features[0] = pixel.x; features[1] = pixel.y; @@ -46,7 +50,11 @@ ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_rest } } -ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *scales, float ccl_restrict_ptr mean, int pass_stride) +ccl_device_inline void filter_get_feature_scales(int2 pixel, + const ccl_global float *ccl_restrict buffer, + float *scales, + const float *ccl_restrict mean, + int pass_stride) { scales[0] = fabsf(pixel.x - mean[0]); scales[1] = fabsf(pixel.y - mean[1]); @@ -70,19 +78,21 @@ ccl_device_inline void filter_calculate_scale(float *scale) scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); } -ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_restrict_ptr buffer, int pass_stride) +ccl_device_inline float3 filter_get_pixel_color(const ccl_global float *ccl_restrict buffer, + int pass_stride) { return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)); } -ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_restrict_ptr buffer, int pass_stride) +ccl_device_inline float filter_get_pixel_variance(const ccl_global float *ccl_restrict buffer, + int pass_stride) { return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2))); } ccl_device_inline void design_row_add(float *design_row, int rank, - ccl_global float ccl_restrict_ptr transform, + const ccl_global float *ccl_restrict transform, int stride, int row, float feature) @@ -94,13 +104,13 @@ ccl_device_inline void design_row_add(float *design_row, /* Fill the design row. */ ccl_device_inline void filter_get_design_row_transform(int2 p_pixel, - ccl_global float ccl_restrict_ptr p_buffer, + const ccl_global float *ccl_restrict p_buffer, int2 q_pixel, - ccl_global float ccl_restrict_ptr q_buffer, + const ccl_global float *ccl_restrict q_buffer, int pass_stride, int rank, float *design_row, - ccl_global float ccl_restrict_ptr transform, + const ccl_global float *ccl_restrict transform, int stride) { design_row[0] = 1.0f; diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h index a242a8ed0a1..ad0978d0c1d 100644 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -33,7 +33,12 @@ CCL_NAMESPACE_BEGIN pixel_buffer += buffer_w - (pixel.x - low.x); \ } -ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride) +ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, + __m128 active_pixels, + const float *ccl_restrict buffer, + __m128 *features, + const __m128 ccl_restrict *mean, + int pass_stride) { features[0] = x; features[1] = y; @@ -53,7 +58,12 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active features[i] = _mm_mask_ps(features[i], active_pixels); } -ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride) +ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, + __m128 active_pixels, + const float *ccl_restrict buffer, + __m128 *scales, + const __m128 *ccl_restrict mean, + int pass_stride) { scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index 1a314b100be..57222811992 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -16,7 +16,15 @@ CCL_NAMESPACE_BEGIN -ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float ccl_restrict_ptr weightImage, float ccl_restrict_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2) +ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, + const float *ccl_restrict weightImage, + const float *ccl_restrict varianceImage, + float *differenceImage, + int4 rect, + int w, + int channel_offset, + float a, + float k_2) { for(int y = rect.y; y < rect.w; y++) { for(int x = rect.x; x < rect.z; x++) { @@ -36,7 +44,11 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float c } } -ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) +ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differenceImage, + float *outImage, + int4 rect, + int w, + int f) { #ifdef __KERNEL_SSE3__ int aligned_lowx = (rect.x & ~(3)); @@ -65,7 +77,11 @@ ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceI } } -ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) +ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict differenceImage, + float *outImage, + int4 rect, + int w, + int f) { for(int y = rect.y; y < rect.w; y++) { for(int x = rect.x; x < rect.z; x++) { @@ -90,7 +106,14 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr diff } } -ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl_restrict_ptr differenceImage, float ccl_restrict_ptr image, float *outImage, float *accumImage, int4 rect, int w, int f) +ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, + const float *ccl_restrict differenceImage, + const float *ccl_restrict image, + float *outImage, + float *accumImage, + int4 rect, + int w, + int f) { for(int y = rect.y; y < rect.w; y++) { for(int x = rect.x; x < rect.z; x++) { @@ -108,8 +131,8 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl } ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, - float ccl_restrict_ptr differenceImage, - float ccl_restrict_ptr buffer, + const float *ccl_restrict differenceImage, + const float *ccl_restrict buffer, float *color_pass, float *variance_pass, float *transform, @@ -151,7 +174,10 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, } } -ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) +ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, + const float *ccl_restrict accumImage, + int4 rect, + int w) { for(int y = rect.y; y < rect.w; y++) { for(int x = rect.x; x < rect.z; x++) { diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h index b5ba7cf51a5..fd0a88340ea 100644 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, int dx, int dy, - ccl_global float ccl_restrict_ptr weightImage, - ccl_global float ccl_restrict_ptr varianceImage, + const ccl_global float *ccl_restrict weightImage, + const ccl_global float *ccl_restrict varianceImage, ccl_global float *differenceImage, int4 rect, int w, int channel_offset, @@ -40,7 +40,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, } ccl_device_inline void kernel_filter_nlm_blur(int x, int y, - ccl_global float ccl_restrict_ptr differenceImage, + const ccl_global float *ccl_restrict differenceImage, ccl_global float *outImage, int4 rect, int w, int f) { @@ -55,7 +55,7 @@ ccl_device_inline void kernel_filter_nlm_blur(int x, int y, } ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, - ccl_global float ccl_restrict_ptr differenceImage, + const ccl_global float *ccl_restrict differenceImage, ccl_global float *outImage, int4 rect, int w, int f) { @@ -71,8 +71,8 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, int dx, int dy, - ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float ccl_restrict_ptr image, + const ccl_global float *ccl_restrict differenceImage, + const ccl_global float *ccl_restrict image, ccl_global float *outImage, ccl_global float *accumImage, int4 rect, int w, int f) @@ -95,11 +95,11 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, int dx, int dy, - ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float ccl_restrict_ptr buffer, + const ccl_global float *ccl_restrict differenceImage, + const ccl_global float *ccl_restrict buffer, ccl_global float *color_pass, ccl_global float *variance_pass, - ccl_global float ccl_restrict_ptr transform, + const ccl_global float *ccl_restrict transform, ccl_global int *rank, ccl_global float *XtWX, ccl_global float3 *XtWY, @@ -138,7 +138,7 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, ccl_global float *outImage, - ccl_global float ccl_restrict_ptr accumImage, + const ccl_global float *ccl_restrict accumImage, int4 rect, int w) { outImage[y*w+x] /= accumImage[y*w+x]; diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h index 252bcc5e675..82cc36625ec 100644 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -44,7 +44,7 @@ ccl_device void kernel_filter_divide_shadow(int sample, int offset = tiles->offsets[tile]; int stride = tiles->strides[tile]; - ccl_global float ccl_restrict_ptr center_buffer = (ccl_global float*) tiles->buffers[tile]; + const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tiles->buffers[tile]; center_buffer += (y*stride + x + offset)*buffer_pass_stride; center_buffer += buffer_denoising_offset + 14; diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h index 6a7c86e4012..13e90f8233b 100644 --- a/intern/cycles/kernel/filter/filter_reconstruction.h +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -21,10 +21,10 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, int dx, int dy, int w, int h, int pass_stride, - ccl_global float ccl_restrict_ptr buffer, + const ccl_global float *ccl_restrict buffer, ccl_global float *color_pass, ccl_global float *variance_pass, - ccl_global float ccl_restrict_ptr transform, + const ccl_global float *ccl_restrict transform, ccl_global int *rank, float weight, ccl_global float *XtWX, diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h index 139dc402d21..4766f225fb1 100644 --- a/intern/cycles/kernel/filter/filter_transform.h +++ b/intern/cycles/kernel/filter/filter_transform.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, +ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, int x, int y, int4 rect, int pass_stride, float *transform, int *rank, @@ -29,7 +29,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, /* Temporary storage, used in different steps of the algorithm. */ float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES]; float tempvector[2*DENOISE_FEATURES]; - float ccl_restrict_ptr pixel_buffer; + const float *ccl_restrict pixel_buffer; int2 pixel; diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h index 68304e14143..2cd21224762 100644 --- a/intern/cycles/kernel/filter/filter_transform_gpu.h +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer, +ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer, int x, int y, int4 rect, int pass_stride, ccl_global float *transform, @@ -38,7 +38,7 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ max(rect.y, y - radius)); int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - ccl_global float ccl_restrict_ptr pixel_buffer; + const ccl_global float *ccl_restrict pixel_buffer; int2 pixel; diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h index ed3a92f6241..9de51e2d86c 100644 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, +ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, int x, int y, int4 rect, int pass_stride, float *transform, int *rank, @@ -25,7 +25,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, int buffer_w = align_up(rect.z - rect.x, 4); __m128 features[DENOISE_FEATURES]; - float ccl_restrict_ptr pixel_buffer; + const float *ccl_restrict pixel_buffer; int2 pixel; int2 low = make_int2(max(rect.x, x - radius), diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 7595e74e2d5..21da180bb8e 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -42,8 +42,6 @@ #include "util/util_types.h" #include "util/util_texture.h" -#define ccl_restrict_ptr const * __restrict - #define ccl_addr_space #define ccl_local_id(d) 0 diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 80d7401fbcf..988126f90e1 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -55,7 +55,6 @@ #define ccl_restrict __restrict__ #define ccl_align(n) __align__(n) -#define ccl_restrict_ptr const * __restrict__ #define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH) diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index 15cf4b81b21..c2263ac0d49 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -50,8 +50,6 @@ # define ccl_addr_space #endif -#define ccl_restrict_ptr const * __restrict__ - #define ccl_local_id(d) get_local_id(d) #define ccl_global_id(d) get_global_id(d) diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index f812a6601c6..30e1414f1e9 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -139,8 +139,8 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_nlm_calc_difference(int dx, int dy, - float ccl_restrict_ptr weightImage, - float ccl_restrict_ptr varianceImage, + const float *ccl_restrict weightImage, + const float *ccl_restrict varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, @@ -154,7 +154,7 @@ kernel_cuda_filter_nlm_calc_difference(int dx, int dy, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) { +kernel_cuda_filter_nlm_blur(const float *ccl_restrict differenceImage, float *outImage, int4 rect, int w, int f) { int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; if(x < rect.z && y < rect.w) { @@ -164,7 +164,7 @@ kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outIm extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) { +kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict differenceImage, float *outImage, int4 rect, int w, int f) { int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; if(x < rect.z && y < rect.w) { @@ -175,8 +175,8 @@ kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_nlm_update_output(int dx, int dy, - float ccl_restrict_ptr differenceImage, - float ccl_restrict_ptr image, + const float *ccl_restrict differenceImage, + const float *ccl_restrict image, float *outImage, float *accumImage, int4 rect, int w, int f) { @@ -189,7 +189,7 @@ kernel_cuda_filter_nlm_update_output(int dx, int dy, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) { +kernel_cuda_filter_nlm_normalize(float *outImage, const float *ccl_restrict accumImage, int4 rect, int w) { int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; if(x < rect.z && y < rect.w) { @@ -200,8 +200,8 @@ kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumIm extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, - float ccl_restrict_ptr differenceImage, - float ccl_restrict_ptr buffer, + const float *ccl_restrict differenceImage, + const float *ccl_restrict buffer, float *color_pass, float *variance_pass, float const* __restrict__ transform, diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index fbc3daa62b9..f7d177b45b0 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -106,7 +106,7 @@ __kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean, } } -__kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer, +__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer, ccl_global float *transform, ccl_global int *rank, int4 filter_area, @@ -132,8 +132,8 @@ __kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restric __kernel void kernel_ocl_filter_nlm_calc_difference(int dx, int dy, - ccl_global float ccl_restrict_ptr weightImage, - ccl_global float ccl_restrict_ptr varianceImage, + const ccl_global float *ccl_restrict weightImage, + const ccl_global float *ccl_restrict varianceImage, ccl_global float *differenceImage, int4 rect, int w, @@ -147,7 +147,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(int dx, } } -__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage, +__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict differenceImage, ccl_global float *outImage, int4 rect, int w, @@ -159,7 +159,7 @@ __kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr diffe } } -__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage, +__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict differenceImage, ccl_global float *outImage, int4 rect, int w, @@ -173,8 +173,8 @@ __kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_pt __kernel void kernel_ocl_filter_nlm_update_output(int dx, int dy, - ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float ccl_restrict_ptr image, + const ccl_global float *ccl_restrict differenceImage, + const ccl_global float *ccl_restrict image, ccl_global float *outImage, ccl_global float *accumImage, int4 rect, @@ -188,7 +188,7 @@ __kernel void kernel_ocl_filter_nlm_update_output(int dx, } __kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage, - ccl_global float ccl_restrict_ptr accumImage, + const ccl_global float *ccl_restrict accumImage, int4 rect, int w) { int x = get_global_id(0) + rect.x; @@ -200,11 +200,11 @@ __kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage, __kernel void kernel_ocl_filter_nlm_construct_gramian(int dx, int dy, - ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float ccl_restrict_ptr buffer, + const ccl_global float *ccl_restrict differenceImage, + const ccl_global float *ccl_restrict buffer, ccl_global float *color_pass, ccl_global float *variance_pass, - ccl_global float ccl_restrict_ptr transform, + const ccl_global float *ccl_restrict transform, ccl_global int *rank, ccl_global float *XtWX, ccl_global float3 *XtWY, diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h index 2172e94a14f..0c58ae5058c 100644 --- a/intern/cycles/util/util_math_matrix.h +++ b/intern/cycles/util/util_math_matrix.h @@ -50,19 +50,19 @@ ccl_device_inline void math_matrix_zero(float *A, int n) /* Elementary vector operations. */ -ccl_device_inline void math_vector_add(float *a, float ccl_restrict_ptr b, int n) +ccl_device_inline void math_vector_add(float *a, const float *ccl_restrict b, int n) { for(int i = 0; i < n; i++) a[i] += b[i]; } -ccl_device_inline void math_vector_mul(float *a, float ccl_restrict_ptr b, int n) +ccl_device_inline void math_vector_mul(float *a, const float *ccl_restrict b, int n) { for(int i = 0; i < n; i++) a[i] *= b[i]; } -ccl_device_inline void math_vector_mul_strided(ccl_global float *a, float ccl_restrict_ptr b, int astride, int n) +ccl_device_inline void math_vector_mul_strided(ccl_global float *a, const float *ccl_restrict b, int astride, int n) { for(int i = 0; i < n; i++) a[i*astride] *= b[i]; @@ -74,7 +74,7 @@ ccl_device_inline void math_vector_scale(float *a, float b, int n) a[i] *= b; } -ccl_device_inline void math_vector_max(float *a, float ccl_restrict_ptr b, int n) +ccl_device_inline void math_vector_max(float *a, const float *ccl_restrict b, int n) { for(int i = 0; i < n; i++) a[i] = max(a[i], b[i]); @@ -105,7 +105,7 @@ ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, f * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */ ccl_device_inline void math_matrix_add_gramian(float *A, int n, - float ccl_restrict_ptr v, + const float *ccl_restrict v, float weight) { for(int row = 0; row < n; row++) @@ -117,7 +117,7 @@ ccl_device_inline void math_matrix_add_gramian(float *A, * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */ ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A, int n, - float ccl_restrict_ptr v, + const float *ccl_restrict v, float weight, int stride) { @@ -342,32 +342,32 @@ ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n) /* Add Gramian matrix of v to A. * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */ -ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, __m128 ccl_restrict_ptr v, __m128 weight) +ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight) { for(int row = 0; row < n; row++) for(int col = 0; col <= row; col++) MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight)); } -ccl_device_inline void math_vector_add_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a) +ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a) { for(int i = 0; i < n; i++) V[i] = _mm_add_ps(V[i], a[i]); } -ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a) +ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a) { for(int i = 0; i < n; i++) V[i] = _mm_mul_ps(V[i], a[i]); } -ccl_device_inline void math_vector_max_sse(__m128 *a, __m128 ccl_restrict_ptr b, int n) +ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n) { for(int i = 0; i < n; i++) a[i] = _mm_max_ps(a[i], b[i]); } -ccl_device_inline void math_matrix_hsum(float *A, int n, __m128 ccl_restrict_ptr B) +ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B) { for(int row = 0; row < n; row++) for(int col = 0; col <= row; col++) |