From e12c08e8d170b7ca40f204a5b0423c23a9fbc2c1 Mon Sep 17 00:00:00 2001
From: Campbell Barton
Date: Wed, 17 Apr 2019 06:17:24 +0200
Subject: ClangFormat: apply to source, most of intern

Apply clang format as proposed in T53211.

For details on usage and instructions for migrating branches without conflicts, see:
https://wiki.blender.org/wiki/Tools/ClangFormat
---
 intern/cycles/kernel/kernels/cpu/filter.cpp        | 2 +-
 intern/cycles/kernel/kernels/cpu/filter_avx.cpp    | 2 +-
 intern/cycles/kernel/kernels/cpu/filter_avx2.cpp   | 2 +-
 intern/cycles/kernel/kernels/cpu/filter_cpu.h      | 42 +-
 intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h | 229 ++---
 intern/cycles/kernel/kernels/cpu/filter_sse2.cpp   | 2 +-
 intern/cycles/kernel/kernels/cpu/filter_sse3.cpp   | 2 +-
 intern/cycles/kernel/kernels/cpu/filter_sse41.cpp  | 2 +-
 intern/cycles/kernel/kernels/cpu/kernel.cpp        | 34 +-
 intern/cycles/kernel/kernels/cpu/kernel_avx.cpp    | 2 +-
 intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp   | 2 +-
 intern/cycles/kernel/kernels/cpu/kernel_cpu.h      | 53 +-
 .../cycles/kernel/kernels/cpu/kernel_cpu_image.h   | 965 ++++++++++-----------
 intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h | 182 ++--
 intern/cycles/kernel/kernels/cpu/kernel_split.cpp  | 2 +-
 .../cycles/kernel/kernels/cpu/kernel_split_avx.cpp | 2 +-
 .../kernel/kernels/cpu/kernel_split_avx2.cpp       | 2 +-
 .../kernel/kernels/cpu/kernel_split_sse2.cpp       | 2 +-
 .../kernel/kernels/cpu/kernel_split_sse3.cpp       | 2 +-
 .../kernel/kernels/cpu/kernel_split_sse41.cpp      | 2 +-
 intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp   | 2 +-
 intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp   | 2 +-
 intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp  | 2 +-
 intern/cycles/kernel/kernels/cuda/kernel_config.h  | 14 +-
 .../cycles/kernel/kernels/cuda/kernel_cuda_image.h | 237 +++--
 .../kernel/kernels/opencl/kernel_opencl_image.h    | 425 ++++-----
 .../kernel/kernels/opencl/kernel_split_function.h  | 59 +-
 27 files changed, 1116 insertions(+), 1158 deletions(-)

(limited to 'intern/cycles/kernel/kernels')

diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp
index 2ff1a392dc3..145a6b6ac40 100644
--- a/intern/cycles/kernel/kernels/cpu/filter.cpp
+++ b/intern/cycles/kernel/kernels/cpu/filter.cpp
@@ -53,7 +53,7 @@ /* quiet unused define warnings */ #if defined(__KERNEL_SSE2__) - /* do nothing */ +/* do nothing */ #endif #include "kernel/filter/filter.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
index 4a9e6047ecf..1d68214c8e7 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
@@ -32,7 +32,7 @@ # define __KERNEL_SSE41__ # define __KERNEL_AVX__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ #include "kernel/filter/filter.h" #define KERNEL_ARCH cpu_avx
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
index c22ec576254..b6709fbc529 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
@@ -33,7 +33,7 @@ # define __KERNEL_AVX__ # define __KERNEL_AVX2__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ #include "kernel/filter/filter.h" #define KERNEL_ARCH cpu_avx2
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
index 02c85562db8..1423b182ab8 100644
---
a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -25,7 +25,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, float *sampleV, float *sampleVV, float *bufferV, - int* prefilter_rect, + int *prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset); @@ -38,7 +38,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, float *mean, float *variance, float scale, - int* prefilter_rect, + int *prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset); @@ -49,9 +49,10 @@ void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample, float *from, float *buffer, int out_offset, - int* prefilter_rect); + int *prefilter_rect); -void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, +void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, + int y, ccl_global float *image, ccl_global float *variance, ccl_global float *depth, @@ -59,22 +60,17 @@ void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, int *rect, int pass_stride); -void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, - float *mean, - float *variance, - float *a, - float *b, - int* prefilter_rect, - int r); +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)( + int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r); -void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float *buffer, TileInfo *tiles, int x, int y, int storage_ofs, float *transform, int *rank, - int* rect, + int *rect, int pass_stride, int frame_stride, bool use_time, @@ -87,24 +83,18 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, float *variance_image, float *scale_image, float *difference_image, - int* rect, + int *rect, int stride, int channel_offset, int frame_offset, float a, float k_2); -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, - float *out_image, - int* rect, - int stride, - int f); +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)( + float *difference_image, float *out_image, int *rect, int stride, int f); -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, - float *out_image, - int* rect, - int stride, - int f); +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)( + float *difference_image, float *out_image, int *rect, int stride, int f); void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, int dy, @@ -113,7 +103,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, float *temp_image, float *out_image, float *accum_image, - int* rect, + int *rect, int channel_offset, int stride, int f); @@ -137,7 +127,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, float *accum_image, - int* rect, + int *rect, int stride); void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h index c29505880cb..3d4cb87e104 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -25,12 +25,12 @@ #include "kernel/filter/filter_kernel.h" #ifdef KERNEL_STUB -# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) +# define STUB_ASSERT(arch, name) \ + assert(!(#name " kernel stub for 
architecture " #arch " was called!")) #endif CCL_NAMESPACE_BEGIN - /* Denoise filter */ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, @@ -42,23 +42,25 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, float *sampleVariance, float *sampleVarianceV, float *bufferVariance, - int* prefilter_rect, + int *prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset) { #ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); + STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); #else - kernel_filter_divide_shadow(sample, tile_info, - x, y, - unfilteredA, - unfilteredB, - sampleVariance, - sampleVarianceV, - bufferVariance, - load_int4(prefilter_rect), - buffer_pass_stride, - buffer_denoising_offset); + kernel_filter_divide_shadow(sample, + tile_info, + x, + y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset); #endif } @@ -68,23 +70,28 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, int v_offset, int x, int y, - float *mean, float *variance, + float *mean, + float *variance, float scale, - int* prefilter_rect, + int *prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset) { #ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_get_feature); + STUB_ASSERT(KERNEL_ARCH, filter_get_feature); #else - kernel_filter_get_feature(sample, tile_info, - m_offset, v_offset, - x, y, - mean, variance, - scale, - load_int4(prefilter_rect), - buffer_pass_stride, - buffer_denoising_offset); + kernel_filter_get_feature(sample, + tile_info, + m_offset, + v_offset, + x, + y, + mean, + variance, + scale, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset); #endif } @@ -95,16 +102,18 @@ void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample, float *from, float *buffer, int out_offset, - int* prefilter_rect) + int *prefilter_rect) { #ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_write_feature); + STUB_ASSERT(KERNEL_ARCH, filter_write_feature); #else - kernel_filter_write_feature(sample, x, y, load_int4(buffer_params), from, buffer, out_offset, load_int4(prefilter_rect)); + kernel_filter_write_feature( + sample, x, y, load_int4(buffer_params), from, buffer, out_offset, load_int4(prefilter_rect)); #endif } -void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, +void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, + int y, ccl_global float *image, ccl_global float *variance, ccl_global float *depth, @@ -113,35 +122,31 @@ void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, int pass_stride) { #ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers); + STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers); #else - kernel_filter_detect_outliers(x, y, image, variance, depth, output, load_int4(rect), pass_stride); + kernel_filter_detect_outliers( + x, y, image, variance, depth, output, load_int4(rect), pass_stride); #endif } -void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, - float *mean, - float *variance, - float *a, - float *b, - int* prefilter_rect, - int r) +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)( + int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r) { #ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_combine_halves); + STUB_ASSERT(KERNEL_ARCH, filter_combine_halves); #else - kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r); + 
kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r); #endif } -void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float *buffer, TileInfo *tile_info, int x, int y, int storage_ofs, float *transform, int *rank, - int* prefilter_rect, + int *prefilter_rect, int pass_stride, int frame_stride, bool use_time, @@ -149,21 +154,22 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, float pca_threshold) { #ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); + STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); #else - rank += storage_ofs; - transform += storage_ofs*TRANSFORM_SIZE; - kernel_filter_construct_transform(buffer, - tile_info, - x, y, - load_int4(prefilter_rect), - pass_stride, - frame_stride, - use_time, - transform, - rank, - radius, - pca_threshold); + rank += storage_ofs; + transform += storage_ofs * TRANSFORM_SIZE; + kernel_filter_construct_transform(buffer, + tile_info, + x, + y, + load_int4(prefilter_rect), + pass_stride, + frame_stride, + use_time, + transform, + rank, + radius, + pca_threshold); #endif } @@ -181,44 +187,40 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, float k_2) { #ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); #else - kernel_filter_nlm_calc_difference(dx, dy, - weight_image, - variance_image, - scale_image, - difference_image, - load_int4(rect), - stride, - channel_offset, - frame_offset, - a, k_2); + kernel_filter_nlm_calc_difference(dx, + dy, + weight_image, + variance_image, + scale_image, + difference_image, + load_int4(rect), + stride, + channel_offset, + frame_offset, + a, + k_2); #endif } -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, - float *out_image, - int *rect, - int stride, - int f) +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)( + float *difference_image, float *out_image, int *rect, int stride, int f) { #ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur); + STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur); #else - kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f); + kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f); #endif } -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, - float *out_image, - int *rect, - int stride, - int f) +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)( + float *difference_image, float *out_image, int *rect, int stride, int f) { #ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight); + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight); #else - kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f); + kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f); #endif } @@ -235,17 +237,19 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, int f) { #ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); + STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); #else - kernel_filter_nlm_update_output(dx, dy, - difference_image, - image, - temp_image, - out_image, - accum_image, - load_int4(rect), - channel_offset, - stride, f); + kernel_filter_nlm_update_output(dx, + dy, + difference_image, + image, + temp_image, + out_image, + accum_image, + load_int4(rect), + channel_offset, + stride, + f); #endif } @@ -267,19 +271,24 @@ 
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, bool use_time) { #ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); + STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); #else - kernel_filter_nlm_construct_gramian(dx, dy, t, - difference_image, - buffer, - transform, rank, - XtWX, XtWY, - load_int4(rect), - load_int4(filter_window), - stride, f, - pass_stride, - frame_offset, - use_time); + kernel_filter_nlm_construct_gramian(dx, + dy, + t, + difference_image, + buffer, + transform, + rank, + XtWX, + XtWY, + load_int4(rect), + load_int4(filter_window), + stride, + f, + pass_stride, + frame_offset, + use_time); #endif } @@ -289,9 +298,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, int stride) { #ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize); + STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize); #else - kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride); + kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride); #endif } @@ -306,12 +315,12 @@ void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, int sample) { #ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_finalize); + STUB_ASSERT(KERNEL_ARCH, filter_finalize); #else - XtWX += storage_ofs*XTWX_SIZE; - XtWY += storage_ofs*XTWY_SIZE; - rank += storage_ofs; - kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample); + XtWX += storage_ofs * XTWX_SIZE; + XtWY += storage_ofs * XTWY_SIZE; + rank += storage_ofs; + kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample); #endif } diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp index f7c9935f1d0..6c6c3e78696 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp @@ -27,7 +27,7 @@ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) # define __KERNEL_SSE2__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ #include "kernel/filter/filter.h" #define KERNEL_ARCH cpu_sse2 diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp index 070b95a3505..e2243000331 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp @@ -29,7 +29,7 @@ # define __KERNEL_SSE3__ # define __KERNEL_SSSE3__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ #include "kernel/filter/filter.h" #define KERNEL_ARCH cpu_sse3 diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp index 254025be4e2..068889365e3 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp @@ -31,7 +31,7 @@ # define __KERNEL_SSSE3__ # define __KERNEL_SSE41__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ #include "kernel/filter/filter.h" #define KERNEL_ARCH cpu_sse41 diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp index de487f6123f..f2146302a27 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp @@ -53,7 +53,7 @@ /* quiet unused define warnings */ #if defined(__KERNEL_SSE2__) - /* do nothing */ +/* do 
nothing */ #endif #include "kernel/kernel.h" @@ -66,29 +66,27 @@ CCL_NAMESPACE_BEGIN void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size) { - if(strcmp(name, "__data") == 0) - memcpy(&kg->__data, host, size); - else - assert(0); + if (strcmp(name, "__data") == 0) + memcpy(&kg->__data, host, size); + else + assert(0); } -void kernel_tex_copy(KernelGlobals *kg, - const char *name, - void *mem, - size_t size) +void kernel_tex_copy(KernelGlobals *kg, const char *name, void *mem, size_t size) { - if(0) { - } + if (0) { + } #define KERNEL_TEX(type, tname) \ - else if(strcmp(name, #tname) == 0) { \ - kg->tname.data = (type*)mem; \ - kg->tname.width = size; \ - } + else if (strcmp(name, #tname) == 0) \ + { \ + kg->tname.data = (type *)mem; \ + kg->tname.width = size; \ + } #include "kernel/kernel_textures.h" - else { - assert(0); - } + else { + assert(0); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp index a645fb4d8dd..0656fc9dd00 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp @@ -32,7 +32,7 @@ # define __KERNEL_SSE41__ # define __KERNEL_AVX__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ #include "kernel/kernel.h" #define KERNEL_ARCH cpu_avx diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp index 6bbb87727b9..5baafdc699e 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp @@ -33,7 +33,7 @@ # define __KERNEL_AVX__ # define __KERNEL_AVX2__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ #include "kernel/kernel.h" #define KERNEL_ARCH cpu_avx2 diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 6bdb8546a24..f5d981fb71a 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -16,25 +16,24 @@ /* Templated common declaration part of all CPU kernels. 
*/ -void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, - float *buffer, - int sample, - int x, int y, - int offset, - int stride); +void KERNEL_FUNCTION_FULL_NAME(path_trace)( + KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride); void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, - int x, int y, - int offset, int stride); + int x, + int y, + int offset, + int stride); void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, - int x, int y, + int x, + int y, int offset, int stride); @@ -49,24 +48,28 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, /* Split kernels */ -void KERNEL_FUNCTION_FULL_NAME(data_init)( - KernelGlobals *kg, - ccl_constant KernelData *data, - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - int start_sample, - int end_sample, - int sx, int sy, int sw, int sh, int offset, int stride, - ccl_global int *Queue_index, - int queuesize, - ccl_global char *use_queues_flag, - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, - ccl_global float *buffer); +void KERNEL_FUNCTION_FULL_NAME(data_init)(KernelGlobals *kg, + ccl_constant KernelData *data, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + int start_sample, + int end_sample, + int sx, + int sy, + int sw, + int sh, + int offset, + int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer); #define DECLARE_SPLIT_KERNEL_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData *data); + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * data); DECLARE_SPLIT_KERNEL_FUNCTION(path_init) DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect) diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h index ae4fd85780d..4289e2bbb85 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h @@ -19,523 +19,508 @@ CCL_NAMESPACE_BEGIN -template struct TextureInterpolator { +template struct TextureInterpolator { #define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ - { \ - u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \ - u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \ - u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \ - u[3] = (1.0f / 6.0f) * t * t * t; \ - } (void) 0 - - static ccl_always_inline float4 read(float4 r) - { - return r; - } - - static ccl_always_inline float4 read(uchar4 r) - { - float f = 1.0f / 255.0f; - return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); - } - - static ccl_always_inline float4 read(uchar r) - { - float f = r * (1.0f / 255.0f); - return make_float4(f, f, f, 1.0f); - } - - static ccl_always_inline float4 read(float r) - { - /* TODO(dingto): Optimize this, so interpolation - * happens on float instead of float4 */ - return make_float4(r, r, r, 1.0f); - } - - static ccl_always_inline float4 read(half4 r) - { - return half4_to_float4(r); - } - - static ccl_always_inline float4 read(half r) - { - float f = half_to_float(r); - return make_float4(f, f, f, 1.0f); - } - - static ccl_always_inline float4 read(uint16_t r) - { - float f = r*(1.0f/65535.0f); - return make_float4(f, f, f, 1.0f); - } - - static 
ccl_always_inline float4 read(ushort4 r) - { - float f = 1.0f/65535.0f; - return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); - } - - static ccl_always_inline float4 read(const T *data, - int x, int y, - int width, int height) - { - if(x < 0 || y < 0 || x >= width || y >= height) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - return read(data[y * width + x]); - } - - static ccl_always_inline int wrap_periodic(int x, int width) - { - x %= width; - if(x < 0) - x += width; - return x; - } - - static ccl_always_inline int wrap_clamp(int x, int width) - { - return clamp(x, 0, width-1); - } - - static ccl_always_inline float frac(float x, int *ix) - { - int i = float_to_int(x) - ((x < 0.0f)? 1: 0); - *ix = i; - return x - (float)i; - } - - /* ******** 2D interpolation ******** */ - - static ccl_always_inline float4 interp_closest(const TextureInfo& info, - float x, float y) - { - const T *data = (const T*)info.data; - const int width = info.width; - const int height = info.height; - int ix, iy; - frac(x*(float)width, &ix); - frac(y*(float)height, &iy); - switch(info.extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - ATTR_FALLTHROUGH; - case EXTENSION_EXTEND: - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - return read(data[ix + iy*width]); - } - - static ccl_always_inline float4 interp_linear(const TextureInfo& info, - float x, float y) - { - const T *data = (const T*)info.data; - const int width = info.width; - const int height = info.height; - int ix, iy, nix, niy; - const float tx = frac(x*(float)width - 0.5f, &ix); - const float ty = frac(y*(float)height - 0.5f, &iy); - switch(info.extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - break; - case EXTENSION_CLIP: - nix = ix + 1; - niy = iy + 1; - break; - case EXTENSION_EXTEND: - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - return (1.0f - ty) * (1.0f - tx) * read(data, ix, iy, width, height) + - (1.0f - ty) * tx * read(data, nix, iy, width, height) + - ty * (1.0f - tx) * read(data, ix, niy, width, height) + - ty * tx * read(data, nix, niy, width, height); - } - - static ccl_always_inline float4 interp_cubic(const TextureInfo& info, - float x, float y) - { - const T *data = (const T*)info.data; - const int width = info.width; - const int height = info.height; - int ix, iy, nix, niy; - const float tx = frac(x*(float)width - 0.5f, &ix); - const float ty = frac(y*(float)height - 0.5f, &iy); - int pix, piy, nnix, nniy; - switch(info.extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - pix = wrap_periodic(ix-1, width); - piy = wrap_periodic(iy-1, height); - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - nnix = wrap_periodic(ix+2, width); - nniy = wrap_periodic(iy+2, height); - break; - case EXTENSION_CLIP: - pix = ix - 1; - piy = iy - 1; - nix = ix + 1; - niy = iy + 1; - nnix = ix + 2; - nniy = iy + 2; - break; - case EXTENSION_EXTEND: - pix = wrap_clamp(ix-1, width); - piy = 
wrap_clamp(iy-1, height); - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - nnix = wrap_clamp(ix+2, width); - nniy = wrap_clamp(iy+2, height); - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - const int xc[4] = {pix, ix, nix, nnix}; - const int yc[4] = {piy, iy, niy, nniy}; - float u[4], v[4]; - /* Some helper macro to keep code reasonable size, - * let compiler to inline all the matrix multiplications. - */ + { \ + u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \ + u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \ + u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \ + u[3] = (1.0f / 6.0f) * t * t * t; \ + } \ + (void)0 + + static ccl_always_inline float4 read(float4 r) + { + return r; + } + + static ccl_always_inline float4 read(uchar4 r) + { + float f = 1.0f / 255.0f; + return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); + } + + static ccl_always_inline float4 read(uchar r) + { + float f = r * (1.0f / 255.0f); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline float4 read(float r) + { + /* TODO(dingto): Optimize this, so interpolation + * happens on float instead of float4 */ + return make_float4(r, r, r, 1.0f); + } + + static ccl_always_inline float4 read(half4 r) + { + return half4_to_float4(r); + } + + static ccl_always_inline float4 read(half r) + { + float f = half_to_float(r); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline float4 read(uint16_t r) + { + float f = r * (1.0f / 65535.0f); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline float4 read(ushort4 r) + { + float f = 1.0f / 65535.0f; + return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); + } + + static ccl_always_inline float4 read(const T *data, int x, int y, int width, int height) + { + if (x < 0 || y < 0 || x >= width || y >= height) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return read(data[y * width + x]); + } + + static ccl_always_inline int wrap_periodic(int x, int width) + { + x %= width; + if (x < 0) + x += width; + return x; + } + + static ccl_always_inline int wrap_clamp(int x, int width) + { + return clamp(x, 0, width - 1); + } + + static ccl_always_inline float frac(float x, int *ix) + { + int i = float_to_int(x) - ((x < 0.0f) ? 
1 : 0); + *ix = i; + return x - (float)i; + } + + /* ******** 2D interpolation ******** */ + + static ccl_always_inline float4 interp_closest(const TextureInfo &info, float x, float y) + { + const T *data = (const T *)info.data; + const int width = info.width; + const int height = info.height; + int ix, iy; + frac(x * (float)width, &ix); + frac(y * (float)height, &iy); + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + break; + case EXTENSION_CLIP: + if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return read(data[ix + iy * width]); + } + + static ccl_always_inline float4 interp_linear(const TextureInfo &info, float x, float y) + { + const T *data = (const T *)info.data; + const int width = info.width; + const int height = info.height; + int ix, iy, nix, niy; + const float tx = frac(x * (float)width - 0.5f, &ix); + const float ty = frac(y * (float)height - 0.5f, &iy); + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + nix = wrap_periodic(ix + 1, width); + niy = wrap_periodic(iy + 1, height); + break; + case EXTENSION_CLIP: + nix = ix + 1; + niy = iy + 1; + break; + case EXTENSION_EXTEND: + nix = wrap_clamp(ix + 1, width); + niy = wrap_clamp(iy + 1, height); + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return (1.0f - ty) * (1.0f - tx) * read(data, ix, iy, width, height) + + (1.0f - ty) * tx * read(data, nix, iy, width, height) + + ty * (1.0f - tx) * read(data, ix, niy, width, height) + + ty * tx * read(data, nix, niy, width, height); + } + + static ccl_always_inline float4 interp_cubic(const TextureInfo &info, float x, float y) + { + const T *data = (const T *)info.data; + const int width = info.width; + const int height = info.height; + int ix, iy, nix, niy; + const float tx = frac(x * (float)width - 0.5f, &ix); + const float ty = frac(y * (float)height - 0.5f, &iy); + int pix, piy, nnix, nniy; + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + pix = wrap_periodic(ix - 1, width); + piy = wrap_periodic(iy - 1, height); + nix = wrap_periodic(ix + 1, width); + niy = wrap_periodic(iy + 1, height); + nnix = wrap_periodic(ix + 2, width); + nniy = wrap_periodic(iy + 2, height); + break; + case EXTENSION_CLIP: + pix = ix - 1; + piy = iy - 1; + nix = ix + 1; + niy = iy + 1; + nnix = ix + 2; + nniy = iy + 2; + break; + case EXTENSION_EXTEND: + pix = wrap_clamp(ix - 1, width); + piy = wrap_clamp(iy - 1, height); + nix = wrap_clamp(ix + 1, width); + niy = wrap_clamp(iy + 1, height); + nnix = wrap_clamp(ix + 2, width); + nniy = wrap_clamp(iy + 2, height); + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {piy, iy, niy, nniy}; + float u[4], v[4]; + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. 
+ */ #define DATA(x, y) (read(data, xc[x], yc[y], width, height)) #define TERM(col) \ - (v[col] * (u[0] * DATA(0, col) + \ - u[1] * DATA(1, col) + \ - u[2] * DATA(2, col) + \ - u[3] * DATA(3, col))) + (v[col] * \ + (u[0] * DATA(0, col) + u[1] * DATA(1, col) + u[2] * DATA(2, col) + u[3] * DATA(3, col))) - SET_CUBIC_SPLINE_WEIGHTS(u, tx); - SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); - /* Actual interpolation. */ - return TERM(0) + TERM(1) + TERM(2) + TERM(3); + /* Actual interpolation. */ + return TERM(0) + TERM(1) + TERM(2) + TERM(3); #undef TERM #undef DATA - } - - static ccl_always_inline float4 interp(const TextureInfo& info, - float x, float y) - { - if(UNLIKELY(!info.data)) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - switch(info.interpolation) { - case INTERPOLATION_CLOSEST: - return interp_closest(info, x, y); - case INTERPOLATION_LINEAR: - return interp_linear(info, x, y); - default: - return interp_cubic(info, x, y); - } - } - - /* ******** 3D interpolation ******** */ - - static ccl_always_inline float4 interp_3d_closest(const TextureInfo& info, - float x, float y, float z) - { - int width = info.width; - int height = info.height; - int depth = info.depth; - int ix, iy, iz; - - frac(x*(float)width, &ix); - frac(y*(float)height, &iy); - frac(z*(float)depth, &iz); - - switch(info.extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - iz = wrap_periodic(iz, depth); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - ATTR_FALLTHROUGH; - case EXTENSION_EXTEND: - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - iz = wrap_clamp(iz, depth); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - const T *data = (const T*)info.data; - return read(data[ix + iy*width + iz*width*height]); - } - - static ccl_always_inline float4 interp_3d_linear(const TextureInfo& info, - float x, float y, float z) - { - int width = info.width; - int height = info.height; - int depth = info.depth; - int ix, iy, iz; - int nix, niy, niz; - - float tx = frac(x*(float)width - 0.5f, &ix); - float ty = frac(y*(float)height - 0.5f, &iy); - float tz = frac(z*(float)depth - 0.5f, &iz); - - switch(info.extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - iz = wrap_periodic(iz, depth); - - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - niz = wrap_periodic(iz+1, depth); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - ATTR_FALLTHROUGH; - case EXTENSION_EXTEND: - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - niz = wrap_clamp(iz+1, depth); - - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - iz = wrap_clamp(iz, depth); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - const T *data = (const T*)info.data; - float4 r; - - r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]); - r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]); - r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]); - r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]); - - r += tz*(1.0f - ty)*(1.0f - 
tx)*read(data[ix + iy*width + niz*width*height]); - r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]); - r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]); - r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]); - - return r; - } - - /* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 are - * causing stack overflow issue in this function unless it is inlined. - * - * Only happens for AVX2 kernel and global __KERNEL_SSE__ vectorization - * enabled. - */ + } + + static ccl_always_inline float4 interp(const TextureInfo &info, float x, float y) + { + if (UNLIKELY(!info.data)) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + switch (info.interpolation) { + case INTERPOLATION_CLOSEST: + return interp_closest(info, x, y); + case INTERPOLATION_LINEAR: + return interp_linear(info, x, y); + default: + return interp_cubic(info, x, y); + } + } + + /* ******** 3D interpolation ******** */ + + static ccl_always_inline float4 interp_3d_closest(const TextureInfo &info, + float x, + float y, + float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + + frac(x * (float)width, &ix); + frac(y * (float)height, &iy); + frac(z * (float)depth, &iz); + + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + break; + case EXTENSION_CLIP: + if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const T *data = (const T *)info.data; + return read(data[ix + iy * width + iz * width * height]); + } + + static ccl_always_inline float4 interp_3d_linear(const TextureInfo &info, + float x, + float y, + float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + int nix, niy, niz; + + float tx = frac(x * (float)width - 0.5f, &ix); + float ty = frac(y * (float)height - 0.5f, &iy); + float tz = frac(z * (float)depth - 0.5f, &iz); + + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + nix = wrap_periodic(ix + 1, width); + niy = wrap_periodic(iy + 1, height); + niz = wrap_periodic(iz + 1, depth); + break; + case EXTENSION_CLIP: + if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + nix = wrap_clamp(ix + 1, width); + niy = wrap_clamp(iy + 1, height); + niz = wrap_clamp(iz + 1, depth); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const T *data = (const T *)info.data; + float4 r; + + r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) * + read(data[ix + iy * width + iz * width * height]); + r += (1.0f - tz) * (1.0f - ty) * tx * read(data[nix + iy * width + iz * width * height]); + r += (1.0f - tz) * ty * (1.0f - tx) * read(data[ix + niy * width + iz * width * height]); + r += (1.0f - tz) * ty * tx * read(data[nix + niy * width + iz * width * height]); + + r += tz * (1.0f - ty) * (1.0f - tx) * read(data[ix + iy * 
width + niz * width * height]); + r += tz * (1.0f - ty) * tx * read(data[nix + iy * width + niz * width * height]); + r += tz * ty * (1.0f - tx) * read(data[ix + niy * width + niz * width * height]); + r += tz * ty * tx * read(data[nix + niy * width + niz * width * height]); + + return r; + } + + /* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 are + * causing stack overflow issue in this function unless it is inlined. + * + * Only happens for AVX2 kernel and global __KERNEL_SSE__ vectorization + * enabled. + */ #if defined(__GNUC__) || defined(__clang__) - static ccl_always_inline + static ccl_always_inline #else - static ccl_never_inline + static ccl_never_inline #endif - float4 interp_3d_tricubic(const TextureInfo& info, float x, float y, float z) - { - int width = info.width; - int height = info.height; - int depth = info.depth; - int ix, iy, iz; - int nix, niy, niz; - /* Tricubic b-spline interpolation. */ - const float tx = frac(x*(float)width - 0.5f, &ix); - const float ty = frac(y*(float)height - 0.5f, &iy); - const float tz = frac(z*(float)depth - 0.5f, &iz); - int pix, piy, piz, nnix, nniy, nniz; - - switch(info.extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - iz = wrap_periodic(iz, depth); - - pix = wrap_periodic(ix-1, width); - piy = wrap_periodic(iy-1, height); - piz = wrap_periodic(iz-1, depth); - - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - niz = wrap_periodic(iz+1, depth); - - nnix = wrap_periodic(ix+2, width); - nniy = wrap_periodic(iy+2, height); - nniz = wrap_periodic(iz+2, depth); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - ATTR_FALLTHROUGH; - case EXTENSION_EXTEND: - pix = wrap_clamp(ix-1, width); - piy = wrap_clamp(iy-1, height); - piz = wrap_clamp(iz-1, depth); - - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - niz = wrap_clamp(iz+1, depth); - - nnix = wrap_clamp(ix+2, width); - nniy = wrap_clamp(iy+2, height); - nniz = wrap_clamp(iz+2, depth); - - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - iz = wrap_clamp(iz, depth); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - const int xc[4] = {pix, ix, nix, nnix}; - const int yc[4] = {width * piy, - width * iy, - width * niy, - width * nniy}; - const int zc[4] = {width * height * piz, - width * height * iz, - width * height * niz, - width * height * nniz}; - float u[4], v[4], w[4]; - - /* Some helper macro to keep code reasonable size, - * let compiler to inline all the matrix multiplications. - */ + float4 + interp_3d_tricubic(const TextureInfo &info, float x, float y, float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + int nix, niy, niz; + /* Tricubic b-spline interpolation. 
*/ + const float tx = frac(x * (float)width - 0.5f, &ix); + const float ty = frac(y * (float)height - 0.5f, &iy); + const float tz = frac(z * (float)depth - 0.5f, &iz); + int pix, piy, piz, nnix, nniy, nniz; + + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + pix = wrap_periodic(ix - 1, width); + piy = wrap_periodic(iy - 1, height); + piz = wrap_periodic(iz - 1, depth); + + nix = wrap_periodic(ix + 1, width); + niy = wrap_periodic(iy + 1, height); + niz = wrap_periodic(iz + 1, depth); + + nnix = wrap_periodic(ix + 2, width); + nniy = wrap_periodic(iy + 2, height); + nniz = wrap_periodic(iz + 2, depth); + break; + case EXTENSION_CLIP: + if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + pix = wrap_clamp(ix - 1, width); + piy = wrap_clamp(iy - 1, height); + piz = wrap_clamp(iz - 1, depth); + + nix = wrap_clamp(ix + 1, width); + niy = wrap_clamp(iy + 1, height); + niz = wrap_clamp(iz + 1, depth); + + nnix = wrap_clamp(ix + 2, width); + nniy = wrap_clamp(iy + 2, height); + nniz = wrap_clamp(iz + 2, depth); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {width * piy, width * iy, width * niy, width * nniy}; + const int zc[4] = { + width * height * piz, width * height * iz, width * height * niz, width * height * nniz}; + float u[4], v[4], w[4]; + + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ #define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]])) #define COL_TERM(col, row) \ - (v[col] * (u[0] * DATA(0, col, row) + \ - u[1] * DATA(1, col, row) + \ - u[2] * DATA(2, col, row) + \ - u[3] * DATA(3, col, row))) + (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \ + u[3] * DATA(3, col, row))) #define ROW_TERM(row) \ - (w[row] * (COL_TERM(0, row) + \ - COL_TERM(1, row) + \ - COL_TERM(2, row) + \ - COL_TERM(3, row))) + (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row))) - SET_CUBIC_SPLINE_WEIGHTS(u, tx); - SET_CUBIC_SPLINE_WEIGHTS(v, ty); - SET_CUBIC_SPLINE_WEIGHTS(w, tz); + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); - /* Actual interpolation. */ - const T *data = (const T*)info.data; - return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + /* Actual interpolation. */ + const T *data = (const T *)info.data; + return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); #undef COL_TERM #undef ROW_TERM #undef DATA - } - - static ccl_always_inline float4 interp_3d(const TextureInfo& info, - float x, float y, float z, - InterpolationType interp) - { - if(UNLIKELY(!info.data)) - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - switch((interp == INTERPOLATION_NONE)? 
info.interpolation: interp) { - case INTERPOLATION_CLOSEST: - return interp_3d_closest(info, x, y, z); - case INTERPOLATION_LINEAR: - return interp_3d_linear(info, x, y, z); - default: - return interp_3d_tricubic(info, x, y, z); - } - } + } + + static ccl_always_inline float4 + interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp) + { + if (UNLIKELY(!info.data)) + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + switch ((interp == INTERPOLATION_NONE) ? info.interpolation : interp) { + case INTERPOLATION_CLOSEST: + return interp_3d_closest(info, x, y, z); + case INTERPOLATION_LINEAR: + return interp_3d_linear(info, x, y, z); + default: + return interp_3d_tricubic(info, x, y, z); + } + } #undef SET_CUBIC_SPLINE_WEIGHTS }; ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) { - const TextureInfo& info = kernel_tex_fetch(__texture_info, id); - - switch(kernel_tex_type(id)) { - case IMAGE_DATA_TYPE_HALF: - return TextureInterpolator::interp(info, x, y); - case IMAGE_DATA_TYPE_BYTE: - return TextureInterpolator::interp(info, x, y); - case IMAGE_DATA_TYPE_USHORT: - return TextureInterpolator::interp(info, x, y); - case IMAGE_DATA_TYPE_FLOAT: - return TextureInterpolator::interp(info, x, y); - case IMAGE_DATA_TYPE_HALF4: - return TextureInterpolator::interp(info, x, y); - case IMAGE_DATA_TYPE_BYTE4: - return TextureInterpolator::interp(info, x, y); - case IMAGE_DATA_TYPE_USHORT4: - return TextureInterpolator::interp(info, x, y); - case IMAGE_DATA_TYPE_FLOAT4: - return TextureInterpolator::interp(info, x, y); - default: - assert(0); - return make_float4(TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A); - } + const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + + switch (kernel_tex_type(id)) { + case IMAGE_DATA_TYPE_HALF: + return TextureInterpolator::interp(info, x, y); + case IMAGE_DATA_TYPE_BYTE: + return TextureInterpolator::interp(info, x, y); + case IMAGE_DATA_TYPE_USHORT: + return TextureInterpolator::interp(info, x, y); + case IMAGE_DATA_TYPE_FLOAT: + return TextureInterpolator::interp(info, x, y); + case IMAGE_DATA_TYPE_HALF4: + return TextureInterpolator::interp(info, x, y); + case IMAGE_DATA_TYPE_BYTE4: + return TextureInterpolator::interp(info, x, y); + case IMAGE_DATA_TYPE_USHORT4: + return TextureInterpolator::interp(info, x, y); + case IMAGE_DATA_TYPE_FLOAT4: + return TextureInterpolator::interp(info, x, y); + default: + assert(0); + return make_float4( + TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A); + } } -ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp) +ccl_device float4 kernel_tex_image_interp_3d( + KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp) { - const TextureInfo& info = kernel_tex_fetch(__texture_info, id); - - switch(kernel_tex_type(id)) { - case IMAGE_DATA_TYPE_HALF: - return TextureInterpolator::interp_3d(info, x, y, z, interp); - case IMAGE_DATA_TYPE_BYTE: - return TextureInterpolator::interp_3d(info, x, y, z, interp); - case IMAGE_DATA_TYPE_USHORT: - return TextureInterpolator::interp_3d(info, x, y, z, interp); - case IMAGE_DATA_TYPE_FLOAT: - return TextureInterpolator::interp_3d(info, x, y, z, interp); - case IMAGE_DATA_TYPE_HALF4: - return TextureInterpolator::interp_3d(info, x, y, z, interp); - case IMAGE_DATA_TYPE_BYTE4: - return TextureInterpolator::interp_3d(info, x, y, z, interp); - case 
IMAGE_DATA_TYPE_USHORT4: - return TextureInterpolator::interp_3d(info, x, y, z, interp); - case IMAGE_DATA_TYPE_FLOAT4: - return TextureInterpolator::interp_3d(info, x, y, z, interp); - default: - assert(0); - return make_float4(TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A); - } + const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + + switch (kernel_tex_type(id)) { + case IMAGE_DATA_TYPE_HALF: + return TextureInterpolator::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_BYTE: + return TextureInterpolator::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_USHORT: + return TextureInterpolator::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_FLOAT: + return TextureInterpolator::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_HALF4: + return TextureInterpolator::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_BYTE4: + return TextureInterpolator::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_USHORT4: + return TextureInterpolator::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_FLOAT4: + return TextureInterpolator::interp_3d(info, x, y, z, interp); + default: + assert(0); + return make_float4( + TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 759b7e4c20d..9ca3f46b5b6 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -58,14 +58,15 @@ # include "kernel/split/kernel_next_iteration_setup.h" # include "kernel/split/kernel_indirect_subsurface.h" # include "kernel/split/kernel_buffer_update.h" -# endif /* __SPLIT_KERNEL__ */ +# endif /* __SPLIT_KERNEL__ */ #else -# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) +# define STUB_ASSERT(arch, name) \ + assert(!(#name " kernel stub for architecture " #arch " was called!")) # ifdef __SPLIT_KERNEL__ # include "kernel/split/kernel_data_init.h" -# endif /* __SPLIT_KERNEL__ */ -#endif /* KERNEL_STUB */ +# endif /* __SPLIT_KERNEL__ */ +#endif /* KERNEL_STUB */ CCL_NAMESPACE_BEGIN @@ -73,31 +74,22 @@ CCL_NAMESPACE_BEGIN /* Path Tracing */ -void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, - float *buffer, - int sample, - int x, int y, - int offset, - int stride) +void KERNEL_FUNCTION_FULL_NAME(path_trace)( + KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride) { -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, path_trace); -#else -# ifdef __BRANCHED_PATH__ - if(kernel_data.integrator.branched) { - kernel_branched_path_trace(kg, - buffer, - sample, - x, y, - offset, - stride); - } - else -# endif - { - kernel_path_trace(kg, buffer, sample, x, y, offset, stride); - } -#endif /* KERNEL_STUB */ +# ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, path_trace); +# else +# ifdef __BRANCHED_PATH__ + if (kernel_data.integrator.branched) { + kernel_branched_path_trace(kg, buffer, sample, x, y, offset, stride); + } + else +# endif + { + kernel_path_trace(kg, buffer, sample, x, y, offset, stride); + } +# endif /* KERNEL_STUB */ } /* Film */ @@ -106,42 +98,32 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, - int x, int y, + int x, + int y, int offset, int stride) { -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, convert_to_byte); -#else - 
kernel_film_convert_to_byte(kg, - rgba, - buffer, - sample_scale, - x, y, - offset, - stride); -#endif /* KERNEL_STUB */ +# ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_byte); +# else + kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); +# endif /* KERNEL_STUB */ } void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, - int x, int y, + int x, + int y, int offset, int stride) { -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, convert_to_half_float); -#else - kernel_film_convert_to_half_float(kg, - rgba, - buffer, - sample_scale, - x, y, - offset, - stride); -#endif /* KERNEL_STUB */ +# ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_half_float); +# else + kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); +# endif /* KERNEL_STUB */ } /* Shader Evaluate */ @@ -155,60 +137,53 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, int offset, int sample) { -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, shader); -#else - if(type >= SHADER_EVAL_BAKE) { -# ifdef __BAKING__ - kernel_bake_evaluate(kg, - input, - output, - (ShaderEvalType)type, - filter, - i, - offset, - sample); -# endif - } - else if(type == SHADER_EVAL_DISPLACE) { - kernel_displace_evaluate(kg, input, output, i); - } - else { - kernel_background_evaluate(kg, input, output, i); - } -#endif /* KERNEL_STUB */ +# ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader); +# else + if (type >= SHADER_EVAL_BAKE) { +# ifdef __BAKING__ + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, filter, i, offset, sample); +# endif + } + else if (type == SHADER_EVAL_DISPLACE) { + kernel_displace_evaluate(kg, input, output, i); + } + else { + kernel_background_evaluate(kg, input, output, i); + } +# endif /* KERNEL_STUB */ } -#else /* __SPLIT_KERNEL__ */ +#else /* __SPLIT_KERNEL__ */ /* Split Kernel Path Tracing */ -#ifdef KERNEL_STUB -# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ - { \ - STUB_ASSERT(KERNEL_ARCH, name); \ - } - -# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ - { \ - STUB_ASSERT(KERNEL_ARCH, name); \ - } -#else -# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ - { \ - kernel_##name(kg); \ - } - -# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ - { \ - ccl_local type locals; \ - kernel_##name(kg, &locals); \ - } -#endif /* KERNEL_STUB */ +# ifdef KERNEL_STUB +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \ + { \ + STUB_ASSERT(KERNEL_ARCH, name); \ + } + +# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \ + { \ + STUB_ASSERT(KERNEL_ARCH, name); \ + } +# else +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \ + { \ + kernel_##name(kg); \ + } + +# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \ + { \ + ccl_local type locals; \ + kernel_##name(kg, &locals); \ + } +# endif /* KERNEL_STUB */ 
DEFINE_SPLIT_KERNEL_FUNCTION(path_init) DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) @@ -219,7 +194,8 @@ DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, + BackgroundAOLocals) DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) @@ -228,7 +204,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) -#endif /* __SPLIT_KERNEL__ */ +#endif /* __SPLIT_KERNEL__ */ #undef KERNEL_STUB #undef STUB_ASSERT diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp index c5e199b0a69..989f5e5aaa8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp @@ -54,7 +54,7 @@ /* quiet unused define warnings */ #if defined(__KERNEL_SSE2__) - /* do nothing */ +/* do nothing */ #endif #include "kernel/kernel.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp index 6ba3425a343..1b2e2516751 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp @@ -34,7 +34,7 @@ # define __KERNEL_SSE41__ # define __KERNEL_AVX__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ #include "kernel/kernel.h" #define KERNEL_ARCH cpu_avx diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp index 76b2d77ebb8..43b8bfbf864 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp @@ -35,7 +35,7 @@ # define __KERNEL_AVX__ # define __KERNEL_AVX2__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ #include "kernel/kernel.h" #define KERNEL_ARCH cpu_avx2 diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp index b468b6f44c8..9743789179d 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp @@ -29,7 +29,7 @@ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) # define __KERNEL_SSE2__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ #include "kernel/kernel.h" #define KERNEL_ARCH cpu_sse2 diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp index 3e5792d0b17..1bec7633500 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp @@ -31,7 +31,7 @@ # define __KERNEL_SSE3__ # define __KERNEL_SSSE3__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ #include "kernel/kernel.h" #define KERNEL_ARCH cpu_sse3 diff --git 
a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp index 3629f21cd29..c0efc2350e9 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp @@ -32,7 +32,7 @@ # define __KERNEL_SSSE3__ # define __KERNEL_SSE41__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ #include "kernel/kernel.h" #define KERNEL_ARCH cpu_sse41 diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp index 57530c88710..173be8e93ce 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp @@ -27,7 +27,7 @@ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) # define __KERNEL_SSE2__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ #include "kernel/kernel.h" #define KERNEL_ARCH cpu_sse2 diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp index c607753bc4b..31273fe3344 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp @@ -29,7 +29,7 @@ # define __KERNEL_SSE3__ # define __KERNEL_SSSE3__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ #include "kernel/kernel.h" #define KERNEL_ARCH cpu_sse3 diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp index a278554731c..1d020b7fee6 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp @@ -30,7 +30,7 @@ # define __KERNEL_SSSE3__ # define __KERNEL_SSE41__ # endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ #include "kernel/kernel.h" #define KERNEL_ARCH cpu_sse41 diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h index 6d41dc15785..d9f349837a8 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_config.h +++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h @@ -81,7 +81,6 @@ # define CUDA_KERNEL_MAX_REGISTERS 64 # define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 72 - /* unknown architecture */ #else # error "Unknown or unsupported CUDA architecture, can't determine launch bounds" @@ -96,18 +95,19 @@ * given the maximum number of registers per thread. 
*/ #define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ - __launch_bounds__( \ - threads_block_width*threads_block_width, \ - CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ - ) + __launch_bounds__(threads_block_width *threads_block_width, \ + CUDA_MULTIPRESSOR_MAX_REGISTERS / \ + (threads_block_width * threads_block_width * thread_num_registers)) /* sanity checks */ -#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS +#if CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS # error "Maximum number of threads per block exceeded" #endif -#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS +#if CUDA_MULTIPRESSOR_MAX_REGISTERS / \ + (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH * CUDA_KERNEL_MAX_REGISTERS) > \ + CUDA_MULTIPROCESSOR_MAX_BLOCKS # error "Maximum number of blocks per multiprocessor exceeded" #endif diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h index 37cfbbcb235..7c68f08ea10 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h +++ b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h @@ -17,174 +17,165 @@ /* w0, w1, w2, and w3 are the four cubic B-spline basis functions. */ ccl_device float cubic_w0(float a) { - return (1.0f/6.0f)*(a*(a*(-a + 3.0f) - 3.0f) + 1.0f); + return (1.0f / 6.0f) * (a * (a * (-a + 3.0f) - 3.0f) + 1.0f); } ccl_device float cubic_w1(float a) { - return (1.0f/6.0f)*(a*a*(3.0f*a - 6.0f) + 4.0f); + return (1.0f / 6.0f) * (a * a * (3.0f * a - 6.0f) + 4.0f); } ccl_device float cubic_w2(float a) { - return (1.0f/6.0f)*(a*(a*(-3.0f*a + 3.0f) + 3.0f) + 1.0f); + return (1.0f / 6.0f) * (a * (a * (-3.0f * a + 3.0f) + 3.0f) + 1.0f); } ccl_device float cubic_w3(float a) { - return (1.0f/6.0f)*(a*a*a); + return (1.0f / 6.0f) * (a * a * a); } /* g0 and g1 are the two amplitude functions. */ ccl_device float cubic_g0(float a) { - return cubic_w0(a) + cubic_w1(a); + return cubic_w0(a) + cubic_w1(a); } ccl_device float cubic_g1(float a) { - return cubic_w2(a) + cubic_w3(a); + return cubic_w2(a) + cubic_w3(a); } /* h0 and h1 are the two offset functions */ ccl_device float cubic_h0(float a) { - /* Note +0.5 offset to compensate for CUDA linear filtering convention. */ - return -1.0f + cubic_w1(a) / (cubic_w0(a) + cubic_w1(a)) + 0.5f; + /* Note +0.5 offset to compensate for CUDA linear filtering convention. */ + return -1.0f + cubic_w1(a) / (cubic_w0(a) + cubic_w1(a)) + 0.5f; } ccl_device float cubic_h1(float a) { - return 1.0f + cubic_w3(a) / (cubic_w2(a) + cubic_w3(a)) + 0.5f; + return 1.0f + cubic_w3(a) / (cubic_w2(a) + cubic_w3(a)) + 0.5f; } /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. 
*/ template -ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo& info, CUtexObject tex, float x, float y) +ccl_device T +kernel_tex_image_interp_bicubic(const TextureInfo &info, CUtexObject tex, float x, float y) { - x = (x * info.width) - 0.5f; - y = (y * info.height) - 0.5f; - - float px = floor(x); - float py = floor(y); - float fx = x - px; - float fy = y - py; - - float g0x = cubic_g0(fx); - float g1x = cubic_g1(fx); - float x0 = (px + cubic_h0(fx)) / info.width; - float x1 = (px + cubic_h1(fx)) / info.width; - float y0 = (py + cubic_h0(fy)) / info.height; - float y1 = (py + cubic_h1(fy)) / info.height; - - return cubic_g0(fy) * (g0x * tex2D(tex, x0, y0) + - g1x * tex2D(tex, x1, y0)) + - cubic_g1(fy) * (g0x * tex2D(tex, x0, y1) + - g1x * tex2D(tex, x1, y1)); + x = (x * info.width) - 0.5f; + y = (y * info.height) - 0.5f; + + float px = floor(x); + float py = floor(y); + float fx = x - px; + float fy = y - py; + + float g0x = cubic_g0(fx); + float g1x = cubic_g1(fx); + float x0 = (px + cubic_h0(fx)) / info.width; + float x1 = (px + cubic_h1(fx)) / info.width; + float y0 = (py + cubic_h0(fy)) / info.height; + float y1 = (py + cubic_h1(fy)) / info.height; + + return cubic_g0(fy) * (g0x * tex2D(tex, x0, y0) + g1x * tex2D(tex, x1, y0)) + + cubic_g1(fy) * (g0x * tex2D(tex, x0, y1) + g1x * tex2D(tex, x1, y1)); } /* Fast tricubic texture lookup using 8 trilinear lookups. */ template -ccl_device T kernel_tex_image_interp_bicubic_3d(const TextureInfo& info, CUtexObject tex, float x, float y, float z) +ccl_device T kernel_tex_image_interp_bicubic_3d( + const TextureInfo &info, CUtexObject tex, float x, float y, float z) { - x = (x * info.width) - 0.5f; - y = (y * info.height) - 0.5f; - z = (z * info.depth) - 0.5f; - - float px = floor(x); - float py = floor(y); - float pz = floor(z); - float fx = x - px; - float fy = y - py; - float fz = z - pz; - - float g0x = cubic_g0(fx); - float g1x = cubic_g1(fx); - float g0y = cubic_g0(fy); - float g1y = cubic_g1(fy); - float g0z = cubic_g0(fz); - float g1z = cubic_g1(fz); - - float x0 = (px + cubic_h0(fx)) / info.width; - float x1 = (px + cubic_h1(fx)) / info.width; - float y0 = (py + cubic_h0(fy)) / info.height; - float y1 = (py + cubic_h1(fy)) / info.height; - float z0 = (pz + cubic_h0(fz)) / info.depth; - float z1 = (pz + cubic_h1(fz)) / info.depth; - - return g0z * (g0y * (g0x * tex3D(tex, x0, y0, z0) + - g1x * tex3D(tex, x1, y0, z0)) + - g1y * (g0x * tex3D(tex, x0, y1, z0) + - g1x * tex3D(tex, x1, y1, z0))) + - g1z * (g0y * (g0x * tex3D(tex, x0, y0, z1) + - g1x * tex3D(tex, x1, y0, z1)) + - g1y * (g0x * tex3D(tex, x0, y1, z1) + - g1x * tex3D(tex, x1, y1, z1))); + x = (x * info.width) - 0.5f; + y = (y * info.height) - 0.5f; + z = (z * info.depth) - 0.5f; + + float px = floor(x); + float py = floor(y); + float pz = floor(z); + float fx = x - px; + float fy = y - py; + float fz = z - pz; + + float g0x = cubic_g0(fx); + float g1x = cubic_g1(fx); + float g0y = cubic_g0(fy); + float g1y = cubic_g1(fy); + float g0z = cubic_g0(fz); + float g1z = cubic_g1(fz); + + float x0 = (px + cubic_h0(fx)) / info.width; + float x1 = (px + cubic_h1(fx)) / info.width; + float y0 = (py + cubic_h0(fy)) / info.height; + float y1 = (py + cubic_h1(fy)) / info.height; + float z0 = (pz + cubic_h0(fz)) / info.depth; + float z1 = (pz + cubic_h1(fz)) / info.depth; + + return g0z * (g0y * (g0x * tex3D(tex, x0, y0, z0) + g1x * tex3D(tex, x1, y0, z0)) + + g1y * (g0x * tex3D(tex, x0, y1, z0) + g1x * tex3D(tex, x1, y1, z0))) + + g1z * (g0y * (g0x * tex3D(tex, x0, y0, z1) + g1x 
* tex3D(tex, x1, y0, z1)) + + g1y * (g0x * tex3D(tex, x0, y1, z1) + g1x * tex3D(tex, x1, y1, z1))); } ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) { - const TextureInfo& info = kernel_tex_fetch(__texture_info, id); - CUtexObject tex = (CUtexObject)info.data; - - /* float4, byte4, ushort4 and half4 */ - const int texture_type = kernel_tex_type(id); - if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || - texture_type == IMAGE_DATA_TYPE_BYTE4 || - texture_type == IMAGE_DATA_TYPE_HALF4 || - texture_type == IMAGE_DATA_TYPE_USHORT4) - { - if(info.interpolation == INTERPOLATION_CUBIC) { - return kernel_tex_image_interp_bicubic(info, tex, x, y); - } - else { - return tex2D(tex, x, y); - } - } - /* float, byte and half */ - else { - float f; - - if(info.interpolation == INTERPOLATION_CUBIC) { - f = kernel_tex_image_interp_bicubic(info, tex, x, y); - } - else { - f = tex2D(tex, x, y); - } - - return make_float4(f, f, f, 1.0f); - } + const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + CUtexObject tex = (CUtexObject)info.data; + + /* float4, byte4, ushort4 and half4 */ + const int texture_type = kernel_tex_type(id); + if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) { + if (info.interpolation == INTERPOLATION_CUBIC) { + return kernel_tex_image_interp_bicubic(info, tex, x, y); + } + else { + return tex2D(tex, x, y); + } + } + /* float, byte and half */ + else { + float f; + + if (info.interpolation == INTERPOLATION_CUBIC) { + f = kernel_tex_image_interp_bicubic(info, tex, x, y); + } + else { + f = tex2D(tex, x, y); + } + + return make_float4(f, f, f, 1.0f); + } } -ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp) +ccl_device float4 kernel_tex_image_interp_3d( + KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp) { - const TextureInfo& info = kernel_tex_fetch(__texture_info, id); - CUtexObject tex = (CUtexObject)info.data; - uint interpolation = (interp == INTERPOLATION_NONE)? info.interpolation: interp; - - const int texture_type = kernel_tex_type(id); - if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || - texture_type == IMAGE_DATA_TYPE_BYTE4 || - texture_type == IMAGE_DATA_TYPE_HALF4 || - texture_type == IMAGE_DATA_TYPE_USHORT4) - { - if(interpolation == INTERPOLATION_CUBIC) { - return kernel_tex_image_interp_bicubic_3d(info, tex, x, y, z); - } - else { - return tex3D(tex, x, y, z); - } - } - else { - float f; - - if(interpolation == INTERPOLATION_CUBIC) { - f = kernel_tex_image_interp_bicubic_3d(info, tex, x, y, z); - } - else { - f = tex3D(tex, x, y, z); - } - - return make_float4(f, f, f, 1.0f); - } + const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + CUtexObject tex = (CUtexObject)info.data; + uint interpolation = (interp == INTERPOLATION_NONE) ? 
info.interpolation : interp; + + const int texture_type = kernel_tex_type(id); + if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) { + if (interpolation == INTERPOLATION_CUBIC) { + return kernel_tex_image_interp_bicubic_3d(info, tex, x, y, z); + } + else { + return tex3D(tex, x, y, z); + } + } + else { + float f; + + if (interpolation == INTERPOLATION_CUBIC) { + f = kernel_tex_image_interp_bicubic_3d(info, tex, x, y, z); + } + else { + f = tex3D(tex, x, y, z); + } + + return make_float4(f, f, f, 1.0f); + } } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h index 79af831c2fb..b6390679331 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h +++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h @@ -16,254 +16,257 @@ /* For OpenCL we do manual lookup and interpolation. */ -ccl_device_inline ccl_global TextureInfo* kernel_tex_info(KernelGlobals *kg, uint id) { - const uint tex_offset = id -#define KERNEL_TEX(type, name) + 1 +ccl_device_inline ccl_global TextureInfo *kernel_tex_info(KernelGlobals *kg, uint id) +{ + const uint tex_offset = id +#define KERNEL_TEX(type, name) +1 #include "kernel/kernel_textures.h" - ; + ; - return &((ccl_global TextureInfo*)kg->buffers[0])[tex_offset]; + return &((ccl_global TextureInfo *)kg->buffers[0])[tex_offset]; } -#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->cl_buffer] + info->data))[(index)] +#define tex_fetch(type, info, index) \ + ((ccl_global type *)(kg->buffers[info->cl_buffer] + info->data))[(index)] ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width) { - x %= width; - if(x < 0) - x += width; - return x; + x %= width; + if (x < 0) + x += width; + return x; } ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width) { - return clamp(x, 0, width-1); + return clamp(x, 0, width - 1); } -ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, const ccl_global TextureInfo *info, int id, int offset) +ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, + const ccl_global TextureInfo *info, + int id, + int offset) { - const int texture_type = kernel_tex_type(id); - - /* Float4 */ - if(texture_type == IMAGE_DATA_TYPE_FLOAT4) { - return tex_fetch(float4, info, offset); - } - /* Byte4 */ - else if(texture_type == IMAGE_DATA_TYPE_BYTE4) { - uchar4 r = tex_fetch(uchar4, info, offset); - float f = 1.0f/255.0f; - return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); - } - /* Ushort4 */ - else if(texture_type == IMAGE_DATA_TYPE_USHORT4) { - ushort4 r = tex_fetch(ushort4, info, offset); - float f = 1.0f/65535.f; - return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); - } - /* Float */ - else if(texture_type == IMAGE_DATA_TYPE_FLOAT) { - float f = tex_fetch(float, info, offset); - return make_float4(f, f, f, 1.0f); - } - /* UShort */ - else if(texture_type == IMAGE_DATA_TYPE_USHORT) { - ushort r = tex_fetch(ushort, info, offset); - float f = r * (1.0f / 65535.0f); - return make_float4(f, f, f, 1.0f); - } - /* Byte */ + const int texture_type = kernel_tex_type(id); + + /* Float4 */ + if (texture_type == IMAGE_DATA_TYPE_FLOAT4) { + return tex_fetch(float4, info, offset); + } + /* Byte4 */ + else if (texture_type == IMAGE_DATA_TYPE_BYTE4) { + uchar4 r = tex_fetch(uchar4, info, offset); + float f = 1.0f / 255.0f; + return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); + } + 
/* Ushort4 */ + else if (texture_type == IMAGE_DATA_TYPE_USHORT4) { + ushort4 r = tex_fetch(ushort4, info, offset); + float f = 1.0f / 65535.f; + return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); + } + /* Float */ + else if (texture_type == IMAGE_DATA_TYPE_FLOAT) { + float f = tex_fetch(float, info, offset); + return make_float4(f, f, f, 1.0f); + } + /* UShort */ + else if (texture_type == IMAGE_DATA_TYPE_USHORT) { + ushort r = tex_fetch(ushort, info, offset); + float f = r * (1.0f / 65535.0f); + return make_float4(f, f, f, 1.0f); + } + /* Byte */ #ifdef cl_khr_fp16 - /* half and half4 are optional in OpenCL */ - else if(texture_type == IMAGE_DATA_TYPE_HALF) { - float f = tex_fetch(half, info, offset); - return make_float4(f, f, f, 1.0f); - } - else if(texture_type == IMAGE_DATA_TYPE_HALF4) { - half4 r = tex_fetch(half4, info, offset); - return make_float4(r.x, r.y, r.z, r.w); - } + /* half and half4 are optional in OpenCL */ + else if (texture_type == IMAGE_DATA_TYPE_HALF) { + float f = tex_fetch(half, info, offset); + return make_float4(f, f, f, 1.0f); + } + else if (texture_type == IMAGE_DATA_TYPE_HALF4) { + half4 r = tex_fetch(half4, info, offset); + return make_float4(r.x, r.y, r.z, r.w); + } #endif - else { - uchar r = tex_fetch(uchar, info, offset); - float f = r * (1.0f/255.0f); - return make_float4(f, f, f, 1.0f); - } + else { + uchar r = tex_fetch(uchar, info, offset); + float f = r * (1.0f / 255.0f); + return make_float4(f, f, f, 1.0f); + } } ccl_device_inline float4 svm_image_texture_read_2d(KernelGlobals *kg, int id, int x, int y) { - const ccl_global TextureInfo *info = kernel_tex_info(kg, id); - - /* Wrap */ - if(info->extension == EXTENSION_REPEAT) { - x = svm_image_texture_wrap_periodic(x, info->width); - y = svm_image_texture_wrap_periodic(y, info->height); - } - else { - x = svm_image_texture_wrap_clamp(x, info->width); - y = svm_image_texture_wrap_clamp(y, info->height); - } - - int offset = x + info->width * y; - return svm_image_texture_read(kg, info, id, offset); + const ccl_global TextureInfo *info = kernel_tex_info(kg, id); + + /* Wrap */ + if (info->extension == EXTENSION_REPEAT) { + x = svm_image_texture_wrap_periodic(x, info->width); + y = svm_image_texture_wrap_periodic(y, info->height); + } + else { + x = svm_image_texture_wrap_clamp(x, info->width); + y = svm_image_texture_wrap_clamp(y, info->height); + } + + int offset = x + info->width * y; + return svm_image_texture_read(kg, info, id, offset); } ccl_device_inline float4 svm_image_texture_read_3d(KernelGlobals *kg, int id, int x, int y, int z) { - const ccl_global TextureInfo *info = kernel_tex_info(kg, id); - - /* Wrap */ - if(info->extension == EXTENSION_REPEAT) { - x = svm_image_texture_wrap_periodic(x, info->width); - y = svm_image_texture_wrap_periodic(y, info->height); - z = svm_image_texture_wrap_periodic(z, info->depth); - } - else { - x = svm_image_texture_wrap_clamp(x, info->width); - y = svm_image_texture_wrap_clamp(y, info->height); - z = svm_image_texture_wrap_clamp(z, info->depth); - } - - int offset = x + info->width * y + info->width * info->height * z; - return svm_image_texture_read(kg, info, id, offset); + const ccl_global TextureInfo *info = kernel_tex_info(kg, id); + + /* Wrap */ + if (info->extension == EXTENSION_REPEAT) { + x = svm_image_texture_wrap_periodic(x, info->width); + y = svm_image_texture_wrap_periodic(y, info->height); + z = svm_image_texture_wrap_periodic(z, info->depth); + } + else { + x = svm_image_texture_wrap_clamp(x, info->width); + y = 
svm_image_texture_wrap_clamp(y, info->height); + z = svm_image_texture_wrap_clamp(z, info->depth); + } + + int offset = x + info->width * y + info->width * info->height * z; + return svm_image_texture_read(kg, info, id, offset); } - ccl_device_inline float svm_image_texture_frac(float x, int *ix) { - int i = float_to_int(x) - ((x < 0.0f)? 1: 0); - *ix = i; - return x - (float)i; + int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0); + *ix = i; + return x - (float)i; } #define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ - { \ - u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \ - u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \ - u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \ - u[3] = (1.0f / 6.0f) * t * t * t; \ - } (void) 0 + { \ + u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \ + u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \ + u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \ + u[3] = (1.0f / 6.0f) * t * t * t; \ + } \ + (void)0 ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) { - const ccl_global TextureInfo *info = kernel_tex_info(kg, id); - - if(info->extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - - if(info->interpolation == INTERPOLATION_CLOSEST) { - /* Closest interpolation. */ - int ix, iy; - svm_image_texture_frac(x*info->width, &ix); - svm_image_texture_frac(y*info->height, &iy); - - return svm_image_texture_read_2d(kg, id, ix, iy); - } - else if(info->interpolation == INTERPOLATION_LINEAR) { - /* Bilinear interpolation. */ - int ix, iy; - float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix); - float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy); - - float4 r; - r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy); - r += (1.0f - ty)*tx*svm_image_texture_read_2d(kg, id, ix+1, iy); - r += ty*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy+1); - r += ty*tx*svm_image_texture_read_2d(kg, id, ix+1, iy+1); - return r; - } - else { - /* Bicubic interpolation. */ - int ix, iy; - float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix); - float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy); - - float u[4], v[4]; - SET_CUBIC_SPLINE_WEIGHTS(u, tx); - SET_CUBIC_SPLINE_WEIGHTS(v, ty); - - float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - for(int y = 0; y < 4; y++) { - for(int x = 0; x < 4; x++) { - float weight = u[x]*v[y]; - r += weight*svm_image_texture_read_2d(kg, id, ix+x-1, iy+y-1); - } - } - return r; - } + const ccl_global TextureInfo *info = kernel_tex_info(kg, id); + + if (info->extension == EXTENSION_CLIP) { + if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + + if (info->interpolation == INTERPOLATION_CLOSEST) { + /* Closest interpolation. */ + int ix, iy; + svm_image_texture_frac(x * info->width, &ix); + svm_image_texture_frac(y * info->height, &iy); + + return svm_image_texture_read_2d(kg, id, ix, iy); + } + else if (info->interpolation == INTERPOLATION_LINEAR) { + /* Bilinear interpolation. 
*/ + int ix, iy; + float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix); + float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy); + + float4 r; + r = (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_2d(kg, id, ix, iy); + r += (1.0f - ty) * tx * svm_image_texture_read_2d(kg, id, ix + 1, iy); + r += ty * (1.0f - tx) * svm_image_texture_read_2d(kg, id, ix, iy + 1); + r += ty * tx * svm_image_texture_read_2d(kg, id, ix + 1, iy + 1); + return r; + } + else { + /* Bicubic interpolation. */ + int ix, iy; + float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix); + float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy); + + float u[4], v[4]; + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + float weight = u[x] * v[y]; + r += weight * svm_image_texture_read_2d(kg, id, ix + x - 1, iy + y - 1); + } + } + return r; + } } - -ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, int interp) +ccl_device float4 +kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, int interp) { - const ccl_global TextureInfo *info = kernel_tex_info(kg, id); - - if(info->extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - - uint interpolation = (interp == INTERPOLATION_NONE)? info->interpolation: interp; - - if(interpolation == INTERPOLATION_CLOSEST) { - /* Closest interpolation. */ - int ix, iy, iz; - svm_image_texture_frac(x*info->width, &ix); - svm_image_texture_frac(y*info->height, &iy); - svm_image_texture_frac(z*info->depth, &iz); - - return svm_image_texture_read_3d(kg, id, ix, iy, iz); - } - else if(interpolation == INTERPOLATION_LINEAR) { - /* Bilinear interpolation. */ - int ix, iy, iz; - float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix); - float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy); - float tz = svm_image_texture_frac(z*info->depth - 0.5f, &iz); - - float4 r; - r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy, iz); - r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read_3d(kg, id, ix+1, iy, iz); - r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy+1, iz); - r += (1.0f - tz)*ty*tx*svm_image_texture_read_3d(kg, id, ix+1, iy+1, iz); - - r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy, iz+1); - r += tz*(1.0f - ty)*tx*svm_image_texture_read_3d(kg, id, ix+1, iy, iz+1); - r += tz*ty*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy+1, iz+1); - r += tz*ty*tx*svm_image_texture_read_3d(kg, id, ix+1, iy+1, iz+1); - return r; - } - else { - /* Bicubic interpolation. 
*/ - int ix, iy, iz; - float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix); - float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy); - float tz = svm_image_texture_frac(z*info->depth - 0.5f, &iz); - - float u[4], v[4], w[4]; - SET_CUBIC_SPLINE_WEIGHTS(u, tx); - SET_CUBIC_SPLINE_WEIGHTS(v, ty); - SET_CUBIC_SPLINE_WEIGHTS(w, tz); - - float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - for(int z = 0; z < 4; z++) { - for(int y = 0; y < 4; y++) { - for(int x = 0; x < 4; x++) { - float weight = u[x]*v[y]*w[z]; - r += weight*svm_image_texture_read_3d(kg, id, ix+x-1, iy+y-1, iz+z-1); - } - } - } - return r; - } + const ccl_global TextureInfo *info = kernel_tex_info(kg, id); + + if (info->extension == EXTENSION_CLIP) { + if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + + uint interpolation = (interp == INTERPOLATION_NONE) ? info->interpolation : interp; + + if (interpolation == INTERPOLATION_CLOSEST) { + /* Closest interpolation. */ + int ix, iy, iz; + svm_image_texture_frac(x * info->width, &ix); + svm_image_texture_frac(y * info->height, &iy); + svm_image_texture_frac(z * info->depth, &iz); + + return svm_image_texture_read_3d(kg, id, ix, iy, iz); + } + else if (interpolation == INTERPOLATION_LINEAR) { + /* Bilinear interpolation. */ + int ix, iy, iz; + float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix); + float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy); + float tz = svm_image_texture_frac(z * info->depth - 0.5f, &iz); + + float4 r; + r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(kg, id, ix, iy, iz); + r += (1.0f - tz) * (1.0f - ty) * tx * svm_image_texture_read_3d(kg, id, ix + 1, iy, iz); + r += (1.0f - tz) * ty * (1.0f - tx) * svm_image_texture_read_3d(kg, id, ix, iy + 1, iz); + r += (1.0f - tz) * ty * tx * svm_image_texture_read_3d(kg, id, ix + 1, iy + 1, iz); + + r += tz * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(kg, id, ix, iy, iz + 1); + r += tz * (1.0f - ty) * tx * svm_image_texture_read_3d(kg, id, ix + 1, iy, iz + 1); + r += tz * ty * (1.0f - tx) * svm_image_texture_read_3d(kg, id, ix, iy + 1, iz + 1); + r += tz * ty * tx * svm_image_texture_read_3d(kg, id, ix + 1, iy + 1, iz + 1); + return r; + } + else { + /* Bicubic interpolation. */ + int ix, iy, iz; + float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix); + float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy); + float tz = svm_image_texture_frac(z * info->depth - 0.5f, &iz); + + float u[4], v[4], w[4]; + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + for (int z = 0; z < 4; z++) { + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + float weight = u[x] * v[y] * w[z]; + r += weight * svm_image_texture_read_3d(kg, id, ix + x - 1, iy + y - 1, iz + z - 1); + } + } + } + return r; + } } #undef SET_CUBIC_SPLINE_WEIGHTS diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h index 05e1ddf6da2..e123b4cd6ec 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h +++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h @@ -14,50 +14,53 @@ * limitations under the License. 
*/ -#define KERNEL_NAME_JOIN(a, b) a ## _ ## b +#define KERNEL_NAME_JOIN(a, b) a##_##b #define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b) -__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)( - ccl_global char *kg_global, - ccl_constant KernelData *data, +__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, + KERNEL_NAME)(ccl_global char *kg_global, + ccl_constant KernelData *data, - ccl_global void *split_data_buffer, - ccl_global char *ray_state, + ccl_global void *split_data_buffer, + ccl_global char *ray_state, - KERNEL_BUFFER_PARAMS, + KERNEL_BUFFER_PARAMS, - ccl_global int *queue_index, - ccl_global char *use_queues_flag, - ccl_global unsigned int *work_pools, - ccl_global float *buffer - ) + ccl_global int *queue_index, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pools, + ccl_global float *buffer) { #ifdef LOCALS_TYPE - ccl_local LOCALS_TYPE locals; + ccl_local LOCALS_TYPE locals; #endif - KernelGlobals *kg = (KernelGlobals*)kg_global; + KernelGlobals *kg = (KernelGlobals *)kg_global; - if(ccl_local_id(0) + ccl_local_id(1) == 0) { - kg->data = data; + if (ccl_local_id(0) + ccl_local_id(1) == 0) { + kg->data = data; - kernel_split_params.queue_index = queue_index; - kernel_split_params.use_queues_flag = use_queues_flag; - kernel_split_params.work_pools = work_pools; - kernel_split_params.tile.buffer = buffer; + kernel_split_params.queue_index = queue_index; + kernel_split_params.use_queues_flag = use_queues_flag; + kernel_split_params.work_pools = work_pools; + kernel_split_params.tile.buffer = buffer; - split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state); + split_data_init(kg, + &kernel_split_state, + ccl_global_size(0) * ccl_global_size(1), + split_data_buffer, + ray_state); + } - } + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - - KERNEL_NAME_EVAL(kernel, KERNEL_NAME)( - kg + KERNEL_NAME_EVAL(kernel, KERNEL_NAME) + (kg #ifdef LOCALS_TYPE - , &locals + , + &locals #endif - ); + ); } #undef KERNEL_NAME_JOIN -- cgit v1.2.3
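Both the CUDA and OpenCL image hunks above build their cubic filters from the same cubic B-spline basis: cubic_w0..cubic_w3 with the g0/g1 amplitude and h0/h1 offset helpers on the CUDA side, and the SET_CUBIC_SPLINE_WEIGHTS macro on the OpenCL side. Below is a minimal host-side sketch (plain C++, not part of Cycles; the helper names are illustrative) of the two identities that make the four-bilinear-lookup bicubic trick equivalent to a direct 4-tap cubic filter. The +0.5f added in cubic_h0/cubic_h1 compensates for CUDA's texel-center sampling convention and is omitted here.

#include <cassert>
#include <cmath>
#include <cstdio>

/* Cubic B-spline basis, same polynomials as cubic_w0..cubic_w3 in the patch. */
static float w0(float a) { return (1.0f / 6.0f) * (a * (a * (-a + 3.0f) - 3.0f) + 1.0f); }
static float w1(float a) { return (1.0f / 6.0f) * (a * a * (3.0f * a - 6.0f) + 4.0f); }
static float w2(float a) { return (1.0f / 6.0f) * (a * (a * (-3.0f * a + 3.0f) + 3.0f) + 1.0f); }
static float w3(float a) { return (1.0f / 6.0f) * (a * a * a); }

static float lerp(float x, float y, float t) { return x + t * (y - x); }

int main()
{
  /* Four arbitrary texel values t[i-1], t[i], t[i+1], t[i+2]. */
  const float t[4] = {0.25f, 0.5f, 1.0f, 0.125f};

  for (float a = 0.0f; a < 1.0f; a += 0.125f) {
    /* 1) The basis is a partition of unity, which is why the product weights
     *    u[x]*v[y] in the OpenCL bicubic loop need no renormalization. */
    assert(std::fabs(w0(a) + w1(a) + w2(a) + w3(a) - 1.0f) < 1e-5f);

    /* 2) A direct 4-tap cubic filter... */
    float direct = w0(a) * t[0] + w1(a) * t[1] + w2(a) * t[2] + w3(a) * t[3];

    /* ...equals two linear lookups placed at the h0/h1 offsets and weighted by
     * g0 = w0+w1 and g1 = w2+w3, which kernel_tex_image_interp_bicubic exploits
     * by letting the hardware do the bilinear part. */
    float g0 = w0(a) + w1(a);
    float g1 = w2(a) + w3(a);
    float two_tap = g0 * lerp(t[0], t[1], w1(a) / g0) + g1 * lerp(t[2], t[3], w3(a) / g1);

    assert(std::fabs(direct - two_tap) < 1e-5f);
    printf("a = %.3f  cubic = %.6f\n", a, direct);
  }
  return 0;
}

The same decomposition extends separably to 2D (four bilinear fetches) and 3D (eight trilinear fetches), matching kernel_tex_image_interp_bicubic and kernel_tex_image_interp_bicubic_3d above.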