diff options
author | Bastien Montagne <montagne29@wanadoo.fr> | 2017-08-07 17:16:43 +0300 |
---|---|---|
committer | Bastien Montagne <montagne29@wanadoo.fr> | 2017-08-07 17:16:43 +0300 |
commit | b282716c3aae68d5fc781e7fc5df0241c3d91198 (patch) | |
tree | b6c50064fe3a0c07341cd70cf9745ea3092c8fb7 | |
parent | 459365443f62d2f8e8718c1d1b0fbaafd6d765de (diff) | |
parent | 580741b317ae60eb3bf999d636da0325c7e67373 (diff) |
Merge branch 'master' into blender2.8
44 files changed, 599 insertions, 497 deletions
diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py index 440221b8470..17efb00abdb 100644 --- a/intern/cycles/blender/addon/presets.py +++ b/intern/cycles/blender/addon/presets.py @@ -37,7 +37,6 @@ class AddPresetIntegrator(AddPresetBase, Operator): "cycles.transmission_bounces", "cycles.volume_bounces", "cycles.transparent_max_bounces", - "cycles.use_transparent_shadows", "cycles.caustics_reflective", "cycles.caustics_refractive", "cycles.blur_glossy" diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index cfffe5362ca..93b90ec650b 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -343,11 +343,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): min=0, max=1024, default=8, ) - cls.use_transparent_shadows = BoolProperty( - name="Transparent Shadows", - description="Use transparency of surfaces for rendering shadows", - default=True, - ) cls.volume_step_size = FloatProperty( name="Step Size", diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 59ee053efb9..983f817539b 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -293,7 +293,6 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel): sub = col.column(align=True) sub.label("Transparency:") sub.prop(cscene, "transparent_max_bounces", text="Max") - sub.prop(cscene, "use_transparent_shadows", text="Shadows") col.separator() diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 15ad4ff301c..adbabaccdc1 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -245,7 +245,6 @@ void BlenderSync::sync_integrator() integrator->max_volume_bounce = get_int(cscene, "volume_bounces"); integrator->transparent_max_bounce = get_int(cscene, "transparent_max_bounces"); - integrator->transparent_shadows = get_boolean(cscene, "use_transparent_shadows"); integrator->volume_max_steps = get_int(cscene, "volume_max_steps"); integrator->volume_step_size = get_float(cscene, "volume_step_size"); diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 18112437b45..a00be3eeaab 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -48,6 +48,7 @@ #include "util/util_logging.h" #include "util/util_map.h" #include "util/util_opengl.h" +#include "util/util_optimization.h" #include "util/util_progress.h" #include "util/util_system.h" #include "util/util_thread.h" diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index e53aec0fbb9..f13506c8960 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1919,17 +1919,13 @@ public: int threads_per_block; cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); - int xthreads = (int)sqrt(threads_per_block); - int ythreads = (int)sqrt(threads_per_block); - - int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads; - int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads; + int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block; cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); cuda_assert(cuLaunchKernel(func, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ + xblocks, 1, 1, /* blocks */ + threads_per_block, 1, 1, /* threads */ 0, 0, args, 0)); device->cuda_pop_context(); diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index 0d34af3e040..7d5173a5f1d 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -635,7 +635,7 @@ bool OpenCLInfo::device_supported(const string& platform_name, "Tahiti", "Pitcairn", "Capeverde", "Oland", NULL }; - for (int i = 0; blacklist[i] != NULL; i++) { + for(int i = 0; blacklist[i] != NULL; i++) { if(device_name == blacklist[i]) { VLOG(1) << "AMD device " << device_name << " not supported"; return false; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 23e9bd311c4..88c4c4e3282 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -233,6 +233,7 @@ set(SRC_FILTER_HEADERS set(SRC_UTIL_HEADERS ../util/util_atomic.h ../util/util_color.h + ../util/util_defines.h ../util/util_half.h ../util/util_hash.h ../util/util_math.h diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h index 3185330994c..3ddd8712266 100644 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride) +#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride) /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. * pixel_buffer always points to the first of the 4 current pixel in the first pass. @@ -24,25 +24,25 @@ CCL_NAMESPACE_BEGIN #define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - __m128 y4 = _mm_set1_ps(pixel.y); \ + float4 y4 = make_float4(pixel.y); \ for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ - __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \ - __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x)); + float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ + int4 active_pixels = x4 < make_float4(high.x); #define END_FOR_PIXEL_WINDOW_SSE } \ pixel_buffer += buffer_w - (pixel.x - low.x); \ } -ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, - __m128 active_pixels, +ccl_device_inline void filter_get_features_sse(float4 x, float4 y, + int4 active_pixels, const float *ccl_restrict buffer, - __m128 *features, - const __m128 *ccl_restrict mean, + float4 *features, + const float4 *ccl_restrict mean, int pass_stride) { features[0] = x; features[1] = y; - features[2] = _mm_fabs_ps(ccl_get_feature_sse(0)); + features[2] = fabs(ccl_get_feature_sse(0)); features[3] = ccl_get_feature_sse(1); features[4] = ccl_get_feature_sse(2); features[5] = ccl_get_feature_sse(3); @@ -52,53 +52,41 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, features[9] = ccl_get_feature_sse(7); if(mean) { for(int i = 0; i < DENOISE_FEATURES; i++) - features[i] = _mm_sub_ps(features[i], mean[i]); + features[i] = features[i] - mean[i]; } for(int i = 0; i < DENOISE_FEATURES; i++) - features[i] = _mm_mask_ps(features[i], active_pixels); + features[i] = mask(active_pixels, features[i]); } -ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, - __m128 active_pixels, +ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, + int4 active_pixels, const float *ccl_restrict buffer, - __m128 *scales, - const __m128 *ccl_restrict mean, + float4 *scales, + const float4 *ccl_restrict mean, int pass_stride) { - scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); - scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); - - scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels); - - __m128 diff, scale; - diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]); - scale = _mm_mul_ps(diff, diff); - diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - scales[3] = _mm_mask_ps(scale, active_pixels); - - scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels); - - diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]); - scale = _mm_mul_ps(diff, diff); - diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - scales[5] = _mm_mask_ps(scale, active_pixels); + scales[0] = fabs(x - mean[0]); + scales[1] = fabs(y - mean[1]); + scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); + scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + + sqr(ccl_get_feature_sse(2) - mean[4]) + + sqr(ccl_get_feature_sse(3) - mean[5]); + scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); + scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + + sqr(ccl_get_feature_sse(6) - mean[8]) + + sqr(ccl_get_feature_sse(7) - mean[9]); + for(int i = 0; i < 6; i++) + scales[i] = mask(active_pixels, scales[i]); } -ccl_device_inline void filter_calculate_scale_sse(__m128 *scale) +ccl_device_inline void filter_calculate_scale_sse(float4 *scale) { - scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); - scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f))); - scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); - scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); - - scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); - scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); + scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); + scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); + scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); + scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); + scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); + scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); } diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index 3e752bce68f..5e989331bc2 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -50,10 +50,8 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen int w, int f) { -#ifdef __KERNEL_SSE3__ - int aligned_lowx = (rect.x & ~(3)); - int aligned_highx = ((rect.z + 3) & ~(3)); -#endif + int aligned_lowx = rect.x / 4; + int aligned_highx = (rect.z + 3) / 4; for(int y = rect.y; y < rect.w; y++) { const int low = max(rect.y, y-f); const int high = min(rect.w, y+f+1); @@ -61,15 +59,11 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen out_image[y*w+x] = 0.0f; } for(int y1 = low; y1 < high; y1++) { -#ifdef __KERNEL_SSE3__ - for(int x = aligned_lowx; x < aligned_highx; x+=4) { - _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x))); + float4* out_image4 = (float4*)(out_image + y*w); + float4* difference_image4 = (float4*)(difference_image + y1*w); + for(int x = aligned_lowx; x < aligned_highx; x++) { + out_image4[x] += difference_image4[x]; } -#else - for(int x = rect.x; x < rect.z; x++) { - out_image[y*w+x] += difference_image[y1*w+x]; - } -#endif } for(int x = rect.x; x < rect.z; x++) { out_image[y*w+x] *= 1.0f/(high - low); diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h index a0b89c1111f..c6a70cbeab5 100644 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -96,7 +96,7 @@ ccl_device void kernel_filter_get_feature(int sample, int idx = (y-rect.y)*buffer_w + (x - rect.x); mean[idx] = center_buffer[m_offset] / sample; - if (sample > 1) { + if(sample > 1) { if(use_split_variance) { variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); } diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h index 30dc2969b11..9e65f61664b 100644 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -24,7 +24,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff { int buffer_w = align_up(rect.z - rect.x, 4); - __m128 features[DENOISE_FEATURES]; + float4 features[DENOISE_FEATURES]; const float *ccl_restrict pixel_buffer; int2 pixel; @@ -34,19 +34,19 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff min(rect.w, y + radius + 1)); int num_pixels = (high.y - low.y) * (high.x - low.x); - __m128 feature_means[DENOISE_FEATURES]; + float4 feature_means[DENOISE_FEATURES]; math_vector_zero_sse(feature_means, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); math_vector_add_sse(feature_means, DENOISE_FEATURES, features); } END_FOR_PIXEL_WINDOW_SSE - __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels); + float4 pixel_scale = make_float4(1.0f / num_pixels); for(int i = 0; i < DENOISE_FEATURES; i++) { - feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale); + feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; } - __m128 feature_scale[DENOISE_FEATURES]; + float4 feature_scale[DENOISE_FEATURES]; math_vector_zero_sse(feature_scale, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); @@ -55,12 +55,12 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff filter_calculate_scale_sse(feature_scale); - __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; + float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); - math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f)); + math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f)); } END_FOR_PIXEL_WINDOW_SSE float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; @@ -98,7 +98,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff /* Bake the feature scaling into the transformation matrix. */ for(int i = 0; i < DENOISE_FEATURES; i++) { - math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank); + math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank); } } diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index 6ecdfe0173a..1ffc143be34 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -415,12 +415,7 @@ ccl_device_inline float3 bvh_clamp_direction(float3 dir) ccl_device_inline float3 bvh_inverse_direction(float3 dir) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 return rcp(dir); -#else - return 1.0f / dir; -#endif } /* Transform ray into object space to enter static object in BVH */ diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index c340b3bc968..8f6c2b07381 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -100,6 +100,8 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, #ifndef __SPLIT_KERNEL__ +#if defined(__BRANCHED_PATH__) || defined(__BAKING__) + ccl_device void kernel_path_indirect(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, @@ -428,6 +430,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } } +#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, RNG *rng, diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index a96ffe07718..3ce183bf67a 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -173,7 +173,7 @@ ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_ } #ifdef __SHADOW_TRICKS__ /* Exception for shadow catcher not working correctly with RR. */ - else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) { + else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) { return 1.0f; } #endif @@ -196,7 +196,7 @@ ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_ } #ifdef __SHADOW_TRICKS__ /* Exception for shadow catcher not working correctly with RR. */ - else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) { + else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) { return 1.0f; } #endif diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp index f7c9935f1d0..a13fb5cd4fb 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp @@ -25,6 +25,7 @@ #else /* SSE optimization disabled for now on 32 bit, see bug #36316 */ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ # define __KERNEL_SSE2__ # endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp index 070b95a3505..6b690adf0f5 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp @@ -25,6 +25,7 @@ #else /* SSE optimization disabled for now on 32 bit, see bug #36316 */ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ # define __KERNEL_SSE2__ # define __KERNEL_SSE3__ # define __KERNEL_SSSE3__ diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp index 1a7b2040da1..254025be4e2 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp @@ -25,6 +25,7 @@ #else /* SSE optimization disabled for now on 32 bit, see bug #36316 */ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ # define __KERNEL_SSE2__ # define __KERNEL_SSE3__ # define __KERNEL_SSSE3__ diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h index 9fa39dc9ebb..7ae205b7e14 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_config.h +++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h @@ -81,8 +81,13 @@ # error "Unknown or unsupported CUDA architecture, can't determine launch bounds" #endif -/* compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread */ +/* For split kernel using all registers seems fastest for now, but this + * is unlikely to be optimal once we resolve other bottlenecks. */ + +#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. */ #define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ __launch_bounds__( \ diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index 628891b1458..e97e87285a5 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -90,7 +90,7 @@ kernel_cuda_path_trace_data_init( #define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ kernel_cuda_##name() \ { \ kernel_##name(NULL); \ @@ -98,7 +98,7 @@ kernel_cuda_path_trace_data_init( #define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ kernel_cuda_##name() \ { \ ccl_local type locals; \ diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h index 297decb0bc2..5a55b680695 100644 --- a/intern/cycles/kernel/split/kernel_shader_sort.h +++ b/intern/cycles/kernel/split/kernel_shader_sort.h @@ -39,7 +39,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg, ccl_local ushort *local_index = &locals->local_index[0]; /* copy to local memory */ - for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { uint idx = offset + i + lid; uint add = input + idx; uint value = (~0); @@ -59,9 +59,9 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg, # ifdef __KERNEL_OPENCL__ /* bitonic sort */ - for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { - for (uint inc = length; inc > 0; inc >>= 1) { - for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { + for(uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { + for(uint inc = length; inc > 0; inc >>= 1) { + for(uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { uint i = lid + ii; bool direction = ((i & (length << 1)) != 0); uint j = i ^ inc; @@ -81,7 +81,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg, # endif /* __KERNEL_OPENCL__ */ /* copy to destination */ - for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { uint idx = offset + i + lid; uint lidx = local_index[i + lid]; uint outi = output + idx; diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index 02b65440154..a490f10aee4 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -344,7 +344,7 @@ int ImageManager::add_image(const string& filename, else { /* Very unlikely, since max_num_images is insanely big. But better safe than sorry. */ int tex_count = 0; - for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { + for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { tex_count += tex_num_images[type]; } if(tex_count > max_num_images) { diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp index b9b8c681a26..15b728d6e02 100644 --- a/intern/cycles/render/integrator.cpp +++ b/intern/cycles/render/integrator.cpp @@ -39,7 +39,6 @@ NODE_DEFINE(Integrator) SOCKET_INT(max_volume_bounce, "Max Volume Bounce", 7); SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7); - SOCKET_BOOLEAN(transparent_shadows, "Transparent Shadows", false); SOCKET_INT(ao_bounces, "AO Bounces", 0); @@ -121,19 +120,14 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene * We only need to enable transparent shadows, if we actually have * transparent shaders in the scene. Otherwise we can disable it * to improve performance a bit. */ - if(transparent_shadows) { - kintegrator->transparent_shadows = false; - foreach(Shader *shader, scene->shaders) { - /* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */ - if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) { - kintegrator->transparent_shadows = true; - break; - } + kintegrator->transparent_shadows = false; + foreach(Shader *shader, scene->shaders) { + /* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */ + if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) { + kintegrator->transparent_shadows = true; + break; } } - else { - kintegrator->transparent_shadows = false; - } kintegrator->volume_max_steps = volume_max_steps; kintegrator->volume_step_size = volume_step_size; diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h index ce5651ec823..3cb430d72b4 100644 --- a/intern/cycles/render/integrator.h +++ b/intern/cycles/render/integrator.h @@ -39,7 +39,6 @@ public: int max_volume_bounce; int transparent_max_bounce; - bool transparent_shadows; int ao_bounces; diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index 93d88c5642c..371ea54ef11 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -225,7 +225,7 @@ void LightManager::disable_ineffective_light(Device *device, Scene *scene) bool LightManager::object_usable_as_light(Object *object) { Mesh *mesh = object->mesh; /* Skip objects with NaNs */ - if (!object->bounds.valid()) { + if(!object->bounds.valid()) { return false; } /* Skip if we are not visible for BSDFs. */ diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 8622318858e..ca3aefcb5e6 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -721,7 +721,6 @@ DeviceRequestedFeatures Session::get_requested_device_features() BakeManager *bake_manager = scene->bake_manager; requested_features.use_baking = bake_manager->get_baking(); requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH); - requested_features.use_transparent &= scene->integrator->transparent_shadows; requested_features.use_denoising = params.use_denoising; return requested_features; diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp index 44a266dfe18..493e01de363 100644 --- a/intern/cycles/render/shader.cpp +++ b/intern/cycles/render/shader.cpp @@ -503,9 +503,7 @@ void ShaderManager::device_update_common(Device *device, KernelIntegrator *kintegrator = &dscene->data.integrator; kintegrator->use_volumes = has_volumes; /* TODO(sergey): De-duplicate with flags set in integrator.cpp. */ - if(scene->integrator->transparent_shadows) { - kintegrator->transparent_shadows = has_transparent_shadow; - } + kintegrator->transparent_shadows = has_transparent_shadow; } void ShaderManager::device_free_common(Device *device, DeviceScene *dscene, Scene *scene) diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 43f9a57d099..7f3747a0f58 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -38,6 +38,7 @@ set(SRC_HEADERS util_atomic.h util_boundbox.h util_debug.h + util_defines.h util_guarded_allocator.cpp util_foreach.h util_function.h diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h new file mode 100644 index 00000000000..d0d87e74332 --- /dev/null +++ b/intern/cycles/util/util_defines.h @@ -0,0 +1,134 @@ + +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_DEFINES_H__ +#define __UTIL_DEFINES_H__ + +/* Bitness */ + +#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) +# define __KERNEL_64_BIT__ +#endif + +/* Qualifiers for kernel code shared by CPU and GPU */ + +#ifndef __KERNEL_GPU__ +# define ccl_device static inline +# define ccl_device_noinline static +# define ccl_global +# define ccl_constant +# define ccl_local +# define ccl_local_param +# define ccl_private +# define ccl_restrict __restrict +# define __KERNEL_WITH_SSE_ALIGN__ + +# if defined(_WIN32) && !defined(FREE_WINDOWS) +# define ccl_device_inline static __forceinline +# define ccl_device_forceinline static __forceinline +# define ccl_align(...) __declspec(align(__VA_ARGS__)) +# ifdef __KERNEL_64_BIT__ +# define ccl_try_align(...) __declspec(align(__VA_ARGS__)) +# else /* __KERNEL_64_BIT__ */ +# undef __KERNEL_WITH_SSE_ALIGN__ +/* No support for function arguments (error C2719). */ +# define ccl_try_align(...) +# endif /* __KERNEL_64_BIT__ */ +# define ccl_may_alias +# define ccl_always_inline __forceinline +# define ccl_never_inline __declspec(noinline) +# define ccl_maybe_unused +# else /* _WIN32 && !FREE_WINDOWS */ +# define ccl_device_inline static inline __attribute__((always_inline)) +# define ccl_device_forceinline static inline __attribute__((always_inline)) +# define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) +# ifndef FREE_WINDOWS64 +# define __forceinline inline __attribute__((always_inline)) +# endif +# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) +# define ccl_may_alias __attribute__((__may_alias__)) +# define ccl_always_inline __attribute__((always_inline)) +# define ccl_never_inline __attribute__((noinline)) +# define ccl_maybe_unused __attribute__((used)) +# endif /* _WIN32 && !FREE_WINDOWS */ + +/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */ +# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ +# define ATTR_FALLTHROUGH __attribute__((fallthrough)) +# else +# define ATTR_FALLTHROUGH ((void)0) +# endif +#endif /* __KERNEL_GPU__ */ + +/* macros */ + +/* hints for branch prediction, only use in code that runs a _lot_ */ +#if defined(__GNUC__) && defined(__KERNEL_CPU__) +# define LIKELY(x) __builtin_expect(!!(x), 1) +# define UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +# define LIKELY(x) (x) +# define UNLIKELY(x) (x) +#endif + +#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800)) +# define HAS_CPP11_FEATURES +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(HAS_CPP11_FEATURES) +/* Some magic to be sure we don't have reference in the type. */ +template<typename T> static inline T decltype_helper(T x) { return x; } +# define TYPEOF(x) decltype(decltype_helper(x)) +# else +# define TYPEOF(x) typeof(x) +# endif +#endif + +/* Causes warning: + * incompatible types when assigning to type 'Foo' from type 'Bar' + * ... the compiler optimizes away the temp var */ +#ifdef __GNUC__ +#define CHECK_TYPE(var, type) { \ + TYPEOF(var) *__tmp; \ + __tmp = (type *)NULL; \ + (void)__tmp; \ +} (void)0 + +#define CHECK_TYPE_PAIR(var_a, var_b) { \ + TYPEOF(var_a) *__tmp; \ + __tmp = (typeof(var_b) *)NULL; \ + (void)__tmp; \ +} (void)0 +#else +# define CHECK_TYPE(var, type) +# define CHECK_TYPE_PAIR(var_a, var_b) +#endif + +/* can be used in simple macros */ +#define CHECK_TYPE_INLINE(val, type) \ + ((void)(((type)0) != (val))) + +#ifndef __KERNEL_GPU__ +# include <cassert> +# define util_assert(statement) assert(statement) +#else +# define util_assert(statement) +#endif + +#endif /* __UTIL_DEFINES_H__ */ + diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index b719640b19c..4d51ec5570a 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -94,6 +94,7 @@ ccl_device_inline float fminf(float a, float b) #ifndef __KERNEL_GPU__ using std::isfinite; using std::isnan; +using std::sqrt; ccl_device_inline int abs(int x) { diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h index bb04c4aa2d9..e73e5bc17a2 100644 --- a/intern/cycles/util/util_math_float3.h +++ b/intern/cycles/util/util_math_float3.h @@ -108,8 +108,7 @@ ccl_device_inline float3 operator*(const float3& a, const float f) ccl_device_inline float3 operator*(const float f, const float3& a) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 +#if defined(__KERNEL_SSE__) return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); #else return make_float3(a.x*f, a.y*f, a.z*f); @@ -118,10 +117,8 @@ ccl_device_inline float3 operator*(const float f, const float3& a) ccl_device_inline float3 operator/(const float f, const float3& a) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(a.m128); - return float3(_mm_mul_ps(_mm_set1_ps(f),rc)); +#if defined(__KERNEL_SSE__) + return float3(_mm_div_ps(_mm_set1_ps(f), a.m128)); #else return make_float3(f / a.x, f / a.y, f / a.z); #endif @@ -135,10 +132,8 @@ ccl_device_inline float3 operator/(const float3& a, const float f) ccl_device_inline float3 operator/(const float3& a, const float3& b) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(b.m128); - return float3(_mm_mul_ps(a, rc)); +#if defined(__KERNEL_SSE__) + return float3(_mm_div_ps(a.m128, b.m128)); #else return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); #endif @@ -282,9 +277,8 @@ ccl_device_inline float3 mix(const float3& a, const float3& b, float t) ccl_device_inline float3 rcp(const float3& a) { #ifdef __KERNEL_SSE__ - const float4 r(_mm_rcp_ps(a.m128)); - return float3(_mm_sub_ps(_mm_add_ps(r, r), - _mm_mul_ps(_mm_mul_ps(r, r), a))); + /* Don't use _mm_rcp_ps due to poor precision. */ + return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); #else return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z); #endif diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h index d89121b3a1d..adb9a76a434 100644 --- a/intern/cycles/util/util_math_float4.h +++ b/intern/cycles/util/util_math_float4.h @@ -48,23 +48,30 @@ ccl_device_inline bool operator==(const float4& a, const float4& b); ccl_device_inline float dot(const float4& a, const float4& b); ccl_device_inline float len_squared(const float4& a); ccl_device_inline float4 rcp(const float4& a); +ccl_device_inline float4 sqrt(const float4& a); +ccl_device_inline float4 sqr(const float4& a); ccl_device_inline float4 cross(const float4& a, const float4& b); ccl_device_inline bool is_zero(const float4& a); -ccl_device_inline float reduce_add(const float4& a); ccl_device_inline float average(const float4& a); ccl_device_inline float len(const float4& a); ccl_device_inline float4 normalize(const float4& a); ccl_device_inline float4 safe_normalize(const float4& a); ccl_device_inline float4 min(const float4& a, const float4& b); ccl_device_inline float4 max(const float4& a, const float4& b); +ccl_device_inline float4 fabs(const float4& a); #endif /* !__KERNEL_OPENCL__*/ #ifdef __KERNEL_SSE__ template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4& b); +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> +__forceinline const float4 shuffle(const float4& a, const float4& b); template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b); +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b); +template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b); + # ifdef __KERNEL_SSE3__ template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b); template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b); @@ -77,9 +84,7 @@ ccl_device_inline float4 select(const int4& mask, const float4& b); ccl_device_inline float4 reduce_min(const float4& a); ccl_device_inline float4 reduce_max(const float4& a); -# if 0 ccl_device_inline float4 reduce_add(const float4& a); -# endif #endif /* !__KERNEL_GPU__ */ /******************************************************************************* @@ -128,7 +133,7 @@ ccl_device_inline float4 operator/(const float4& a, float f) ccl_device_inline float4 operator/(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - return a * rcp(b); + return float4(_mm_div_ps(a.m128, b.m128)); #else return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); #endif @@ -224,14 +229,30 @@ ccl_device_inline float len_squared(const float4& a) ccl_device_inline float4 rcp(const float4& a) { #ifdef __KERNEL_SSE__ - float4 r(_mm_rcp_ps(a.m128)); - return float4(_mm_sub_ps(_mm_add_ps(r, r), - _mm_mul_ps(_mm_mul_ps(r, r), a))); + /* Don't use _mm_rcp_ps due to poor precision. */ + return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); #else return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w); #endif } +ccl_device_inline float4 sqrt(const float4& a) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_sqrt_ps(a.m128)); +#else + return make_float4(sqrtf(a.x), + sqrtf(a.y), + sqrtf(a.z), + sqrtf(a.w)); +#endif +} + +ccl_device_inline float4 sqr(const float4& a) +{ + return a * a; +} + ccl_device_inline float4 cross(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ @@ -254,20 +275,24 @@ ccl_device_inline bool is_zero(const float4& a) #endif } -ccl_device_inline float reduce_add(const float4& a) +ccl_device_inline float4 reduce_add(const float4& a) { #ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE3__ + float4 h(_mm_hadd_ps(a.m128, a.m128)); + return float4( _mm_hadd_ps(h.m128, h.m128)); +# else float4 h(shuffle<1,0,3,2>(a) + a); - /* TODO(sergey): Investigate efficiency. */ - return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); + return shuffle<2,3,0,1>(h) + h; +# endif #else - return ((a.x + a.y) + (a.z + a.w)); + return make_float4(((a.x + a.y) + (a.z + a.w))); #endif } ccl_device_inline float average(const float4& a) { - return reduce_add(a) * 0.25f; + return reduce_add(a)[0] * 0.25f; } ccl_device_inline float len(const float4& a) @@ -309,6 +334,18 @@ ccl_device_inline float4 max(const float4& a, const float4& b) max(a.w, b.w)); #endif } + +ccl_device_inline float4 fabs(const float4& a) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); +#else + return make_float4(fabsf(a.x), + fabsf(a.y), + fabsf(a.z), + fabsf(a.w)); +#endif +} #endif /* !__KERNEL_OPENCL__*/ #ifdef __KERNEL_SSE__ @@ -320,11 +357,28 @@ __forceinline const float4 shuffle(const float4& b) _MM_SHUFFLE(index_3, index_2, index_1, index_0)))); } +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> +__forceinline const float4 shuffle(const float4& a, const float4& b) +{ + return float4(_mm_shuffle_ps(a.m128, b.m128, + _MM_SHUFFLE(index_3, index_2, index_1, index_0))); +} + template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b) { return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)))); } +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b) +{ + return float4(_mm_movelh_ps(a.m128, b.m128)); +} + +template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b) +{ + return float4(_mm_movehl_ps(b.m128, a.m128)); +} + # ifdef __KERNEL_SSE3__ template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b) { @@ -344,9 +398,7 @@ ccl_device_inline float4 select(const int4& mask, const float4& b) { #ifdef __KERNEL_SSE__ - /* TODO(sergey): avoid cvt. */ - return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), - _mm_andnot_ps(_mm_cvtepi32_ps(mask), b))); + return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128))); #else return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, @@ -355,6 +407,13 @@ ccl_device_inline float4 select(const int4& mask, #endif } +ccl_device_inline float4 mask(const int4& mask, + const float4& a) +{ + /* Replace elements of x with zero where mask isn't set. */ + return select(mask, a, make_float4(0.0f)); +} + ccl_device_inline float4 reduce_min(const float4& a) { #ifdef __KERNEL_SSE__ @@ -375,17 +434,15 @@ ccl_device_inline float4 reduce_max(const float4& a) #endif } -#if 0 -ccl_device_inline float4 reduce_add(const float4& a) +ccl_device_inline float4 load_float4(const float *v) { #ifdef __KERNEL_SSE__ - float4 h = shuffle<1,0,3,2>(a) + a; - return shuffle<2,3,0,1>(h) + h; + return float4(_mm_loadu_ps(v)); #else - return make_float4((a.x + a.y) + (a.z + a.w)); + return make_float4(v[0], v[1], v[2], v[3]); #endif } -#endif + #endif /* !__KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h index c7511f8306e..b31dbe4fc67 100644 --- a/intern/cycles/util/util_math_matrix.h +++ b/intern/cycles/util/util_math_matrix.h @@ -223,20 +223,20 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float { const float singular_epsilon = 1e-9f; - for (int row = 0; row < n; row++) { - for (int col = 0; col < n; col++) { + for(int row = 0; row < n; row++) { + for(int col = 0; col < n; col++) { MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f; } } - for (int sweep = 0; sweep < 8; sweep++) { + for(int sweep = 0; sweep < 8; sweep++) { float off_diagonal = 0.0f; - for (int row = 1; row < n; row++) { - for (int col = 0; col < row; col++) { + for(int row = 1; row < n; row++) { + for(int col = 0; col < row; col++) { off_diagonal += fabsf(MAT(A, n, row, col)); } } - if (off_diagonal < 1e-7f) { + if(off_diagonal < 1e-7f) { /* The matrix has nearly reached diagonal form. * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */ break; @@ -253,7 +253,7 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float float abs_element = fabsf(element); /* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */ - if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) { + if(sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) { MAT(A, n, row, col) = 0.0f; continue; } @@ -272,10 +272,10 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float * Then, we compute sin(phi) and cos(phi) themselves. */ float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col); float ratio; - if (abs_element > singular_epsilon*fabsf(singular_diff)) { + if(abs_element > singular_epsilon*fabsf(singular_diff)) { float cot_2phi = 0.5f*singular_diff / element; ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi)); - if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */ + if(cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */ } else { ratio = element / singular_diff; @@ -315,21 +315,21 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float } /* Sort eigenvalues and the associated eigenvectors. */ - for (int i = 0; i < n - 1; i++) { + for(int i = 0; i < n - 1; i++) { float v = MAT(A, n, i, i); int k = i; - for (int j = i; j < n; j++) { - if (MAT(A, n, j, j) >= v) { + for(int j = i; j < n; j++) { + if(MAT(A, n, j, j) >= v) { v = MAT(A, n, j, j); k = j; } } - if (k != i) { + if(k != i) { /* Swap eigenvalues. */ MAT(A, n, k, k) = MAT(A, n, i, i); MAT(A, n, i, i) = v; /* Swap eigenvectors. */ - for (int j = 0; j < n; j++) { + for(int j = 0; j < n; j++) { float v = MATS(V, n, i, j, v_stride); MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride); MATS(V, n, k, j, v_stride) = v; @@ -339,59 +339,59 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float } #ifdef __KERNEL_SSE3__ -ccl_device_inline void math_vector_zero_sse(__m128 *A, int n) +ccl_device_inline void math_vector_zero_sse(float4 *A, int n) { for(int i = 0; i < n; i++) { - A[i] = _mm_setzero_ps(); + A[i] = make_float4(0.0f); } } -ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n) +ccl_device_inline void math_matrix_zero_sse(float4 *A, int n) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_setzero_ps(); + MAT(A, n, row, col) = make_float4(0.0f); } } } /* Add Gramian matrix of v to A. * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */ -ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight) +ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight)); + MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight; } } } -ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a) +ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a) { for(int i = 0; i < n; i++) { - V[i] = _mm_add_ps(V[i], a[i]); + V[i] += a[i]; } } -ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a) +ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a) { for(int i = 0; i < n; i++) { - V[i] = _mm_mul_ps(V[i], a[i]); + V[i] *= a[i]; } } -ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n) +ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n) { for(int i = 0; i < n; i++) { - a[i] = _mm_max_ps(a[i], b[i]); + a[i] = max(a[i], b[i]); } } -ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B) +ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col)); + MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0]; } } } diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h index 6f70a474fe7..0382c0811dd 100644 --- a/intern/cycles/util/util_optimization.h +++ b/intern/cycles/util/util_optimization.h @@ -19,16 +19,6 @@ #ifndef __KERNEL_GPU__ -/* quiet unused define warnings */ -#if defined(__KERNEL_SSE2__) || \ - defined(__KERNEL_SSE3__) || \ - defined(__KERNEL_SSSE3__) || \ - defined(__KERNEL_SSE41__) || \ - defined(__KERNEL_AVX__) || \ - defined(__KERNEL_AVX2__) - /* do nothing */ -#endif - /* x86 * * Compile a regular, SSE2 and SSE3 kernel. */ @@ -73,48 +63,6 @@ #endif /* defined(__x86_64__) || defined(_M_X64) */ -/* SSE Experiment - * - * This is disabled code for an experiment to use SSE types globally for types - * such as float3 and float4. Currently this gives an overall slowdown. */ - -#if 0 -# define __KERNEL_SSE__ -# ifndef __KERNEL_SSE2__ -# define __KERNEL_SSE2__ -# endif -# ifndef __KERNEL_SSE3__ -# define __KERNEL_SSE3__ -# endif -# ifndef __KERNEL_SSSE3__ -# define __KERNEL_SSSE3__ -# endif -# ifndef __KERNEL_SSE4__ -# define __KERNEL_SSE4__ -# endif -#endif - -/* SSE Intrinsics includes - * - * We assume __KERNEL_SSEX__ flags to have been defined at this point */ - -/* SSE intrinsics headers */ -#ifndef FREE_WINDOWS64 - -#ifdef _MSC_VER -# include <intrin.h> -#elif (defined(__x86_64__) || defined(__i386__)) -# include <x86intrin.h> -#endif - -#else - -/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>. - * Since we can't avoid including <windows.h>, better only include that */ -#include "util/util_windows.h" - -#endif - #endif #endif /* __UTIL_OPTIMIZATION_H__ */ diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 587febe3e52..a2b3247b207 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -18,19 +18,38 @@ #ifndef __UTIL_SIMD_TYPES_H__ #define __UTIL_SIMD_TYPES_H__ +#ifndef __KERNEL_GPU__ + #include <limits> #include "util/util_debug.h" -#include "util/util_types.h" +#include "util/util_defines.h" + +/* SSE Intrinsics includes + * + * We assume __KERNEL_SSEX__ flags to have been defined at this point */ + +/* SSE intrinsics headers */ +#ifndef FREE_WINDOWS64 + +#ifdef _MSC_VER +# include <intrin.h> +#elif (defined(__x86_64__) || defined(__i386__)) +# include <x86intrin.h> +#endif + +#else + +/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>. + * Since we can't avoid including <windows.h>, better only include that */ +#include "util/util_windows.h" + +#endif CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ -struct sseb; -struct ssei; -struct ssef; - extern const __m128 _mm_lookupmask_ps[16]; /* Special Types */ @@ -328,12 +347,9 @@ __forceinline size_t __bscf(size_t& v) #endif /* _WIN32 */ -static const unsigned int BITSCAN_NO_BIT_SET_32 = 32; -static const size_t BITSCAN_NO_BIT_SET_64 = 64; +#if !(defined(__SSE4_1__) || defined(__SSE4_2__)) -#ifdef __KERNEL_SSE3__ -/* Emulation of SSE4 functions with SSE3 */ -# ifndef __KERNEL_SSE41__ +/* Emulation of SSE4 functions with SSE2 */ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 @@ -342,48 +358,31 @@ static const size_t BITSCAN_NO_BIT_SET_64 = 64; #define _MM_FROUND_CUR_DIRECTION 0x04 #undef _mm_blendv_ps -#define _mm_blendv_ps __emu_mm_blendv_ps __forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) { return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value)); } #undef _mm_blend_ps -#define _mm_blend_ps __emu_mm_blend_ps __forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) { assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]); } #undef _mm_blendv_epi8 -#define _mm_blendv_epi8 __emu_mm_blendv_epi8 __forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) { return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value)); } -#undef _mm_mullo_epi32 -#define _mm_mullo_epi32 __emu_mm_mullo_epi32 -__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) { - __m128i rvalue; - char* _r = (char*)(&rvalue + 1); - char* _v = (char*)(& value + 1); - char* _i = (char*)(& input + 1); - for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))* *((int32_t*)(_i + i)); - return rvalue; -} - #undef _mm_min_epi32 -#define _mm_min_epi32 __emu_mm_min_epi32 __forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) { return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input)); } #undef _mm_max_epi32 -#define _mm_max_epi32 __emu_mm_max_epi32 __forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) { return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input)); } #undef _mm_extract_epi32 -#define _mm_extract_epi32 __emu_mm_extract_epi32 __forceinline int _mm_extract_epi32( __m128i input, const int index ) { switch ( index ) { case 0: return _mm_cvtsi128_si32(input); @@ -395,24 +394,15 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) { } #undef _mm_insert_epi32 -#define _mm_insert_epi32 __emu_mm_insert_epi32 __forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) { assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value; } -#undef _mm_extract_ps -#define _mm_extract_ps __emu_mm_extract_ps -__forceinline int _mm_extract_ps( __m128 input, const int index ) { - int32_t* ptr = (int32_t*)&input; return ptr[index]; -} - #undef _mm_insert_ps -#define _mm_insert_ps __emu_mm_insert_ps __forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index ) { assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); } #undef _mm_round_ps -#define _mm_round_ps __emu_mm_round_ps __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) { switch ( flags ) @@ -425,57 +415,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) return value; } -# ifdef _M_X64 -#undef _mm_insert_epi64 -#define _mm_insert_epi64 __emu_mm_insert_epi64 -__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { - assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; -} - -#undef _mm_extract_epi64 -#define _mm_extract_epi64 __emu_mm_extract_epi64 -__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) { - assert(size_t(index) < 2); - return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); -} -# endif - -# endif - -#undef _mm_fabs_ps -#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))) - -/* Return a __m128 with every element set to the largest element of v. */ -ccl_device_inline __m128 _mm_hmax_ps(__m128 v) -{ - /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */ - v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v)); - /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */ - v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v)); - return v; -} - -/* Return the sum of the four elements of x. */ -ccl_device_inline float _mm_hsum_ss(__m128 x) -{ - __m128 a = _mm_movehdup_ps(x); - __m128 b = _mm_add_ps(x, a); - return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b)); -} - -/* Return a __m128 with every element set to the sum of the four elements of x. */ -ccl_device_inline __m128 _mm_hsum_ps(__m128 x) -{ - x = _mm_hadd_ps(x, x); - x = _mm_hadd_ps(x, x); - return x; -} - -/* Replace elements of x with zero where mask isn't set. */ -#undef _mm_mask_ps -#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask) - -#endif +#endif /* !(defined(__SSE4_1__) || defined(__SSE4_2__)) */ #else /* __KERNEL_SSE2__ */ @@ -496,13 +436,19 @@ ccl_device_inline int bitscan(int value) #endif /* __KERNEL_SSE2__ */ +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) || \ + defined(__KERNEL_SSE3__) || \ + defined(__KERNEL_SSSE3__) || \ + defined(__KERNEL_SSE41__) || \ + defined(__KERNEL_AVX__) || \ + defined(__KERNEL_AVX2__) + /* do nothing */ +#endif + CCL_NAMESPACE_END -#include "util/util_math.h" -#include "util/util_sseb.h" -#include "util/util_ssei.h" -#include "util/util_ssef.h" -#include "util/util_avxf.h" +#endif /* __KERNEL_GPU__ */ #endif /* __UTIL_SIMD_TYPES_H__ */ diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h index 6e669701f3b..93c22aafdcd 100644 --- a/intern/cycles/util/util_sseb.h +++ b/intern/cycles/util/util_sseb.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct ssei; +struct ssef; + /*! 4-wide SSE bool type. */ struct sseb { diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h index cf99a08efae..bb007ff84a9 100644 --- a/intern/cycles/util/util_ssef.h +++ b/intern/cycles/util/util_ssef.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct sseb; +struct ssef; + /*! 4-wide SSE float type. */ struct ssef { diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h index 5f62569268c..ef2a9e68b7d 100644 --- a/intern/cycles/util/util_ssei.h +++ b/intern/cycles/util/util_ssei.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct sseb; +struct ssef; + /*! 4-wide SSE integer type. */ struct ssei { @@ -234,8 +237,10 @@ __forceinline size_t select_max(const sseb& valid, const ssei& v) { const ssei a #else -__forceinline int reduce_min(const ssei& v) { return min(min(v[0],v[1]),min(v[2],v[3])); } -__forceinline int reduce_max(const ssei& v) { return max(max(v[0],v[1]),max(v[2],v[3])); } +__forceinline int ssei_min(int a, int b) { return (a < b)? a: b; } +__forceinline int ssei_max(int a, int b) { return (a > b)? a: b; } +__forceinline int reduce_min(const ssei& v) { return ssei_min(ssei_min(v[0],v[1]),ssei_min(v[2],v[3])); } +__forceinline int reduce_max(const ssei& v) { return ssei_max(ssei_max(v[0],v[1]),ssei_max(v[2],v[3])); } __forceinline int reduce_add(const ssei& v) { return v[0]+v[1]+v[2]+v[3]; } #endif diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index a5d1d7152d5..d9642df8005 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -21,72 +21,17 @@ # include <stdlib.h> #endif -/* Bitness */ +/* Standard Integer Types */ -#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) -# define __KERNEL_64_BIT__ +#if !defined(__KERNEL_GPU__) && !defined(_WIN32) +# include <stdint.h> #endif -/* Qualifiers for kernel code shared by CPU and GPU */ - -#ifndef __KERNEL_GPU__ -# define ccl_device static inline -# define ccl_device_noinline static -# define ccl_global -# define ccl_constant -# define ccl_local -# define ccl_local_param -# define ccl_private -# define ccl_restrict __restrict -# define __KERNEL_WITH_SSE_ALIGN__ - -# if defined(_WIN32) && !defined(FREE_WINDOWS) -# define ccl_device_inline static __forceinline -# define ccl_device_forceinline static __forceinline -# define ccl_align(...) __declspec(align(__VA_ARGS__)) -# ifdef __KERNEL_64_BIT__ -# define ccl_try_align(...) __declspec(align(__VA_ARGS__)) -# else /* __KERNEL_64_BIT__ */ -# undef __KERNEL_WITH_SSE_ALIGN__ -/* No support for function arguments (error C2719). */ -# define ccl_try_align(...) -# endif /* __KERNEL_64_BIT__ */ -# define ccl_may_alias -# define ccl_always_inline __forceinline -# define ccl_never_inline __declspec(noinline) -# define ccl_maybe_unused -# else /* _WIN32 && !FREE_WINDOWS */ -# define ccl_device_inline static inline __attribute__((always_inline)) -# define ccl_device_forceinline static inline __attribute__((always_inline)) -# define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) -# ifndef FREE_WINDOWS64 -# define __forceinline inline __attribute__((always_inline)) -# endif -# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) -# define ccl_may_alias __attribute__((__may_alias__)) -# define ccl_always_inline __attribute__((always_inline)) -# define ccl_never_inline __attribute__((noinline)) -# define ccl_maybe_unused __attribute__((used)) -# endif /* _WIN32 && !FREE_WINDOWS */ - -/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */ -# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ -# define ATTR_FALLTHROUGH __attribute__((fallthrough)) -# else -# define ATTR_FALLTHROUGH ((void)0) -# endif -#endif /* __KERNEL_GPU__ */ - -/* Standard Integer Types */ +#include "util/util_defines.h" #ifndef __KERNEL_GPU__ -/* int8_t, uint16_t, and friends */ -# ifndef _WIN32 -# include <stdint.h> -# endif -/* SIMD Types */ -# include "util/util_optimization.h" -#endif /* __KERNEL_GPU__ */ +# include "util/util_simd.h" +#endif CCL_NAMESPACE_BEGIN @@ -201,65 +146,8 @@ enum ExtensionType { EXTENSION_NUM_TYPES, }; -/* macros */ - -/* hints for branch prediction, only use in code that runs a _lot_ */ -#if defined(__GNUC__) && defined(__KERNEL_CPU__) -# define LIKELY(x) __builtin_expect(!!(x), 1) -# define UNLIKELY(x) __builtin_expect(!!(x), 0) -#else -# define LIKELY(x) (x) -# define UNLIKELY(x) (x) -#endif - -#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800)) -# define HAS_CPP11_FEATURES -#endif - -#if defined(__GNUC__) || defined(__clang__) -# if defined(HAS_CPP11_FEATURES) -/* Some magic to be sure we don't have reference in the type. */ -template<typename T> static inline T decltype_helper(T x) { return x; } -# define TYPEOF(x) decltype(decltype_helper(x)) -# else -# define TYPEOF(x) typeof(x) -# endif -#endif - -/* Causes warning: - * incompatible types when assigning to type 'Foo' from type 'Bar' - * ... the compiler optimizes away the temp var */ -#ifdef __GNUC__ -#define CHECK_TYPE(var, type) { \ - TYPEOF(var) *__tmp; \ - __tmp = (type *)NULL; \ - (void)__tmp; \ -} (void)0 - -#define CHECK_TYPE_PAIR(var_a, var_b) { \ - TYPEOF(var_a) *__tmp; \ - __tmp = (typeof(var_b) *)NULL; \ - (void)__tmp; \ -} (void)0 -#else -# define CHECK_TYPE(var, type) -# define CHECK_TYPE_PAIR(var_a, var_b) -#endif - -/* can be used in simple macros */ -#define CHECK_TYPE_INLINE(val, type) \ - ((void)(((type)0) != (val))) - - CCL_NAMESPACE_END -#ifndef __KERNEL_GPU__ -# include <cassert> -# define util_assert(statement) assert(statement) -#else -# define util_assert(statement) -#endif - /* Vectorized types declaration. */ #include "util/util_types_uchar2.h" #include "util/util_types_uchar3.h" @@ -298,5 +186,13 @@ CCL_NAMESPACE_END #include "util/util_types_vector3_impl.h" +/* SSE types. */ +#ifndef __KERNEL_GPU__ +# include "util/util_sseb.h" +# include "util/util_ssei.h" +# include "util/util_ssef.h" +# include "util/util_avxf.h" +#endif + #endif /* __UTIL_TYPES_H__ */ diff --git a/source/blender/editors/space_clip/space_clip.c b/source/blender/editors/space_clip/space_clip.c index 9af3ebf3cbb..1872fe108ca 100644 --- a/source/blender/editors/space_clip/space_clip.c +++ b/source/blender/editors/space_clip/space_clip.c @@ -821,7 +821,8 @@ static void clip_keymap(struct wmKeyConfig *keyconf) #endif } -static const char *clip_context_dir[] = {"edit_movieclip", "edit_mask", NULL}; +/* DO NOT make this static, this hides the symbol and breaks API generation script. */ +const char *clip_context_dir[] = {"edit_movieclip", "edit_mask", NULL}; static int clip_context(const bContext *C, const char *member, bContextDataResult *result) { diff --git a/source/blender/editors/space_sequencer/space_sequencer.c b/source/blender/editors/space_sequencer/space_sequencer.c index 5dfcba9b4d1..3b04e6c80cd 100644 --- a/source/blender/editors/space_sequencer/space_sequencer.c +++ b/source/blender/editors/space_sequencer/space_sequencer.c @@ -436,7 +436,8 @@ static void sequencer_dropboxes(void) /* ************* end drop *********** */ -static const char *sequencer_context_dir[] = {"edit_mask", NULL}; +/* DO NOT make this static, this hides the symbol and breaks API generation script. */ +const char *sequencer_context_dir[] = {"edit_mask", NULL}; static int sequencer_context(const bContext *C, const char *member, bContextDataResult *result) { diff --git a/source/blender/windowmanager/intern/wm_files.c b/source/blender/windowmanager/intern/wm_files.c index cedf50a3035..221baeadbee 100644 --- a/source/blender/windowmanager/intern/wm_files.c +++ b/source/blender/windowmanager/intern/wm_files.c @@ -729,20 +729,20 @@ int wm_homefile_read( if (filepath_startup_override != NULL) { /* pass */ } - else if (app_template_override && app_template_override[0]) { + else if (app_template_override) { + /* This may be clearing the current template by setting to an empty string. */ app_template = app_template_override; } else if (!use_factory_settings && U.app_template[0]) { app_template = U.app_template; } - if (app_template != NULL) { + if ((app_template != NULL) && (app_template[0] != '\0')) { BKE_appdir_app_template_id_search(app_template, app_template_system, sizeof(app_template_system)); BLI_path_join(app_template_config, sizeof(app_template_config), cfgdir, app_template, NULL); - } - /* insert template name into startup file */ - if (app_template != NULL) { + /* Insert template name into startup file. */ + /* note that the path is being set even when 'use_factory_settings == true' * this is done so we can load a templates factory-settings */ if (!use_factory_settings) { diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt index 28ce095b0e8..afd61d5ab67 100644 --- a/tests/python/CMakeLists.txt +++ b/tests/python/CMakeLists.txt @@ -518,6 +518,7 @@ if(WITH_CYCLES) -blender "$<TARGET_FILE:blender>" -testdir "${TEST_SRC_DIR}/cycles/ctests/${subject}" -idiff "${OPENIMAGEIO_IDIFF}" + -outdir "${TEST_OUT_DIR}/cycles" ) else() add_test( @@ -526,17 +527,23 @@ if(WITH_CYCLES) -blender "$<TARGET_FILE:blender>" -testdir "${TEST_SRC_DIR}/cycles/ctests/${subject}" -idiff "${OPENIMAGEIO_IDIFF}" + -outdir "${TEST_OUT_DIR}/cycles" ) endif() endmacro() if(WITH_OPENGL_TESTS) add_cycles_render_test(opengl) endif() - add_cycles_render_test(image) + add_cycles_render_test(displacement) + add_cycles_render_test(image_data_types) + add_cycles_render_test(image_mapping) + add_cycles_render_test(image_texture_limit) add_cycles_render_test(mblur) add_cycles_render_test(reports) add_cycles_render_test(render) add_cycles_render_test(shader) + add_cycles_render_test(shadow_catcher) + add_cycles_render_test(volume) else() MESSAGE(STATUS "Disabling Cycles tests because tests folder does not exist") endif() diff --git a/tests/python/cycles_render_tests.py b/tests/python/cycles_render_tests.py index a030cc5e0de..ea84f27ab7e 100755 --- a/tests/python/cycles_render_tests.py +++ b/tests/python/cycles_render_tests.py @@ -2,7 +2,9 @@ # Apache License, Version 2.0 import argparse +import glob import os +import pathlib import shutil import subprocess import sys @@ -24,7 +26,7 @@ class COLORS_DUMMY: COLORS = COLORS_DUMMY -def printMessage(type, status, message): +def print_message(message, type=None, status=''): if type == 'SUCCESS': print(COLORS.GREEN, end="") elif type == 'FAILURE': @@ -109,20 +111,126 @@ def test_get_name(filepath): filename = os.path.basename(filepath) return os.path.splitext(filename)[0] - -def verify_output(filepath): +def test_get_images(filepath): testname = test_get_name(filepath) dirpath = os.path.dirname(filepath) - reference_dirpath = os.path.join(dirpath, "reference_renders") - reference_image = os.path.join(reference_dirpath, testname + ".png") - failed_image = os.path.join(reference_dirpath, testname + ".fail.png") - if not os.path.exists(reference_image): + ref_dirpath = os.path.join(dirpath, "reference_renders") + ref_img = os.path.join(ref_dirpath, testname + ".png") + new_dirpath = os.path.join(OUTDIR, os.path.basename(dirpath)) + if not os.path.exists(new_dirpath): + os.makedirs(new_dirpath) + new_img = os.path.join(new_dirpath, testname + ".png") + diff_dirpath = os.path.join(OUTDIR, os.path.basename(dirpath), "diff") + if not os.path.exists(diff_dirpath): + os.makedirs(diff_dirpath) + diff_img = os.path.join(diff_dirpath, testname + ".diff.png") + return ref_img, new_img, diff_img + + +class Report: + def __init__(self, testname): + self.failed_tests = "" + self.passed_tests = "" + self.testname = testname + + def output(self): + # write intermediate data for single test + outdir = os.path.join(OUTDIR, self.testname) + f = open(os.path.join(outdir, "failed.data"), "w") + f.write(self.failed_tests) + f.close() + + f = open(os.path.join(outdir, "passed.data"), "w") + f.write(self.passed_tests) + f.close() + + # gather intermediate data for all tests + failed_data = sorted(glob.glob(os.path.join(OUTDIR, "*/failed.data"))) + passed_data = sorted(glob.glob(os.path.join(OUTDIR, "*/passed.data"))) + + failed_tests = "" + passed_tests = "" + + for filename in failed_data: + failed_tests += open(os.path.join(OUTDIR, filename), "r").read() + for filename in passed_data: + passed_tests += open(os.path.join(OUTDIR, filename), "r").read() + + # write html for all tests + self.html = """ +<html> +<head> + <title>Cycles Test Report</title> + <style> + img {{ image-rendering: pixelated; width: 256; background-color: #000; }} + table td:first-child {{ width: 100%; }} + </style> + <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css"> +</head> +<body> + <div class="container"> + <br/> + <h1>Cycles Test Report</h1> + <br/> + <table class="table table-striped"> + <thead class="thead-default"> + <tr><th>Name</th><th>New</th><th>Reference</th><th>Diff</th> + </thead> + {}{} + </table> + <br/> + </div> +</body> +</html> + """ . format(failed_tests, passed_tests) + + filepath = os.path.join(OUTDIR, "report.html") + f = open(filepath, "w") + f.write(self.html) + f.close() + + print_message("Report saved to: " + pathlib.Path(filepath).as_uri()) + + def add_test(self, filepath, error): + name = test_get_name(filepath) + + ref_img, new_img, diff_img = test_get_images(filepath) + + status = error if error else "" + style = """ style="background-color: #f99;" """ if error else "" + + new_url = pathlib.Path(new_img).as_uri() + ref_url = pathlib.Path(ref_img).as_uri() + diff_url = pathlib.Path(diff_img).as_uri() + + test_html = """ + <tr{}> + <td><b>{}</b><br/>{}<br/>{}</td> + <td><img src="{}" onmouseover="this.src='{}';" onmouseout="this.src='{}';"></td> + <td><img src="{}" onmouseover="this.src='{}';" onmouseout="this.src='{}';"></td> + <td><img src="{}"></td> + </tr>""" . format(style, name, self.testname, status, + new_url, ref_url, new_url, + ref_url, new_url, ref_url, + diff_url) + + if error: + self.failed_tests += test_html + else: + self.passed_tests += test_html + + +def verify_output(report, filepath): + ref_img, new_img, diff_img = test_get_images(filepath) + if not os.path.exists(ref_img): return False + + # diff test with threshold command = ( IDIFF, - "-fail", "0.015", + "-fail", "0.016", "-failpercent", "1", - reference_image, + ref_img, TEMP_FILE, ) try: @@ -130,47 +238,66 @@ def verify_output(filepath): failed = False except subprocess.CalledProcessError as e: if VERBOSE: - print(e.output.decode("utf-8")) + print_message(e.output.decode("utf-8")) failed = e.returncode != 1 - if failed: - shutil.copy(TEMP_FILE, failed_image) - elif os.path.exists(failed_image): - os.remove(failed_image) + + # generate diff image + command = ( + IDIFF, + "-o", diff_img, + "-abs", "-scale", "16", + ref_img, + TEMP_FILE + ) + + try: + subprocess.check_output(command) + except subprocess.CalledProcessError as e: + if VERBOSE: + print_message(e.output.decode("utf-8")) + + # copy new image + if os.path.exists(new_img): + os.remove(new_img) + if os.path.exists(TEMP_FILE): + shutil.copy(TEMP_FILE, new_img) + return not failed -def run_test(filepath): +def run_test(report, filepath): testname = test_get_name(filepath) spacer = "." * (32 - len(testname)) - printMessage('SUCCESS', 'RUN', testname) + print_message(testname, 'SUCCESS', 'RUN') time_start = time.time() error = render_file(filepath) status = "FAIL" if not error: - if not verify_output(filepath): + if not verify_output(report, filepath): error = "VERIFY" time_end = time.time() elapsed_ms = int((time_end - time_start) * 1000) if not error: - printMessage('SUCCESS', 'OK', "{} ({} ms)" . - format(testname, elapsed_ms)) + print_message("{} ({} ms)" . format(testname, elapsed_ms), + 'SUCCESS', 'OK') else: if error == "NO_CYCLES": - print("Can't perform tests because Cycles failed to load!") - return False + print_message("Can't perform tests because Cycles failed to load!") + return error elif error == "NO_START": - print('Can not perform tests because blender fails to start.', + print_message('Can not perform tests because blender fails to start.', 'Make sure INSTALL target was run.') - return False + return error elif error == 'VERIFY': - print("Rendered result is different from reference image") + print_message("Rendered result is different from reference image") else: - print("Unknown error %r" % error) - printMessage('FAILURE', 'FAILED', "{} ({} ms)" . - format(testname, elapsed_ms)) + print_message("Unknown error %r" % error) + print_message("{} ({} ms)" . format(testname, elapsed_ms), + 'FAILURE', 'FAILED') return error + def blend_list(path): for dirpath, dirnames, filenames in os.walk(path): for filename in filenames: @@ -178,17 +305,18 @@ def blend_list(path): filepath = os.path.join(dirpath, filename) yield filepath - def run_all_tests(dirpath): passed_tests = [] failed_tests = [] all_files = list(blend_list(dirpath)) all_files.sort() - printMessage('SUCCESS', "==========", - "Running {} tests from 1 test case." . format(len(all_files))) + report = Report(os.path.basename(dirpath)) + print_message("Running {} tests from 1 test case." . + format(len(all_files)), + 'SUCCESS', "==========") time_start = time.time() for filepath in all_files: - error = run_test(filepath) + error = run_test(report, filepath) testname = test_get_name(filepath) if error: if error == "NO_CYCLES": @@ -198,28 +326,33 @@ def run_all_tests(dirpath): failed_tests.append(testname) else: passed_tests.append(testname) + report.add_test(filepath, error) time_end = time.time() elapsed_ms = int((time_end - time_start) * 1000) - print("") - printMessage('SUCCESS', "==========", - "{} tests from 1 test case ran. ({} ms total)" . - format(len(all_files), elapsed_ms)) - printMessage('SUCCESS', 'PASSED', "{} tests." . - format(len(passed_tests))) + print_message("") + print_message("{} tests from 1 test case ran. ({} ms total)" . + format(len(all_files), elapsed_ms), + 'SUCCESS', "==========") + print_message("{} tests." . + format(len(passed_tests)), + 'SUCCESS', 'PASSED') if failed_tests: - printMessage('FAILURE', 'FAILED', "{} tests, listed below:" . - format(len(failed_tests))) + print_message("{} tests, listed below:" . + format(len(failed_tests)), + 'FAILURE', 'FAILED') failed_tests.sort() for test in failed_tests: - printMessage('FAILURE', "FAILED", "{}" . format(test)) - return False - return True + print_message("{}" . format(test), 'FAILURE', "FAILED") + + report.output() + return not bool(failed_tests) def create_argparse(): parser = argparse.ArgumentParser() parser.add_argument("-blender", nargs="+") parser.add_argument("-testdir", nargs=1) + parser.add_argument("-outdir", nargs=1) parser.add_argument("-idiff", nargs=1) return parser @@ -229,7 +362,7 @@ def main(): args = parser.parse_args() global COLORS - global BLENDER, ROOT, IDIFF + global BLENDER, TESTDIR, IDIFF, OUTDIR global TEMP_FILE, TEMP_FILE_MASK, TEST_SCRIPT global VERBOSE @@ -237,8 +370,12 @@ def main(): COLORS = COLORS_ANSI BLENDER = args.blender[0] - ROOT = args.testdir[0] + TESTDIR = args.testdir[0] IDIFF = args.idiff[0] + OUTDIR = args.outdir[0] + + if not os.path.exists(OUTDIR): + os.makedirs(OUTDIR) TEMP = tempfile.mkdtemp() TEMP_FILE_MASK = os.path.join(TEMP, "test") @@ -248,7 +385,7 @@ def main(): VERBOSE = os.environ.get("BLENDER_VERBOSE") is not None - ok = run_all_tests(ROOT) + ok = run_all_tests(TESTDIR) # Cleanup temp files and folders if os.path.exists(TEMP_FILE): |