Cycles: workaround for noise performance regression in CUDA 5.5

Use manual ternary operation widening in grad(). Without it, nvcc 5.5 produces multiple branch splits with very large branches (because of inlining). This fixes a 19% performance regression for BMW1M-MikePan.blend. Also remove one redundant instruction in the Perlin SSE code: when h == 12 or h == 14, h is always >= 4, so the extra "and not (h < 4)" mask is unnecessary. Reviewed By: brecht Differential Revision: https://developer.blender.org/D190
author: Sv. Lockal <lockalsash@gmail.com> 2014-01-08 22:25:55 +0400
committer: Sv. Lockal <lockalsash@gmail.com> 2014-01-08 22:25:55 +0400
commit: 20b046d7639c6c95d942760f8004c449fc42b081 (patch)
tree: ecd6332fbb37a260c8ef93aa88a0a3750962f9b8 /intern/cycles/kernel/svm/svm_noise.h
parent: 61c9cacbd73f08536196d406a73aca83cd2a87f1 (diff)
1 files changed, 5 insertions, 5 deletions
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index add5f4c1ce1..a58dfdf4f9a 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -167,7 +167,8 @@ ccl_device float grad(int hash, float x, float y, float z)
 	// use vectors pointing to the edges of the cube
 	int h = hash & 15;
 	float u = h<8 ? x : y;
-	float v = h<4 ? y : h == 12 || h == 14 ? x : z;
+	float vt = ((h == 12) | (h == 14)) ? x : z;
+	float v = h < 4 ? y : vt;
 	return ((h&1) ? -u : u) + ((h&2) ? -v : v);
 }
 
@@ -185,11 +186,10 @@ ccl_device_inline __m128 grad_sse(const __m128i *hash, const __m128 *x, const __
 
 	__m128i case_vy = _mm_cmplt_epi32(h, _mm_set1_epi32(4));       // 0xffffffff if h < 4 else 0
 
-	__m128i case_h12_raw = _mm_cmpeq_epi32(h, _mm_set1_epi32(12)); // 0xffffffff if h == 12 else 0
-	__m128i case_h14_raw = _mm_cmpeq_epi32(h, _mm_set1_epi32(14)); // 0xffffffff if h == 14 else 0
+	__m128i case_h12 = _mm_cmpeq_epi32(h, _mm_set1_epi32(12));     // 0xffffffff if h == 12 else 0
+	__m128i case_h14 = _mm_cmpeq_epi32(h, _mm_set1_epi32(14));     // 0xffffffff if h == 14 else 0
 
-	__m128i case_vxtmp = _mm_or_si128(case_h12_raw, case_h14_raw); // 0xffffffff if h == 12 or h == 14 else 0
-	__m128i case_vx = _mm_andnot_si128(case_vy, case_vxtmp);       // 0xffffffff if (h == 12 or h == 14) and not(h<4)
+	__m128i case_vx = _mm_or_si128(case_h12, case_h14);            // 0xffffffff if h == 12 or h == 14 else 0
 
 	__m128 v = blend(_mm_castsi128_ps(case_vy), *y, blend(_mm_castsi128_ps(case_vx), *x, *z)); // v = h<4 ? y : h == 12 || h == 14 ? x : z
author	Sv. Lockal <lockalsash@gmail.com>	2014-01-08 22:25:55 +0400
committer	Sv. Lockal <lockalsash@gmail.com>	2014-01-08 22:25:55 +0400
commit	20b046d7639c6c95d942760f8004c449fc42b081 (patch)
tree	ecd6332fbb37a260c8ef93aa88a0a3750962f9b8 /intern/cycles/kernel/svm/svm_noise.h
parent	61c9cacbd73f08536196d406a73aca83cd2a87f1 (diff)