Cycles: Use utility define for restrict pointers

This way restrict can be used for CUDA and OpenCL as well. From quick tests in the areas I've been testing, this might give a barely measurable percentage of speedup, but it increases register pressure. So use of this qualifier is still really limited.
author: Sergey Sharybin <sergey.vfx@gmail.com> 2016-07-11 14:53:37 +0300
committer: Sergey Sharybin <sergey.vfx@gmail.com> 2016-07-11 14:58:47 +0300
commit: cb3b19730c4fa402c065e288330f4f1f197026ab (patch)
tree: 9ecbafc8af4ad7a1027a47eddc6fc0b8b7ce49e6 /intern/cycles/kernel
parent: cf82b49a0fd116d87b4c7e96e39bb02fb9e964bf (diff)
3 files changed, 23 insertions, 21 deletions
diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h
index 6dfb1c08e27..a833f4b1248 100644
--- a/intern/cycles/kernel/bvh/qbvh_nodes.h
+++ b/intern/cycles/kernel/bvh/qbvh_nodes.h
@@ -22,27 +22,27 @@ struct QBVHStackItem {
 /* TOOD(sergey): Investigate if using intrinsics helps for both
  * stack item swap and float comparison.
  */
-ccl_device_inline void qbvh_item_swap(QBVHStackItem *__restrict a,
-                                      QBVHStackItem *__restrict b)
+ccl_device_inline void qbvh_item_swap(QBVHStackItem *ccl_restrict a,
+                                      QBVHStackItem *ccl_restrict b)
 {
 	QBVHStackItem tmp = *a;
 	*a = *b;
 	*b = tmp;
 }
 
-ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
-                                       QBVHStackItem *__restrict s2,
-                                       QBVHStackItem *__restrict s3)
+ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
+                                       QBVHStackItem *ccl_restrict s2,
+                                       QBVHStackItem *ccl_restrict s3)
 {
 	if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
 	if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); }
 	if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
 }
 
-ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
-                                       QBVHStackItem *__restrict s2,
-                                       QBVHStackItem *__restrict s3,
-                                       QBVHStackItem *__restrict s4)
+ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
+                                       QBVHStackItem *ccl_restrict s2,
+                                       QBVHStackItem *ccl_restrict s3,
+                                       QBVHStackItem *ccl_restrict s4)
 {
 	if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
 	if(s4->dist < s3->dist) { qbvh_item_swap(s4, s3); }
@@ -53,7 +53,7 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
 
 /* Axis-aligned nodes intersection */
 
-ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *__restrict kg,
+ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
                                                   const ssef& tnear,
                                                   const ssef& tfar,
 #ifdef __KERNEL_AVX2__
@@ -69,7 +69,7 @@ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *__restrict kg,
                                                   const int far_y,
                                                   const int far_z,
                                                   const int node_addr,
-                                                  ssef *__restrict dist)
+                                                  ssef *ccl_restrict dist)
 {
 	const int offset = node_addr + 1;
 #ifdef __KERNEL_AVX2__
@@ -104,7 +104,7 @@ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *__restrict kg,
 }
 
 ccl_device_inline int qbvh_aligned_node_intersect_robust(
-        KernelGlobals *__restrict kg,
+        KernelGlobals *ccl_restrict kg,
         const ssef& tnear,
         const ssef& tfar,
 #ifdef __KERNEL_AVX2__
@@ -121,7 +121,7 @@ ccl_device_inline int qbvh_aligned_node_intersect_robust(
         const int far_z,
         const int node_addr,
         const float difl,
-        ssef *__restrict dist)
+        ssef *ccl_restrict dist)
 {
 	const int offset = node_addr + 1;
 #ifdef __KERNEL_AVX2__
@@ -152,7 +152,7 @@ ccl_device_inline int qbvh_aligned_node_intersect_robust(
 /* Unaligned nodes intersection */
 
 ccl_device_inline int qbvh_unaligned_node_intersect(
-        KernelGlobals *__restrict kg,
+        KernelGlobals *ccl_restrict kg,
         const ssef& tnear,
         const ssef& tfar,
 #ifdef __KERNEL_AVX2__
@@ -168,7 +168,7 @@ ccl_device_inline int qbvh_unaligned_node_intersect(
         const int far_y,
         const int far_z,
         const int node_addr,
-        ssef *__restrict dist)
+        ssef *ccl_restrict dist)
 {
 	const int offset = node_addr;
 	const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1);
@@ -236,7 +236,7 @@ ccl_device_inline int qbvh_unaligned_node_intersect(
 }
 
 ccl_device_inline int qbvh_unaligned_node_intersect_robust(
-        KernelGlobals *__restrict kg,
+        KernelGlobals *ccl_restrict kg,
         const ssef& tnear,
         const ssef& tfar,
 #ifdef __KERNEL_AVX2__
@@ -253,7 +253,7 @@ ccl_device_inline int qbvh_unaligned_node_intersect_robust(
         const int far_z,
         const int node_addr,
         const float difl,
-        ssef *__restrict dist)
+        ssef *ccl_restrict dist)
 {
 	const int offset = node_addr;
 	const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1);
@@ -324,7 +324,7 @@ ccl_device_inline int qbvh_unaligned_node_intersect_robust(
  */
 
 ccl_device_inline int qbvh_node_intersect(
-        KernelGlobals *__restrict kg,
+        KernelGlobals *ccl_restrict kg,
         const ssef& tnear,
         const ssef& tfar,
 #ifdef __KERNEL_AVX2__
@@ -340,7 +340,7 @@ ccl_device_inline int qbvh_node_intersect(
         const int far_y,
         const int far_z,
         const int node_addr,
-        ssef *__restrict dist)
+        ssef *ccl_restrict dist)
 {
 	const int offset = node_addr;
 	const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
@@ -377,7 +377,7 @@ ccl_device_inline int qbvh_node_intersect(
 }
 
 ccl_device_inline int qbvh_node_intersect_robust(
-        KernelGlobals *__restrict kg,
+        KernelGlobals *ccl_restrict kg,
         const ssef& tnear,
         const ssef& tfar,
 #ifdef __KERNEL_AVX2__
@@ -394,7 +394,7 @@ ccl_device_inline int qbvh_node_intersect_robust(
         const int far_z,
         const int node_addr,
         const float difl,
-        ssef *__restrict dist)
+        ssef *ccl_restrict dist)
 {
 	const int offset = node_addr;
 	const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 42314756f02..08f6f457805 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -42,6 +42,7 @@
 #define ccl_constant
 #define ccl_may_alias
 #define ccl_addr_space
+#define ccl_restrict __restrict__
 
 /* No assert supported for CUDA */
 
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index a5708448e23..8505cb85576 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -39,6 +39,7 @@
 #define ccl_global __global
 #define ccl_local __local
 #define ccl_private __private
+#define ccl_restrict restrict
 
 #ifdef __SPLIT_KERNEL__
 #  define ccl_addr_space __global
author	Sergey Sharybin <sergey.vfx@gmail.com>	2016-07-11 14:53:37 +0300
committer	Sergey Sharybin <sergey.vfx@gmail.com>	2016-07-11 14:58:47 +0300
commit	cb3b19730c4fa402c065e288330f4f1f197026ab (patch)
tree	9ecbafc8af4ad7a1027a47eddc6fc0b8b7ce49e6 /intern/cycles/kernel
parent	cf82b49a0fd116d87b4c7e96e39bb02fb9e964bf (diff)