From 95fe9b26176ef7c129f55f1c33228dfa5b2ab1c3 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Mon, 7 Aug 2017 19:47:31 +0200
Subject: Cycles: Cleanup, remove bvh prefix from curve functions

Those are nothing to do with BVH, and can be used separately.
---
 intern/cycles/kernel/bvh/bvh_shadow_all.h  | 44 +++++++++++++--------------
 intern/cycles/kernel/bvh/bvh_traversal.h   | 48 +++++++++++++++---------------
 intern/cycles/kernel/bvh/qbvh_shadow_all.h | 44 +++++++++++++--------------
 intern/cycles/kernel/bvh/qbvh_traversal.h  | 48 +++++++++++++++---------------
 intern/cycles/kernel/geom/geom_curve.h     | 47 ++++++++++++++++++++++++-----
 intern/cycles/kernel/kernel_shader.h       |  2 +-
 6 files changed, 133 insertions(+), 100 deletions(-)

diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index 72188e3a845..a6a4353562c 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -221,30 +221,30 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 							case PRIMITIVE_MOTION_CURVE: {
 								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect_array,
-									                                   P,
-									                                   dir,
-									                                   visibility,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   curve_type,
-									                                   NULL,
-									                                   0, 0);
+									hit = cardinal_curve_intersect(kg,
+									                               isect_array,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               NULL,
+									                               0, 0);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect_array,
-									                          P,
-									                          dir,
-									                          visibility,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          curve_type,
-									                          NULL,
-									                          0, 0);
+									hit = curve_intersect(kg,
+									                      isect_array,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      NULL,
+									                      0, 0);
 								}
 								break;
 							}
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index bc09237b975..ae8f54821f2 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -298,32 +298,32 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
 								bool hit;
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect,
-									                                   P,
-									                                   dir,
-									                                   visibility,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   curve_type,
-									                                   lcg_state,
-									                                   difl,
-									                                   extmax);
+									hit = cardinal_curve_intersect(kg,
+									                               isect,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               lcg_state,
+									                               difl,
+									                               extmax);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect,
-									                          P,
-									                          dir,
-									                          visibility,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          curve_type,
-									                          lcg_state,
-									                          difl,
-									                          extmax);
+									hit = curve_intersect(kg,
+									                      isect,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      lcg_state,
+									                      difl,
+									                      extmax);
 								}
 								if(hit) {
 									/* shadow ray early termination */
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
index 0fa8d4323c6..522213f30ca 100644
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
@@ -303,30 +303,30 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 							case PRIMITIVE_MOTION_CURVE: {
 								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect_array,
-									                                   P,
-									                                   dir,
-									                                   visibility,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   curve_type,
-									                                   NULL,
-									                                   0, 0);
+									hit = cardinal_curve_intersect(kg,
+									                               isect_array,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               NULL,
+									                               0, 0);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect_array,
-									                          P,
-									                          dir,
-									                          visibility,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          curve_type,
-									                          NULL,
-									                          0, 0);
+									hit = curve_intersect(kg,
+									                      isect_array,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      NULL,
+									                      0, 0);
 								}
 								break;
 							}
diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
index 8e0084e3914..335a4afd47a 100644
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -379,32 +379,32 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
 								bool hit;
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect,
-									                                   P,
-									                                   dir,
-									                                   visibility,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   curve_type,
-									                                   lcg_state,
-									                                   difl,
-									                                   extmax);
+									hit = cardinal_curve_intersect(kg,
+									                               isect,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               lcg_state,
+									                               difl,
+									                               extmax);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect,
-									                          P,
-									                          dir,
-									                          visibility,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          curve_type,
-									                          lcg_state,
-									                          difl,
-									                          extmax);
+									hit = curve_intersect(kg,
+									                      isect,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      lcg_state,
+									                      difl,
+									                      extmax);
 								}
 								if(hit) {
 									tfar = ssef(isect->t);
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 5c3b0ee3c15..29b357e33d6 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -228,11 +228,31 @@ ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
 
 #ifdef __KERNEL_SSE2__
 /* Pass P and dir by reference to aligned vector */
-ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
+ccl_device_curveintersect bool cardinal_curve_intersect(KernelGlobals *kg,
+                                                        Intersection *isect,
+                                                        const float3 &P,
+                                                        const float3 &dir,
+                                                        uint visibility,
+                                                        int object,
+                                                        int curveAddr,
+                                                        float time,
+                                                        int type,
+                                                        uint *lcg_state,
+                                                        float difl,
+                                                        float extmax)
 #else
-ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
+ccl_device_curveintersect bool cardinal_curve_intersect(KernelGlobals *kg,
+                                                        Intersection *isect,
+                                                        float3 P,
+                                                        float3 dir,
+                                                        uint visibility,
+                                                        int object,
+                                                        int curveAddr,
+                                                        float time,
+                                                        int type,
+                                                        uint *lcg_state,
+                                                        float difl,
+                                                        float extmax)
 #endif
 {
 	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
@@ -694,8 +714,18 @@ ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, I
 	return hit;
 }
 
-ccl_device_curveintersect bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
+ccl_device_curveintersect bool curve_intersect(KernelGlobals *kg,
+                                               Intersection *isect,
+                                               float3 P,
+                                               float3 direction,
+                                               uint visibility,
+                                               int object,
+                                               int curveAddr,
+                                               float time,
+                                               int type,
+                                               uint *lcg_state,
+                                               float difl,
+                                               float extmax)
 {
 	/* define few macros to minimize code duplication for SSE */
 #ifndef __KERNEL_SSE2__
@@ -963,7 +993,10 @@ ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, fl
 	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
 }
 
-ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
+ccl_device_inline float3 curve_refine(KernelGlobals *kg,
+                                      ShaderData *sd,
+                                      const Intersection *isect,
+                                      const Ray *ray)
 {
 	int flag = kernel_data.curve.curveflags;
 	float t = isect->t;
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index c66f52255f0..90b936d83c9 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -83,7 +83,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
 		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
 
 		sd->shader = __float_as_int(curvedata.z);
-		sd->P = bvh_curve_refine(kg, sd, isect, ray);
+		sd->P = curve_refine(kg, sd, isect, ray);
 	}
 	else
 #endif
-- 
cgit v1.2.3


From 77a7a7f455ae19d9535e1811ece7eb550c8f303b Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Mon, 7 Aug 2017 19:48:49 +0200
Subject: Cycles: Cleanup, trailign whitespace

---
 intern/cycles/kernel/geom/geom_curve.h | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 29b357e33d6..b3ee0343a1e 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -151,7 +151,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
 /* Curve tangent normal */
 
 ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
-{	
+{
 	float3 tgN = make_float3(0.0f,0.0f,0.0f);
 
 	if(sd->type & PRIMITIVE_ALL_CURVE) {
@@ -276,7 +276,7 @@ ccl_device_curveintersect bool cardinal_curve_intersect(KernelGlobals *kg,
 	ssef vdir = load4f(dir);
 	ssef vcurve_coef[4];
 	const float3 *curve_coef = (float3 *)vcurve_coef;
-	
+
 	{
 		ssef dtmp = vdir * vdir;
 		ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));
@@ -477,7 +477,7 @@ ccl_device_curveintersect bool cardinal_curve_intersect(KernelGlobals *kg,
 #else
 		float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];
 		float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];
-		
+
 		float bminx = min(p_st.x, p_en.x);
 		float bmaxx = max(p_st.x, p_en.x);
 		float bminy = min(p_st.y, p_en.y);
@@ -598,7 +598,7 @@ ccl_device_curveintersect bool cardinal_curve_intersect(KernelGlobals *kg,
 					else // inside
 						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
 				}
-				
+
 				if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
 					tree++;
 					level = tree & -tree;
@@ -646,7 +646,7 @@ ccl_device_curveintersect bool cardinal_curve_intersect(KernelGlobals *kg,
 					level = tree & -tree;
 					continue;
 				}
-				
+
 				float rootd = sqrtf(td);
 				float correction = (-tb - rootd) * 0.5f * invcyla;
 				t = tcentre + correction;
@@ -661,7 +661,7 @@ ccl_device_curveintersect bool cardinal_curve_intersect(KernelGlobals *kg,
 				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
 					correction = (-tb + rootd) * 0.5f * invcyla;
 					t = tcentre + correction;
-				}			
+				}
 
 				if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
 					tree++;
@@ -701,7 +701,7 @@ ccl_device_curveintersect bool cardinal_curve_intersect(KernelGlobals *kg,
 				isect->type = type;
 				hit = true;
 			}
-			
+
 			tree++;
 			level = tree & -tree;
 		}
@@ -791,7 +791,7 @@ ccl_device_curveintersect bool curve_intersect(KernelGlobals *kg,
 	float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
 #else
 	ssef P_curve[2];
-	
+
 	if(is_curve_primitive) {
 		P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
 		P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
@@ -1059,7 +1059,7 @@ ccl_device_inline float3 curve_refine(KernelGlobals *kg,
 		}
 		else {
 			/* direction from inside to surface of curve */
-			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);	
+			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
 			sd->Ng = normalize(P - p_curr);
 
 			/* adjustment for changing radius */
@@ -1089,7 +1089,7 @@ ccl_device_inline float3 curve_refine(KernelGlobals *kg,
 
 		float l = 1.0f;
 		tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l);
-		
+
 		P = P + D*t;
 
 		float3 dif = P - float4_to_float3(P_curve[0]);
@@ -1141,4 +1141,3 @@ ccl_device_inline float3 curve_refine(KernelGlobals *kg,
 #endif
 
 CCL_NAMESPACE_END
-
-- 
cgit v1.2.3


From 451ccf739611ebea82f4e6fb05c3640bd330f8b6 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Mon, 7 Aug 2017 19:53:12 +0200
Subject: Cycles: Cleanup, move curve intersection functions to own file

This way curve file becomes much shorter and it's also easier to write a
benchmark application to check performance before/after future changes.
---
 intern/cycles/kernel/CMakeLists.txt              |   1 +
 intern/cycles/kernel/geom/geom.h                 |   1 +
 intern/cycles/kernel/geom/geom_curve.h           | 934 +---------------------
 intern/cycles/kernel/geom/geom_curve_intersect.h | 948 +++++++++++++++++++++++
 4 files changed, 955 insertions(+), 929 deletions(-)
 create mode 100644 intern/cycles/kernel/geom/geom_curve_intersect.h

diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 88c4c4e3282..9fe61515570 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -202,6 +202,7 @@ set(SRC_GEOM_HEADERS
 	geom/geom.h
 	geom/geom_attribute.h
 	geom/geom_curve.h
+	geom/geom_curve_intersect.h
 	geom/geom_motion_curve.h
 	geom/geom_motion_triangle.h
 	geom/geom_motion_triangle_intersect.h
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index c623e3490fd..f34b77ebc07 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -27,6 +27,7 @@
 #include "kernel/geom/geom_motion_triangle_shader.h"
 #include "kernel/geom/geom_motion_curve.h"
 #include "kernel/geom/geom_curve.h"
+#include "kernel/geom/geom_curve_intersect.h"
 #include "kernel/geom/geom_volume.h"
 #include "kernel/geom/geom_primitive.h"
 
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index b3ee0343a1e..e35267f02bf 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -16,18 +16,13 @@ CCL_NAMESPACE_BEGIN
 
 /* Curve Primitive
  *
- * Curve primitive for rendering hair and fur. These can be render as flat ribbons
- * or curves with actual thickness. The curve can also be rendered as line segments
- * rather than curves for better performance */
+ * Curve primitive for rendering hair and fur. These can be render as flat
+ * ribbons or curves with actual thickness. The curve can also be rendered as
+ * line segments rather than curves for better performance.
+ */
 
 #ifdef __HAIR__
 
-#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300)
-#  define ccl_device_curveintersect ccl_device
-#else
-#  define ccl_device_curveintersect ccl_device_forceinline
-#endif
-
 /* Reading attributes on various curve elements */
 
 ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
@@ -219,925 +214,6 @@ ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta,
 	}
 }
 
-#ifdef __KERNEL_SSE2__
-ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
-{
-	return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2]));
-}
-#endif
-
-#ifdef __KERNEL_SSE2__
-/* Pass P and dir by reference to aligned vector */
-ccl_device_curveintersect bool cardinal_curve_intersect(KernelGlobals *kg,
-                                                        Intersection *isect,
-                                                        const float3 &P,
-                                                        const float3 &dir,
-                                                        uint visibility,
-                                                        int object,
-                                                        int curveAddr,
-                                                        float time,
-                                                        int type,
-                                                        uint *lcg_state,
-                                                        float difl,
-                                                        float extmax)
-#else
-ccl_device_curveintersect bool cardinal_curve_intersect(KernelGlobals *kg,
-                                                        Intersection *isect,
-                                                        float3 P,
-                                                        float3 dir,
-                                                        uint visibility,
-                                                        int object,
-                                                        int curveAddr,
-                                                        float time,
-                                                        int type,
-                                                        uint *lcg_state,
-                                                        float difl,
-                                                        float extmax)
-#endif
-{
-	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
-
-	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
-		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
-		if(time < prim_time.x || time > prim_time.y) {
-			return false;
-		}
-	}
-
-	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
-	float epsilon = 0.0f;
-	float r_st, r_en;
-
-	int depth = kernel_data.curve.subdivisions;
-	int flags = kernel_data.curve.curveflags;
-	int prim = kernel_tex_fetch(__prim_index, curveAddr);
-
-#ifdef __KERNEL_SSE2__
-	ssef vdir = load4f(dir);
-	ssef vcurve_coef[4];
-	const float3 *curve_coef = (float3 *)vcurve_coef;
-
-	{
-		ssef dtmp = vdir * vdir;
-		ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));
-		ssef rd_ss = load1f_first(1.0f) / d_ss;
-
-		ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]);
-		int2 &v00 = (int2 &)v00vec;
-
-		int k0 = v00.x + segment;
-		int k1 = k0 + 1;
-		int ka = max(k0 - 1, v00.x);
-		int kb = min(k1 + 1, v00.x + v00.y - 1);
-
-#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
-		avxf P_curve_0_1, P_curve_2_3;
-		if(is_curve_primitive) {
-			P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);
-			P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);
-		}
-		else {
-			int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
-			motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3);
-		}
-#else  /* __KERNEL_AVX2__ */
-		ssef P_curve[4];
-
-		if(is_curve_primitive) {
-			P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
-			P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
-			P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
-			P_curve[3] = load4f(&kg->__curve_keys.data[kb].x);
-		}
-		else {
-			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
-		}
-#endif  /* __KERNEL_AVX2__ */
-
-		ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
-		ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
-		ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy;
-		ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
-		ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
-
-		ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
-		ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
-		ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
-
-#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
-		const avxf vPP = _mm256_broadcast_ps(&P.m128);
-		const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
-		const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
-		const avxf htfm22 = avxf(htfm2.m128, htfm2.m128);
-
-		const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP),
-		                      htfm00,
-		                      madd(shuffle<1>(P_curve_0_1 - vPP),
-		                           htfm11,
-		                           shuffle<2>(P_curve_0_1 - vPP) * htfm22));
-		const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP),
-		                      htfm00,
-		                      madd(shuffle<1>(P_curve_2_3 - vPP),
-		                           htfm11,
-		                           shuffle<2>(P_curve_2_3 - vPP)*htfm22));
-
-		const ssef p0 = _mm256_castps256_ps128(p01);
-		const ssef p1 = _mm256_extractf128_ps(p01, 1);
-		const ssef p2 = _mm256_castps256_ps128(p23);
-		const ssef p3 = _mm256_extractf128_ps(p23, 1);
-
-		const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1);
-		r_st = ((float4 &)P_curve_1).w;
-		const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3);
-		r_en = ((float4 &)P_curve_2).w;
-#else  /* __KERNEL_AVX2__ */
-		ssef htfm[] = { htfm0, htfm1, htfm2 };
-		ssef vP = load4f(P);
-		ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
-		ssef p1 = transform_point_T3(htfm, P_curve[1] - vP);
-		ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
-		ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
-
-		r_st = ((float4 &)P_curve[1]).w;
-		r_en = ((float4 &)P_curve[2]).w;
-#endif  /* __KERNEL_AVX2__ */
-
-		float fc = 0.71f;
-		ssef vfc = ssef(fc);
-		ssef vfcxp3 = vfc * p3;
-
-		vcurve_coef[0] = p1;
-		vcurve_coef[1] = vfc * (p2 - p0);
-		vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
-		vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
-
-	}
-#else
-	float3 curve_coef[4];
-
-	/* curve Intersection check */
-	/* obtain curve parameters */
-	{
-		/* ray transform created - this should be created at beginning of intersection loop */
-		Transform htfm;
-		float d = sqrtf(dir.x * dir.x + dir.z * dir.z);
-		htfm = make_transform(
-			dir.z / d, 0, -dir.x /d, 0,
-			-dir.x * dir.y /d, d, -dir.y * dir.z /d, 0,
-			dir.x, dir.y, dir.z, 0,
-			0, 0, 0, 1);
-
-		float4 v00 = kernel_tex_fetch(__curves, prim);
-
-		int k0 = __float_as_int(v00.x) + segment;
-		int k1 = k0 + 1;
-
-		int ka = max(k0 - 1,__float_as_int(v00.x));
-		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
-		float4 P_curve[4];
-
-		if(is_curve_primitive) {
-			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
-			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
-			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
-			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
-		}
-		else {
-			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve);
-		}
-
-		float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P);
-		float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P);
-		float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P);
-		float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P);
-
-		float fc = 0.71f;
-		curve_coef[0] = p1;
-		curve_coef[1] = -fc*p0 + fc*p2;
-		curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
-		curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
-		r_st = P_curve[1].w;
-		r_en = P_curve[2].w;
-	}
-#endif
-
-	float r_curr = max(r_st, r_en);
-
-	if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
-		epsilon = 2 * r_curr;
-
-	/* find bounds - this is slow for cubic curves */
-	float upper, lower;
-
-	float zextrem[4];
-	curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
-	if(lower - r_curr > isect->t || upper + r_curr < epsilon)
-		return false;
-
-	/* minimum width extension */
-	float mw_extension = min(difl * fabsf(upper), extmax);
-	float r_ext = mw_extension + r_curr;
-
-	float xextrem[4];
-	curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
-	if(lower > r_ext || upper < -r_ext)
-		return false;
-
-	float yextrem[4];
-	curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
-	if(lower > r_ext || upper < -r_ext)
-		return false;
-
-	/* setup recurrent loop */
-	int level = 1 << depth;
-	int tree = 0;
-	float resol = 1.0f / (float)level;
-	bool hit = false;
-
-	/* begin loop */
-	while(!(tree >> (depth))) {
-		const float i_st = tree * resol;
-		const float i_en = i_st + (level * resol);
-
-#ifdef __KERNEL_SSE2__
-		ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
-		ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
-		ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
-
-		ssef vbmin = min(vp_st, vp_en);
-		ssef vbmax = max(vp_st, vp_en);
-
-		float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
-		float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
-		float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
-		float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
-#else
-		float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];
-		float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];
-
-		float bminx = min(p_st.x, p_en.x);
-		float bmaxx = max(p_st.x, p_en.x);
-		float bminy = min(p_st.y, p_en.y);
-		float bmaxy = max(p_st.y, p_en.y);
-		float bminz = min(p_st.z, p_en.z);
-		float bmaxz = max(p_st.z, p_en.z);
-#endif
-
-		if(xextrem[0] >= i_st && xextrem[0] <= i_en) {
-			bminx = min(bminx,xextrem[1]);
-			bmaxx = max(bmaxx,xextrem[1]);
-		}
-		if(xextrem[2] >= i_st && xextrem[2] <= i_en) {
-			bminx = min(bminx,xextrem[3]);
-			bmaxx = max(bmaxx,xextrem[3]);
-		}
-		if(yextrem[0] >= i_st && yextrem[0] <= i_en) {
-			bminy = min(bminy,yextrem[1]);
-			bmaxy = max(bmaxy,yextrem[1]);
-		}
-		if(yextrem[2] >= i_st && yextrem[2] <= i_en) {
-			bminy = min(bminy,yextrem[3]);
-			bmaxy = max(bmaxy,yextrem[3]);
-		}
-		if(zextrem[0] >= i_st && zextrem[0] <= i_en) {
-			bminz = min(bminz,zextrem[1]);
-			bmaxz = max(bmaxz,zextrem[1]);
-		}
-		if(zextrem[2] >= i_st && zextrem[2] <= i_en) {
-			bminz = min(bminz,zextrem[3]);
-			bmaxz = max(bmaxz,zextrem[3]);
-		}
-
-		float r1 = r_st + (r_en - r_st) * i_st;
-		float r2 = r_st + (r_en - r_st) * i_en;
-		r_curr = max(r1, r2);
-
-		mw_extension = min(difl * fabsf(bmaxz), extmax);
-		float r_ext = mw_extension + r_curr;
-		float coverage = 1.0f;
-
-		if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
-			/* the bounding box does not overlap the square centered at O */
-			tree += level;
-			level = tree & -tree;
-		}
-		else if(level == 1) {
-
-			/* the maximum recursion depth is reached.
-			 * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
-			 * dP* is reversed if necessary.*/
-			float t = isect->t;
-			float u = 0.0f;
-			float gd = 0.0f;
-
-			if(flags & CURVE_KN_RIBBONS) {
-				float3 tg = (p_en - p_st);
-#ifdef __KERNEL_SSE__
-				const float3 tg_sq = tg * tg;
-				float w = tg_sq.x + tg_sq.y;
-#else
-				float w = tg.x * tg.x + tg.y * tg.y;
-#endif
-				if(w == 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-#ifdef __KERNEL_SSE__
-				const float3 p_sttg = p_st * tg;
-				w = -(p_sttg.x + p_sttg.y) / w;
-#else
-				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
-#endif
-				w = saturate(w);
-
-				/* compute u on the curve segment */
-				u = i_st * (1 - w) + i_en * w;
-				r_curr = r_st + (r_en - r_st) * u;
-				/* compare x-y distances */
-				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
-
-				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if(dot(tg, dp_st)< 0)
-					dp_st *= -1;
-				if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if(dot(tg, dp_en) < 0)
-					dp_en *= -1;
-				if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				/* compute coverage */
-				float r_ext = r_curr;
-				coverage = 1.0f;
-				if(difl != 0.0f) {
-					mw_extension = min(difl * fabsf(bmaxz), extmax);
-					r_ext = mw_extension + r_curr;
-#ifdef __KERNEL_SSE__
-					const float3 p_curr_sq = p_curr * p_curr;
-					const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)));
-					float d = dxxx.x;
-#else
-					float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
-#endif
-					float d0 = d - r_curr;
-					float d1 = d + r_curr;
-					float inv_mw_extension = 1.0f/mw_extension;
-					if(d0 >= 0)
-						coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
-					else // inside
-						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
-				}
-
-				if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				t = p_curr.z;
-
-				/* stochastic fade from minimum width */
-				if(difl != 0.0f && lcg_state) {
-					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
-						return hit;
-				}
-			}
-			else {
-				float l = len(p_en - p_st);
-				/* minimum width extension */
-				float or1 = r1;
-				float or2 = r2;
-
-				if(difl != 0.0f) {
-					mw_extension = min(len(p_st - P) * difl, extmax);
-					or1 = r1 < mw_extension ? mw_extension : r1;
-					mw_extension = min(len(p_en - P) * difl, extmax);
-					or2 = r2 < mw_extension ? mw_extension : r2;
-				}
-				/* --- */
-				float invl = 1.0f/l;
-				float3 tg = (p_en - p_st) * invl;
-				gd = (or2 - or1) * invl;
-				float difz = -dot(p_st,tg);
-				float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd));
-				float invcyla = 1.0f/cyla;
-				float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1)));
-				float tcentre = -halfb*invcyla;
-				float zcentre = difz + (tg.z * tcentre);
-				float3 tdif = - p_st;
-				tdif.z += tcentre;
-				float tdifz = dot(tdif,tg);
-				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
-				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
-				float td = tb*tb - 4*cyla*tc;
-				if(td < 0.0f) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				float rootd = sqrtf(td);
-				float correction = (-tb - rootd) * 0.5f * invcyla;
-				t = tcentre + correction;
-
-				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if(dot(tg, dp_st)< 0)
-					dp_st *= -1;
-				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if(dot(tg, dp_en) < 0)
-					dp_en *= -1;
-
-				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
-					correction = (-tb + rootd) * 0.5f * invcyla;
-					t = tcentre + correction;
-				}
-
-				if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				float w = (zcentre + (tg.z * correction)) * invl;
-				w = saturate(w);
-				/* compute u on the curve segment */
-				u = i_st * (1 - w) + i_en * w;
-
-				/* stochastic fade from minimum width */
-				if(difl != 0.0f && lcg_state) {
-					r_curr = r1 + (r2 - r1) * w;
-					r_ext = or1 + (or2 - or1) * w;
-					coverage = r_curr/r_ext;
-
-					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
-						return hit;
-				}
-			}
-			/* we found a new intersection */
-
-#ifdef __VISIBILITY_FLAG__
-			/* visibility flag test. we do it here under the assumption
-			 * that most triangles are culled by node flags */
-			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#endif
-			{
-				/* record intersection */
-				isect->t = t;
-				isect->u = u;
-				isect->v = gd;
-				isect->prim = curveAddr;
-				isect->object = object;
-				isect->type = type;
-				hit = true;
-			}
-
-			tree++;
-			level = tree & -tree;
-		}
-		else {
-			/* split the curve into two curves and process */
-			level = level >> 1;
-		}
-	}
-
-	return hit;
-}
-
-ccl_device_curveintersect bool curve_intersect(KernelGlobals *kg,
-                                               Intersection *isect,
-                                               float3 P,
-                                               float3 direction,
-                                               uint visibility,
-                                               int object,
-                                               int curveAddr,
-                                               float time,
-                                               int type,
-                                               uint *lcg_state,
-                                               float difl,
-                                               float extmax)
-{
-	/* define few macros to minimize code duplication for SSE */
-#ifndef __KERNEL_SSE2__
-#  define len3_squared(x) len_squared(x)
-#  define len3(x) len(x)
-#  define dot3(x, y) dot(x, y)
-#endif
-
-	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
-
-	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
-		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
-		if(time < prim_time.x || time > prim_time.y) {
-			return false;
-		}
-	}
-
-	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
-	/* curve Intersection check */
-	int flags = kernel_data.curve.curveflags;
-
-	int prim = kernel_tex_fetch(__prim_index, curveAddr);
-	float4 v00 = kernel_tex_fetch(__curves, prim);
-
-	int cnum = __float_as_int(v00.x);
-	int k0 = cnum + segment;
-	int k1 = k0 + 1;
-
-#ifndef __KERNEL_SSE2__
-	float4 P_curve[2];
-
-	if(is_curve_primitive) {
-		P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
-		P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
-	}
-	else {
-		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-		motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve);
-	}
-
-	float or1 = P_curve[0].w;
-	float or2 = P_curve[1].w;
-	float3 p1 = float4_to_float3(P_curve[0]);
-	float3 p2 = float4_to_float3(P_curve[1]);
-
-	/* minimum width extension */
-	float r1 = or1;
-	float r2 = or2;
-	float3 dif = P - p1;
-	float3 dif_second = P - p2;
-	if(difl != 0.0f) {
-		float pixelsize = min(len3(dif) * difl, extmax);
-		r1 = or1 < pixelsize ? pixelsize : or1;
-		pixelsize = min(len3(dif_second) * difl, extmax);
-		r2 = or2 < pixelsize ? pixelsize : or2;
-	}
-	/* --- */
-
-	float3 p21_diff = p2 - p1;
-	float3 sphere_dif1 = (dif + dif_second) * 0.5f;
-	float3 dir = direction;
-	float sphere_b_tmp = dot3(dir, sphere_dif1);
-	float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
-#else
-	ssef P_curve[2];
-
-	if(is_curve_primitive) {
-		P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
-		P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
-	}
-	else {
-		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-		motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve);
-	}
-
-	const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
-
-	ssef r12 = or12;
-	const ssef vP = load4f(P);
-	const ssef dif = vP - P_curve[0];
-	const ssef dif_second = vP - P_curve[1];
-	if(difl != 0.0f) {
-		const ssef len1_sq = len3_squared_splat(dif);
-		const ssef len2_sq = len3_squared_splat(dif_second);
-		const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
-		const ssef pixelsize12 = min(len12 * difl, ssef(extmax));
-		r12 = max(or12, pixelsize12);
-	}
-	float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12));
-	float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12));
-
-	const ssef p21_diff = P_curve[1] - P_curve[0];
-	const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
-	const ssef dir = load4f(direction);
-	const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
-	const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1);
-#endif
-
-	float mr = max(r1, r2);
-	float l = len3(p21_diff);
-	float invl = 1.0f / l;
-	float sp_r = mr + 0.5f * l;
-
-	float sphere_b = dot3(dir, sphere_dif2);
-	float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
-
-	if(sdisc < 0.0f)
-		return false;
-
-	/* obtain parameters and test midpoint distance for suitable modes */
-#ifndef __KERNEL_SSE2__
-	float3 tg = p21_diff * invl;
-#else
-	const ssef tg = p21_diff * invl;
-#endif
-	float gd = (r2 - r1) * invl;
-
-	float dirz = dot3(dir, tg);
-	float difz = dot3(dif, tg);
-
-	float a = 1.0f - (dirz*dirz*(1 + gd*gd));
-
-	float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1));
-
-	float tcentre = -halfb/a;
-	float zcentre = difz + (dirz * tcentre);
-
-	if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
-		return false;
-	if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
-		return false;
-
-	/* test minimum separation */
-#ifndef __KERNEL_SSE2__
-	float3 cprod = cross(tg, dir);
-	float cprod2sq = len3_squared(cross(tg, dif));
-#else
-	const ssef cprod = cross(tg, dir);
-	float cprod2sq = len3_squared(cross_zxy(tg, dif));
-#endif
-	float cprodsq = len3_squared(cprod);
-	float distscaled = dot3(cprod, dif);
-
-	if(cprodsq == 0)
-		distscaled = cprod2sq;
-	else
-		distscaled = (distscaled*distscaled)/cprodsq;
-
-	if(distscaled > mr*mr)
-		return false;
-
-	/* calculate true intersection */
-#ifndef __KERNEL_SSE2__
-	float3 tdif = dif + tcentre * dir;
-#else
-	const ssef tdif = madd(ssef(tcentre), dir, dif);
-#endif
-	float tdifz = dot3(tdif, tg);
-	float tdifma = tdifz*gd + r1;
-	float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma));
-	float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
-	float td = tb*tb - 4*a*tc;
-
-	if(td < 0.0f)
-		return false;
-
-	float rootd = 0.0f;
-	float correction = 0.0f;
-	if(flags & CURVE_KN_ACCURATE) {
-		rootd = sqrtf(td);
-		correction = ((-tb - rootd)/(2*a));
-	}
-
-	float t = tcentre + correction;
-
-	if(t < isect->t) {
-
-		if(flags & CURVE_KN_INTERSECTCORRECTION) {
-			rootd = sqrtf(td);
-			correction = ((-tb - rootd)/(2*a));
-			t = tcentre + correction;
-		}
-
-		float z = zcentre + (dirz * correction);
-		// bool backface = false;
-
-		if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
-			// backface = true;
-			correction = ((-tb + rootd)/(2*a));
-			t = tcentre + correction;
-			z = zcentre + (dirz * correction);
-		}
-
-		/* stochastic fade from minimum width */
-		float adjradius = or1 + z * (or2 - or1) * invl;
-		adjradius = adjradius / (r1 + z * gd);
-		if(lcg_state && adjradius != 1.0f) {
-			if(lcg_step_float(lcg_state) > adjradius)
-				return false;
-		}
-		/* --- */
-
-		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
-
-			if(flags & CURVE_KN_ENCLOSEFILTER) {
-				float enc_ratio = 1.01f;
-				if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
-					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
-					float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
-					if(a2*c2 < 0.0f)
-						return false;
-				}
-			}
-
-#ifdef __VISIBILITY_FLAG__
-			/* visibility flag test. we do it here under the assumption
-			 * that most triangles are culled by node flags */
-			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#endif
-			{
-				/* record intersection */
-				isect->t = t;
-				isect->u = z*invl;
-				isect->v = gd;
-				isect->prim = curveAddr;
-				isect->object = object;
-				isect->type = type;
-
-				return true;
-			}
-		}
-	}
-
-	return false;
-
-#ifndef __KERNEL_SSE2__
-#  undef len3_squared
-#  undef len3
-#  undef dot3
-#endif
-}
-
-ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
-	float fc = 0.71f;
-	float data[4];
-	float t2 = t * t;
-	data[0] = -3.0f * fc          * t2  + 4.0f * fc * t                  - fc;
-	data[1] =  3.0f * (2.0f - fc) * t2  + 2.0f * (fc - 3.0f) * t;
-	data[2] =  3.0f * (fc - 2.0f) * t2  + 2.0f * (3.0f - 2.0f * fc) * t  + fc;
-	data[3] =  3.0f * fc          * t2  - 2.0f * fc * t;
-	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
-	float data[4];
-	float fc = 0.71f;
-	float t2 = t * t;
-	float t3 = t2 * t;
-	data[0] = -fc          * t3  + 2.0f * fc          * t2 - fc * t;
-	data[1] =  (2.0f - fc) * t3  + (fc - 3.0f)        * t2 + 1.0f;
-	data[2] =  (fc - 2.0f) * t3  + (3.0f - 2.0f * fc) * t2 + fc * t;
-	data[3] =  fc          * t3  - fc * t2;
-	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 curve_refine(KernelGlobals *kg,
-                                      ShaderData *sd,
-                                      const Intersection *isect,
-                                      const Ray *ray)
-{
-	int flag = kernel_data.curve.curveflags;
-	float t = isect->t;
-	float3 P = ray->P;
-	float3 D = ray->D;
-
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D*t);
-		D = normalize_len(D, &t);
-	}
-
-	int prim = kernel_tex_fetch(__prim_index, isect->prim);
-	float4 v00 = kernel_tex_fetch(__curves, prim);
-
-	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
-	int k1 = k0 + 1;
-
-	float3 tg;
-
-	if(flag & CURVE_KN_INTERPOLATE) {
-		int ka = max(k0 - 1,__float_as_int(v00.x));
-		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
-		float4 P_curve[4];
-
-		if(sd->type & PRIMITIVE_CURVE) {
-			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
-			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
-			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
-			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
-		}
-		else {
-			motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
-		}
-
-		float3 p[4];
-		p[0] = float4_to_float3(P_curve[0]);
-		p[1] = float4_to_float3(P_curve[1]);
-		p[2] = float4_to_float3(P_curve[2]);
-		p[3] = float4_to_float3(P_curve[3]);
-
-		P = P + D*t;
-
-#ifdef __UV__
-		sd->u = isect->u;
-		sd->v = 0.0f;
-#endif
-
-		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
-
-		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
-			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
-		}
-		else {
-			/* direction from inside to surface of curve */
-			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
-			sd->Ng = normalize(P - p_curr);
-
-			/* adjustment for changing radius */
-			float gd = isect->v;
-
-			if(gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg;
-				sd->Ng = normalize(sd->Ng);
-			}
-		}
-
-		/* todo: sometimes the normal is still so that this is detected as
-		 * backfacing even if cull backfaces is enabled */
-
-		sd->N = sd->Ng;
-	}
-	else {
-		float4 P_curve[2];
-
-		if(sd->type & PRIMITIVE_CURVE) {
-			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
-			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
-		}
-		else {
-			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
-		}
-
-		float l = 1.0f;
-		tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l);
-
-		P = P + D*t;
-
-		float3 dif = P - float4_to_float3(P_curve[0]);
-
-#ifdef __UV__
-		sd->u = dot(dif,tg)/l;
-		sd->v = 0.0f;
-#endif
-
-		if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
-			sd->Ng = -(D - tg * dot(tg, D));
-			sd->Ng = normalize(sd->Ng);
-		}
-		else {
-			float gd = isect->v;
-
-			/* direction from inside to surface of curve */
-			sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
-
-			/* adjustment for changing radius */
-			if(gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg;
-				sd->Ng = normalize(sd->Ng);
-			}
-		}
-
-		sd->N = sd->Ng;
-	}
-
-#ifdef __DPDU__
-	/* dPdu/dPdv */
-	sd->dPdu = tg;
-	sd->dPdv = cross(tg, sd->Ng);
-#endif
-
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-}
-
-#endif
+#endif  /* __HAIR__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
new file mode 100644
index 00000000000..8378e002506
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -0,0 +1,948 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Curve primitive intersection functions. */
+
+#ifdef __HAIR__
+
+#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300)
+#  define ccl_device_curveintersect ccl_device
+#else
+#  define ccl_device_curveintersect ccl_device_forceinline
+#endif
+
+#ifdef __KERNEL_SSE2__
+ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
+{
+	return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2]));
+}
+#endif
+
+#ifdef __KERNEL_SSE2__
+/* Pass P and dir by reference to aligned vector */
+ccl_device_curveintersect bool cardinal_curve_intersect(KernelGlobals *kg,
+                                                        Intersection *isect,
+                                                        const float3 &P,
+                                                        const float3 &dir,
+                                                        uint visibility,
+                                                        int object,
+                                                        int curveAddr,
+                                                        float time,
+                                                        int type,
+                                                        uint *lcg_state,
+                                                        float difl,
+                                                        float extmax)
+#else
+ccl_device_curveintersect bool cardinal_curve_intersect(KernelGlobals *kg,
+                                                        Intersection *isect,
+                                                        float3 P,
+                                                        float3 dir,
+                                                        uint visibility,
+                                                        int object,
+                                                        int curveAddr,
+                                                        float time,
+                                                        int type,
+                                                        uint *lcg_state,
+                                                        float difl,
+                                                        float extmax)
+#endif
+{
+	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+
+	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
+		if(time < prim_time.x || time > prim_time.y) {
+			return false;
+		}
+	}
+
+	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
+	float epsilon = 0.0f;
+	float r_st, r_en;
+
+	int depth = kernel_data.curve.subdivisions;
+	int flags = kernel_data.curve.curveflags;
+	int prim = kernel_tex_fetch(__prim_index, curveAddr);
+
+#ifdef __KERNEL_SSE2__
+	ssef vdir = load4f(dir);
+	ssef vcurve_coef[4];
+	const float3 *curve_coef = (float3 *)vcurve_coef;
+
+	{
+		ssef dtmp = vdir * vdir;
+		ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));
+		ssef rd_ss = load1f_first(1.0f) / d_ss;
+
+		ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]);
+		int2 &v00 = (int2 &)v00vec;
+
+		int k0 = v00.x + segment;
+		int k1 = k0 + 1;
+		int ka = max(k0 - 1, v00.x);
+		int kb = min(k1 + 1, v00.x + v00.y - 1);
+
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
+		avxf P_curve_0_1, P_curve_2_3;
+		if(is_curve_primitive) {
+			P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);
+			P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);
+		}
+		else {
+			int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
+			motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3);
+		}
+#else  /* __KERNEL_AVX2__ */
+		ssef P_curve[4];
+
+		if(is_curve_primitive) {
+			P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
+			P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
+			P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
+			P_curve[3] = load4f(&kg->__curve_keys.data[kb].x);
+		}
+		else {
+			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
+		}
+#endif  /* __KERNEL_AVX2__ */
+
+		ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
+		ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
+		ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy;
+		ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
+		ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
+
+		ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
+		ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
+		ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
+
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
+		const avxf vPP = _mm256_broadcast_ps(&P.m128);
+		const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
+		const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
+		const avxf htfm22 = avxf(htfm2.m128, htfm2.m128);
+
+		const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP),
+		                      htfm00,
+		                      madd(shuffle<1>(P_curve_0_1 - vPP),
+		                           htfm11,
+		                           shuffle<2>(P_curve_0_1 - vPP) * htfm22));
+		const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP),
+		                      htfm00,
+		                      madd(shuffle<1>(P_curve_2_3 - vPP),
+		                           htfm11,
+		                           shuffle<2>(P_curve_2_3 - vPP)*htfm22));
+
+		const ssef p0 = _mm256_castps256_ps128(p01);
+		const ssef p1 = _mm256_extractf128_ps(p01, 1);
+		const ssef p2 = _mm256_castps256_ps128(p23);
+		const ssef p3 = _mm256_extractf128_ps(p23, 1);
+
+		const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1);
+		r_st = ((float4 &)P_curve_1).w;
+		const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3);
+		r_en = ((float4 &)P_curve_2).w;
+#else  /* __KERNEL_AVX2__ */
+		ssef htfm[] = { htfm0, htfm1, htfm2 };
+		ssef vP = load4f(P);
+		ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
+		ssef p1 = transform_point_T3(htfm, P_curve[1] - vP);
+		ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
+		ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
+
+		r_st = ((float4 &)P_curve[1]).w;
+		r_en = ((float4 &)P_curve[2]).w;
+#endif  /* __KERNEL_AVX2__ */
+
+		float fc = 0.71f;
+		ssef vfc = ssef(fc);
+		ssef vfcxp3 = vfc * p3;
+
+		vcurve_coef[0] = p1;
+		vcurve_coef[1] = vfc * (p2 - p0);
+		vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
+		vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
+
+	}
+#else
+	float3 curve_coef[4];
+
+	/* curve Intersection check */
+	/* obtain curve parameters */
+	{
+		/* ray transform created - this should be created at beginning of intersection loop */
+		Transform htfm;
+		float d = sqrtf(dir.x * dir.x + dir.z * dir.z);
+		htfm = make_transform(
+			dir.z / d, 0, -dir.x /d, 0,
+			-dir.x * dir.y /d, d, -dir.y * dir.z /d, 0,
+			dir.x, dir.y, dir.z, 0,
+			0, 0, 0, 1);
+
+		float4 v00 = kernel_tex_fetch(__curves, prim);
+
+		int k0 = __float_as_int(v00.x) + segment;
+		int k1 = k0 + 1;
+
+		int ka = max(k0 - 1,__float_as_int(v00.x));
+		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
+
+		float4 P_curve[4];
+
+		if(is_curve_primitive) {
+			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
+			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
+			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
+			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+		}
+		else {
+			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve);
+		}
+
+		float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P);
+		float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P);
+		float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P);
+		float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P);
+
+		float fc = 0.71f;
+		curve_coef[0] = p1;
+		curve_coef[1] = -fc*p0 + fc*p2;
+		curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
+		curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
+		r_st = P_curve[1].w;
+		r_en = P_curve[2].w;
+	}
+#endif
+
+	float r_curr = max(r_st, r_en);
+
+	if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
+		epsilon = 2 * r_curr;
+
+	/* find bounds - this is slow for cubic curves */
+	float upper, lower;
+
+	float zextrem[4];
+	curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
+	if(lower - r_curr > isect->t || upper + r_curr < epsilon)
+		return false;
+
+	/* minimum width extension */
+	float mw_extension = min(difl * fabsf(upper), extmax);
+	float r_ext = mw_extension + r_curr;
+
+	float xextrem[4];
+	curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
+	if(lower > r_ext || upper < -r_ext)
+		return false;
+
+	float yextrem[4];
+	curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
+	if(lower > r_ext || upper < -r_ext)
+		return false;
+
+	/* setup recurrent loop */
+	int level = 1 << depth;
+	int tree = 0;
+	float resol = 1.0f / (float)level;
+	bool hit = false;
+
+	/* begin loop */
+	while(!(tree >> (depth))) {
+		const float i_st = tree * resol;
+		const float i_en = i_st + (level * resol);
+
+#ifdef __KERNEL_SSE2__
+		ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
+		ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
+		ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
+
+		ssef vbmin = min(vp_st, vp_en);
+		ssef vbmax = max(vp_st, vp_en);
+
+		float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
+		float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
+		float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
+		float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
+#else
+		float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];
+		float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];
+
+		float bminx = min(p_st.x, p_en.x);
+		float bmaxx = max(p_st.x, p_en.x);
+		float bminy = min(p_st.y, p_en.y);
+		float bmaxy = max(p_st.y, p_en.y);
+		float bminz = min(p_st.z, p_en.z);
+		float bmaxz = max(p_st.z, p_en.z);
+#endif
+
+		if(xextrem[0] >= i_st && xextrem[0] <= i_en) {
+			bminx = min(bminx,xextrem[1]);
+			bmaxx = max(bmaxx,xextrem[1]);
+		}
+		if(xextrem[2] >= i_st && xextrem[2] <= i_en) {
+			bminx = min(bminx,xextrem[3]);
+			bmaxx = max(bmaxx,xextrem[3]);
+		}
+		if(yextrem[0] >= i_st && yextrem[0] <= i_en) {
+			bminy = min(bminy,yextrem[1]);
+			bmaxy = max(bmaxy,yextrem[1]);
+		}
+		if(yextrem[2] >= i_st && yextrem[2] <= i_en) {
+			bminy = min(bminy,yextrem[3]);
+			bmaxy = max(bmaxy,yextrem[3]);
+		}
+		if(zextrem[0] >= i_st && zextrem[0] <= i_en) {
+			bminz = min(bminz,zextrem[1]);
+			bmaxz = max(bmaxz,zextrem[1]);
+		}
+		if(zextrem[2] >= i_st && zextrem[2] <= i_en) {
+			bminz = min(bminz,zextrem[3]);
+			bmaxz = max(bmaxz,zextrem[3]);
+		}
+
+		float r1 = r_st + (r_en - r_st) * i_st;
+		float r2 = r_st + (r_en - r_st) * i_en;
+		r_curr = max(r1, r2);
+
+		mw_extension = min(difl * fabsf(bmaxz), extmax);
+		float r_ext = mw_extension + r_curr;
+		float coverage = 1.0f;
+
+		if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
+			/* the bounding box does not overlap the square centered at O */
+			tree += level;
+			level = tree & -tree;
+		}
+		else if(level == 1) {
+
+			/* the maximum recursion depth is reached.
+			 * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
+			 * dP* is reversed if necessary.*/
+			float t = isect->t;
+			float u = 0.0f;
+			float gd = 0.0f;
+
+			if(flags & CURVE_KN_RIBBONS) {
+				float3 tg = (p_en - p_st);
+#ifdef __KERNEL_SSE__
+				const float3 tg_sq = tg * tg;
+				float w = tg_sq.x + tg_sq.y;
+#else
+				float w = tg.x * tg.x + tg.y * tg.y;
+#endif
+				if(w == 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+#ifdef __KERNEL_SSE__
+				const float3 p_sttg = p_st * tg;
+				w = -(p_sttg.x + p_sttg.y) / w;
+#else
+				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
+#endif
+				w = saturate(w);
+
+				/* compute u on the curve segment */
+				u = i_st * (1 - w) + i_en * w;
+				r_curr = r_st + (r_en - r_st) * u;
+				/* compare x-y distances */
+				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
+
+				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
+				if(dot(tg, dp_st)< 0)
+					dp_st *= -1;
+				if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
+				if(dot(tg, dp_en) < 0)
+					dp_en *= -1;
+				if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				/* compute coverage */
+				float r_ext = r_curr;
+				coverage = 1.0f;
+				if(difl != 0.0f) {
+					mw_extension = min(difl * fabsf(bmaxz), extmax);
+					r_ext = mw_extension + r_curr;
+#ifdef __KERNEL_SSE__
+					const float3 p_curr_sq = p_curr * p_curr;
+					const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)));
+					float d = dxxx.x;
+#else
+					float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
+#endif
+					float d0 = d - r_curr;
+					float d1 = d + r_curr;
+					float inv_mw_extension = 1.0f/mw_extension;
+					if(d0 >= 0)
+						coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
+					else // inside
+						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
+				}
+
+				if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				t = p_curr.z;
+
+				/* stochastic fade from minimum width */
+				if(difl != 0.0f && lcg_state) {
+					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
+						return hit;
+				}
+			}
+			else {
+				float l = len(p_en - p_st);
+				/* minimum width extension */
+				float or1 = r1;
+				float or2 = r2;
+
+				if(difl != 0.0f) {
+					mw_extension = min(len(p_st - P) * difl, extmax);
+					or1 = r1 < mw_extension ? mw_extension : r1;
+					mw_extension = min(len(p_en - P) * difl, extmax);
+					or2 = r2 < mw_extension ? mw_extension : r2;
+				}
+				/* --- */
+				float invl = 1.0f/l;
+				float3 tg = (p_en - p_st) * invl;
+				gd = (or2 - or1) * invl;
+				float difz = -dot(p_st,tg);
+				float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd));
+				float invcyla = 1.0f/cyla;
+				float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1)));
+				float tcentre = -halfb*invcyla;
+				float zcentre = difz + (tg.z * tcentre);
+				float3 tdif = - p_st;
+				tdif.z += tcentre;
+				float tdifz = dot(tdif,tg);
+				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
+				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
+				float td = tb*tb - 4*cyla*tc;
+				if(td < 0.0f) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				float rootd = sqrtf(td);
+				float correction = (-tb - rootd) * 0.5f * invcyla;
+				t = tcentre + correction;
+
+				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
+				if(dot(tg, dp_st)< 0)
+					dp_st *= -1;
+				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
+				if(dot(tg, dp_en) < 0)
+					dp_en *= -1;
+
+				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
+					correction = (-tb + rootd) * 0.5f * invcyla;
+					t = tcentre + correction;
+				}
+
+				if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				float w = (zcentre + (tg.z * correction)) * invl;
+				w = saturate(w);
+				/* compute u on the curve segment */
+				u = i_st * (1 - w) + i_en * w;
+
+				/* stochastic fade from minimum width */
+				if(difl != 0.0f && lcg_state) {
+					r_curr = r1 + (r2 - r1) * w;
+					r_ext = or1 + (or2 - or1) * w;
+					coverage = r_curr/r_ext;
+
+					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
+						return hit;
+				}
+			}
+			/* we found a new intersection */
+
+#ifdef __VISIBILITY_FLAG__
+			/* visibility flag test. we do it here under the assumption
+			 * that most triangles are culled by node flags */
+			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
+#endif
+			{
+				/* record intersection */
+				isect->t = t;
+				isect->u = u;
+				isect->v = gd;
+				isect->prim = curveAddr;
+				isect->object = object;
+				isect->type = type;
+				hit = true;
+			}
+
+			tree++;
+			level = tree & -tree;
+		}
+		else {
+			/* split the curve into two curves and process */
+			level = level >> 1;
+		}
+	}
+
+	return hit;
+}
+
+ccl_device_curveintersect bool curve_intersect(KernelGlobals *kg,
+                                               Intersection *isect,
+                                               float3 P,
+                                               float3 direction,
+                                               uint visibility,
+                                               int object,
+                                               int curveAddr,
+                                               float time,
+                                               int type,
+                                               uint *lcg_state,
+                                               float difl,
+                                               float extmax)
+{
+	/* define few macros to minimize code duplication for SSE */
+#ifndef __KERNEL_SSE2__
+#  define len3_squared(x) len_squared(x)
+#  define len3(x) len(x)
+#  define dot3(x, y) dot(x, y)
+#endif
+
+	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+
+	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
+		if(time < prim_time.x || time > prim_time.y) {
+			return false;
+		}
+	}
+
+	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
+	/* curve Intersection check */
+	int flags = kernel_data.curve.curveflags;
+
+	int prim = kernel_tex_fetch(__prim_index, curveAddr);
+	float4 v00 = kernel_tex_fetch(__curves, prim);
+
+	int cnum = __float_as_int(v00.x);
+	int k0 = cnum + segment;
+	int k1 = k0 + 1;
+
+#ifndef __KERNEL_SSE2__
+	float4 P_curve[2];
+
+	if(is_curve_primitive) {
+		P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
+		P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
+	}
+	else {
+		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+		motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve);
+	}
+
+	float or1 = P_curve[0].w;
+	float or2 = P_curve[1].w;
+	float3 p1 = float4_to_float3(P_curve[0]);
+	float3 p2 = float4_to_float3(P_curve[1]);
+
+	/* minimum width extension */
+	float r1 = or1;
+	float r2 = or2;
+	float3 dif = P - p1;
+	float3 dif_second = P - p2;
+	if(difl != 0.0f) {
+		float pixelsize = min(len3(dif) * difl, extmax);
+		r1 = or1 < pixelsize ? pixelsize : or1;
+		pixelsize = min(len3(dif_second) * difl, extmax);
+		r2 = or2 < pixelsize ? pixelsize : or2;
+	}
+	/* --- */
+
+	float3 p21_diff = p2 - p1;
+	float3 sphere_dif1 = (dif + dif_second) * 0.5f;
+	float3 dir = direction;
+	float sphere_b_tmp = dot3(dir, sphere_dif1);
+	float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
+#else
+	ssef P_curve[2];
+
+	if(is_curve_primitive) {
+		P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
+		P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
+	}
+	else {
+		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+		motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve);
+	}
+
+	const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
+
+	ssef r12 = or12;
+	const ssef vP = load4f(P);
+	const ssef dif = vP - P_curve[0];
+	const ssef dif_second = vP - P_curve[1];
+	if(difl != 0.0f) {
+		const ssef len1_sq = len3_squared_splat(dif);
+		const ssef len2_sq = len3_squared_splat(dif_second);
+		const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
+		const ssef pixelsize12 = min(len12 * difl, ssef(extmax));
+		r12 = max(or12, pixelsize12);
+	}
+	float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12));
+	float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12));
+
+	const ssef p21_diff = P_curve[1] - P_curve[0];
+	const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
+	const ssef dir = load4f(direction);
+	const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
+	const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1);
+#endif
+
+	float mr = max(r1, r2);
+	float l = len3(p21_diff);
+	float invl = 1.0f / l;
+	float sp_r = mr + 0.5f * l;
+
+	float sphere_b = dot3(dir, sphere_dif2);
+	float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
+
+	if(sdisc < 0.0f)
+		return false;
+
+	/* obtain parameters and test midpoint distance for suitable modes */
+#ifndef __KERNEL_SSE2__
+	float3 tg = p21_diff * invl;
+#else
+	const ssef tg = p21_diff * invl;
+#endif
+	float gd = (r2 - r1) * invl;
+
+	float dirz = dot3(dir, tg);
+	float difz = dot3(dif, tg);
+
+	float a = 1.0f - (dirz*dirz*(1 + gd*gd));
+
+	float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1));
+
+	float tcentre = -halfb/a;
+	float zcentre = difz + (dirz * tcentre);
+
+	if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
+		return false;
+	if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
+		return false;
+
+	/* test minimum separation */
+#ifndef __KERNEL_SSE2__
+	float3 cprod = cross(tg, dir);
+	float cprod2sq = len3_squared(cross(tg, dif));
+#else
+	const ssef cprod = cross(tg, dir);
+	float cprod2sq = len3_squared(cross_zxy(tg, dif));
+#endif
+	float cprodsq = len3_squared(cprod);
+	float distscaled = dot3(cprod, dif);
+
+	if(cprodsq == 0)
+		distscaled = cprod2sq;
+	else
+		distscaled = (distscaled*distscaled)/cprodsq;
+
+	if(distscaled > mr*mr)
+		return false;
+
+	/* calculate true intersection */
+#ifndef __KERNEL_SSE2__
+	float3 tdif = dif + tcentre * dir;
+#else
+	const ssef tdif = madd(ssef(tcentre), dir, dif);
+#endif
+	float tdifz = dot3(tdif, tg);
+	float tdifma = tdifz*gd + r1;
+	float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma));
+	float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
+	float td = tb*tb - 4*a*tc;
+
+	if(td < 0.0f)
+		return false;
+
+	float rootd = 0.0f;
+	float correction = 0.0f;
+	if(flags & CURVE_KN_ACCURATE) {
+		rootd = sqrtf(td);
+		correction = ((-tb - rootd)/(2*a));
+	}
+
+	float t = tcentre + correction;
+
+	if(t < isect->t) {
+
+		if(flags & CURVE_KN_INTERSECTCORRECTION) {
+			rootd = sqrtf(td);
+			correction = ((-tb - rootd)/(2*a));
+			t = tcentre + correction;
+		}
+
+		float z = zcentre + (dirz * correction);
+		// bool backface = false;
+
+		if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
+			// backface = true;
+			correction = ((-tb + rootd)/(2*a));
+			t = tcentre + correction;
+			z = zcentre + (dirz * correction);
+		}
+
+		/* stochastic fade from minimum width */
+		float adjradius = or1 + z * (or2 - or1) * invl;
+		adjradius = adjradius / (r1 + z * gd);
+		if(lcg_state && adjradius != 1.0f) {
+			if(lcg_step_float(lcg_state) > adjradius)
+				return false;
+		}
+		/* --- */
+
+		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
+
+			if(flags & CURVE_KN_ENCLOSEFILTER) {
+				float enc_ratio = 1.01f;
+				if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
+					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
+					float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
+					if(a2*c2 < 0.0f)
+						return false;
+				}
+			}
+
+#ifdef __VISIBILITY_FLAG__
+			/* visibility flag test. we do it here under the assumption
+			 * that most triangles are culled by node flags */
+			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
+#endif
+			{
+				/* record intersection */
+				isect->t = t;
+				isect->u = z*invl;
+				isect->v = gd;
+				isect->prim = curveAddr;
+				isect->object = object;
+				isect->type = type;
+
+				return true;
+			}
+		}
+	}
+
+	return false;
+
+#ifndef __KERNEL_SSE2__
+#  undef len3_squared
+#  undef len3
+#  undef dot3
+#endif
+}
+
+ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
+{
+	float fc = 0.71f;
+	float data[4];
+	float t2 = t * t;
+	data[0] = -3.0f * fc          * t2  + 4.0f * fc * t                  - fc;
+	data[1] =  3.0f * (2.0f - fc) * t2  + 2.0f * (fc - 3.0f) * t;
+	data[2] =  3.0f * (fc - 2.0f) * t2  + 2.0f * (3.0f - 2.0f * fc) * t  + fc;
+	data[3] =  3.0f * fc          * t2  - 2.0f * fc * t;
+	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
+}
+
+ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
+{
+	float data[4];
+	float fc = 0.71f;
+	float t2 = t * t;
+	float t3 = t2 * t;
+	data[0] = -fc          * t3  + 2.0f * fc          * t2 - fc * t;
+	data[1] =  (2.0f - fc) * t3  + (fc - 3.0f)        * t2 + 1.0f;
+	data[2] =  (fc - 2.0f) * t3  + (3.0f - 2.0f * fc) * t2 + fc * t;
+	data[3] =  fc          * t3  - fc * t2;
+	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
+}
+
+ccl_device_inline float3 curve_refine(KernelGlobals *kg,
+                                      ShaderData *sd,
+                                      const Intersection *isect,
+                                      const Ray *ray)
+{
+	int flag = kernel_data.curve.curveflags;
+	float t = isect->t;
+	float3 P = ray->P;
+	float3 D = ray->D;
+
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_itfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D*t);
+		D = normalize_len(D, &t);
+	}
+
+	int prim = kernel_tex_fetch(__prim_index, isect->prim);
+	float4 v00 = kernel_tex_fetch(__curves, prim);
+
+	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	int k1 = k0 + 1;
+
+	float3 tg;
+
+	if(flag & CURVE_KN_INTERPOLATE) {
+		int ka = max(k0 - 1,__float_as_int(v00.x));
+		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
+
+		float4 P_curve[4];
+
+		if(sd->type & PRIMITIVE_CURVE) {
+			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
+			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
+			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
+			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+		}
+		else {
+			motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
+		}
+
+		float3 p[4];
+		p[0] = float4_to_float3(P_curve[0]);
+		p[1] = float4_to_float3(P_curve[1]);
+		p[2] = float4_to_float3(P_curve[2]);
+		p[3] = float4_to_float3(P_curve[3]);
+
+		P = P + D*t;
+
+#ifdef __UV__
+		sd->u = isect->u;
+		sd->v = 0.0f;
+#endif
+
+		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
+
+		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
+			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
+		}
+		else {
+			/* direction from inside to surface of curve */
+			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
+			sd->Ng = normalize(P - p_curr);
+
+			/* adjustment for changing radius */
+			float gd = isect->v;
+
+			if(gd != 0.0f) {
+				sd->Ng = sd->Ng - gd * tg;
+				sd->Ng = normalize(sd->Ng);
+			}
+		}
+
+		/* todo: sometimes the normal is still so that this is detected as
+		 * backfacing even if cull backfaces is enabled */
+
+		sd->N = sd->Ng;
+	}
+	else {
+		float4 P_curve[2];
+
+		if(sd->type & PRIMITIVE_CURVE) {
+			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
+			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
+		}
+		else {
+			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+		}
+
+		float l = 1.0f;
+		tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l);
+
+		P = P + D*t;
+
+		float3 dif = P - float4_to_float3(P_curve[0]);
+
+#ifdef __UV__
+		sd->u = dot(dif,tg)/l;
+		sd->v = 0.0f;
+#endif
+
+		if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
+			sd->Ng = -(D - tg * dot(tg, D));
+			sd->Ng = normalize(sd->Ng);
+		}
+		else {
+			float gd = isect->v;
+
+			/* direction from inside to surface of curve */
+			sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
+
+			/* adjustment for changing radius */
+			if(gd != 0.0f) {
+				sd->Ng = sd->Ng - gd * tg;
+				sd->Ng = normalize(sd->Ng);
+			}
+		}
+
+		sd->N = sd->Ng;
+	}
+
+#ifdef __DPDU__
+	/* dPdu/dPdv */
+	sd->dPdu = tg;
+	sd->dPdv = cross(tg, sd->Ng);
+#endif
+
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_tfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+}
+
+#endif
+
+CCL_NAMESPACE_END
-- 
cgit v1.2.3


From 8f6b5f1f3c2af33b7aba342eca221edd096d9a9e Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brechtvanlommel@gmail.com>
Date: Tue, 8 Aug 2017 02:32:01 +0200
Subject: Fix CMake dependencies builder issues on Linux with TBB and Python
 packages.

---
 build_files/build_environment/cmake/harvest.cmake | 10 +++++++---
 build_files/build_environment/cmake/openvdb.cmake |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/build_files/build_environment/cmake/harvest.cmake b/build_files/build_environment/cmake/harvest.cmake
index 367bc7b45db..dfdfe50ab49 100644
--- a/build_files/build_environment/cmake/harvest.cmake
+++ b/build_files/build_environment/cmake/harvest.cmake
@@ -228,6 +228,7 @@ harvest(freetype/lib/libfreetype2ST.a freetype/lib/libfreetype.a)
 harvest(glew/include glew/include "*.h")
 harvest(glew/lib glew/lib "*.a")
 harvest(ilmbase openexr "*")
+harvest(ilmbase/include openexr/include "*.h")
 harvest(jemalloc/include jemalloc/include "*.h")
 harvest(jemalloc/lib jemalloc/lib "*.a")
 harvest(jpg/include jpeg/include "*.h")
@@ -266,14 +267,17 @@ harvest(png/include png/include "*.h")
 harvest(png/lib png/lib "*.a")
 harvest(python/bin python/bin "python${PYTHON_SHORT_VERSION}m")
 harvest(python/include python/include "*h")
-harvest(python/lib/libpython${PYTHON_SHORT_VERSION}m.a python/lib/python${PYTHON_SHORT_VERSION}/libpython${PYTHON_SHORT_VERSION}m.a)
 if(UNIX AND NOT APPLE)
+	harvest(python/lib/libpython${PYTHON_SHORT_VERSION}m.a python/lib/libpython${PYTHON_SHORT_VERSION}m.a)
 	harvest(python/lib/python${PYTHON_SHORT_VERSION} python/lib/python${PYTHON_SHORT_VERSION} "*")
+	harvest(requests python/lib/python${PYTHON_SHORT_VERSION}/site-packages/requests "*")
+	harvest(numpy python/lib/python${PYTHON_SHORT_VERSION}/site-packages/numpy "*")
 else()
+	harvest(python/lib/libpython${PYTHON_SHORT_VERSION}m.a python/lib/python${PYTHON_SHORT_VERSION}/libpython${PYTHON_SHORT_VERSION}m.a)
 	harvest(python/release release "*")
+	harvest(requests release/site-packages/requests "*")
+	harvest(numpy release/site-packages/numpy "*")
 endif()
-harvest(requests release/site-packages/requests "*")
-harvest(numpy release/site-packages/numpy "*")
 harvest(schroedinger/lib/libschroedinger-1.0.a ffmpeg/lib/libschroedinger.a)
 harvest(sdl/include/SDL2 sdl/include "*.h")
 harvest(sdl/lib sdl/lib "libSDL2.a")
diff --git a/build_files/build_environment/cmake/openvdb.cmake b/build_files/build_environment/cmake/openvdb.cmake
index bf9ad9ca410..a71598c1a3b 100644
--- a/build_files/build_environment/cmake/openvdb.cmake
+++ b/build_files/build_environment/cmake/openvdb.cmake
@@ -36,6 +36,7 @@ set(OPENVDB_EXTRA_ARGS
 	-DOPENEXR_INCLUDE_DIR=${LIBDIR}/openexr/include/
 	-DOPENEXR_ILMIMF_LIBRARIES=${LIBDIR}/openexr/lib/${LIBPREFIX}IlmImf-2_2${LIBEXT}
 	-DTBB_ROOT_DIR=${LIBDIR}/tbb/
+	-DTBB_INCLUDE_DIRS=${LIBDIR}/tbb/include
 	-DTBB_LIBRARY=${LIBDIR}/tbb/lib/tbb_static${LIBEXT}
 	-DBoost_COMPILER:STRING=${BOOST_COMPILER_STRING}
 	-DBoost_USE_MULTITHREADED=ON
-- 
cgit v1.2.3


From 6af7d7e05e31ef016b2d340a56311b3399f6814f Mon Sep 17 00:00:00 2001
From: Campbell Barton <ideasman42@gmail.com>
Date: Tue, 8 Aug 2017 11:28:47 +1000
Subject: Cleanup: remove incorrect comment

enum values aren't saved in files.
---
 source/blender/editors/include/UI_resources.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/source/blender/editors/include/UI_resources.h b/source/blender/editors/include/UI_resources.h
index f8a5f30a596..a0efd586af5 100644
--- a/source/blender/editors/include/UI_resources.h
+++ b/source/blender/editors/include/UI_resources.h
@@ -303,7 +303,6 @@ enum {
 	TH_EDGE_BEVEL,
 	TH_VERTEX_BEVEL
 };
-/* XXX WARNING: previous is saved in file, so do not change order! */
 
 /* specific defines per space should have higher define values */
 
-- 
cgit v1.2.3


From b53e35c655d40769e46cbe91929531fbe20d2977 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Tue, 8 Aug 2017 11:32:33 +0200
Subject: Fix compilation error when building without Blender

Simply disabled python tests, they can't be run anyway (since blender target is
not enabled) and we don't have any player-related tests in that folder.
---
 tests/CMakeLists.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index fa0b86a2637..64326f34377 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,7 +1,8 @@
 
 # Python CTests
-add_subdirectory(python)
+if(WITH_BLENDER)
+	add_subdirectory(python)
+endif()
 
 # GTest
 add_subdirectory(gtests)
-
-- 
cgit v1.2.3


From ec8ae4d5e9f735ab5aeb149dea8aa47ab8f8f977 Mon Sep 17 00:00:00 2001
From: Mai Lavelle <mai.lavelle@gmail.com>
Date: Tue, 8 Aug 2017 07:12:04 -0400
Subject: Cycles: Pack kernel textures into buffers for OpenCL

Image textures were being packed into a single buffer for OpenCL, which
limited the amount of memory available for images to the size of one
buffer (usually 4gb on AMD hardware). By packing textures into multiple
buffers that limit is removed, while simultaneously reducing the number
of buffers that need to be passed to each kernel.

Benchmarks were within 2%.

Fixes T51554.

Differential Revision: https://developer.blender.org/D2745
---
 intern/cycles/device/CMakeLists.txt                |   2 +
 intern/cycles/device/device.cpp                    |   2 -
 intern/cycles/device/device.h                      |   2 -
 intern/cycles/device/device_cpu.cpp                |   1 -
 intern/cycles/device/device_cuda.cpp               |   1 -
 intern/cycles/device/device_opencl.cpp             |   1 -
 intern/cycles/device/opencl/memory_manager.cpp     | 253 +++++++++++++++++++++
 intern/cycles/device/opencl/memory_manager.h       | 105 +++++++++
 intern/cycles/device/opencl/opencl.h               |  43 ++++
 intern/cycles/device/opencl/opencl_base.cpp        | 149 ++++++++++--
 intern/cycles/device/opencl/opencl_mega.cpp        |   5 +-
 intern/cycles/device/opencl/opencl_split.cpp       |  25 +-
 intern/cycles/kernel/kernel_compat_opencl.h        |   2 +-
 intern/cycles/kernel/kernel_globals.h              |  68 +++++-
 intern/cycles/kernel/kernel_image_opencl.h         |  66 ++++--
 intern/cycles/kernel/kernel_textures.h             |  11 +-
 intern/cycles/kernel/kernels/opencl/kernel.cl      |  45 ++--
 .../kernel/kernels/opencl/kernel_data_init.cl      |  11 +-
 .../kernel/kernels/opencl/kernel_split_function.h  |   9 +-
 intern/cycles/kernel/split/kernel_data_init.h      |   9 +-
 intern/cycles/render/image.cpp                     | 168 +-------------
 intern/cycles/render/image.h                       |  15 --
 intern/cycles/render/mesh.cpp                      |  11 +-
 intern/cycles/render/scene.cpp                     |   2 -
 intern/cycles/render/scene.h                       |   7 -
 25 files changed, 685 insertions(+), 328 deletions(-)
 create mode 100644 intern/cycles/device/opencl/memory_manager.cpp
 create mode 100644 intern/cycles/device/opencl/memory_manager.h

diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 74ec57ddf74..3c632160fbd 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -34,11 +34,13 @@ set(SRC
 
 set(SRC_OPENCL
 	opencl/opencl.h
+	opencl/memory_manager.h
 
 	opencl/opencl_base.cpp
 	opencl/opencl_mega.cpp
 	opencl/opencl_split.cpp
 	opencl/opencl_util.cpp
+	opencl/memory_manager.cpp
 )
 
 if(WITH_CYCLES_NETWORK)
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index a54bb77f9f3..f64436aec7b 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -379,11 +379,9 @@ DeviceInfo Device::get_multi_device(vector<DeviceInfo> subdevices)
 	info.num = 0;
 
 	info.has_bindless_textures = true;
-	info.pack_images = false;
 	foreach(DeviceInfo &device, subdevices) {
 		assert(device.type == info.multi_devices[0].type);
 
-		info.pack_images |= device.pack_images;
 		info.has_bindless_textures &= device.has_bindless_textures;
 	}
 
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index b3b693c630c..26d6d380a10 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -53,7 +53,6 @@ public:
 	int num;
 	bool display_device;
 	bool advanced_shading;
-	bool pack_images;
 	bool has_bindless_textures; /* flag for GPU and Multi device */
 	bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
 	vector<DeviceInfo> multi_devices;
@@ -65,7 +64,6 @@ public:
 		num = 0;
 		display_device = false;
 		advanced_shading = true;
-		pack_images = false;
 		has_bindless_textures = false;
 		use_split_kernel = false;
 	}
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index a00be3eeaab..6e09c5f88c2 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -977,7 +977,6 @@ void device_cpu_info(vector<DeviceInfo>& devices)
 	info.id = "CPU";
 	info.num = 0;
 	info.advanced_shading = true;
-	info.pack_images = false;
 
 	devices.insert(devices.begin(), info);
 }
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index dbf636e1405..6769ed0229e 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -2164,7 +2164,6 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 
 		info.advanced_shading = (major >= 2);
 		info.has_bindless_textures = (major >= 3);
-		info.pack_images = false;
 
 		int pci_location[3] = {0, 0, 0};
 		cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 681b8214b03..aa380ec4b94 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -95,7 +95,6 @@ void device_opencl_info(vector<DeviceInfo>& devices)
 		/* We don't know if it's used for display, but assume it is. */
 		info.display_device = true;
 		info.advanced_shading = OpenCLInfo::kernel_use_advanced_shading(platform_name);
-		info.pack_images = true;
 		info.use_split_kernel = OpenCLInfo::kernel_use_split(platform_name,
 		                                                     device_type);
 		info.id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp
new file mode 100644
index 00000000000..b67dfef88aa
--- /dev/null
+++ b/intern/cycles/device/opencl/memory_manager.cpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPENCL
+
+#include "util/util_foreach.h"
+
+#include "device/opencl/opencl.h"
+#include "device/opencl/memory_manager.h"
+
+CCL_NAMESPACE_BEGIN
+
+void MemoryManager::DeviceBuffer::add_allocation(Allocation& allocation)
+{
+	allocations.push_back(&allocation);
+}
+
+void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDeviceBase *device)
+{
+	bool need_realloc = false;
+
+	/* Calculate total size and remove any freed. */
+	size_t total_size = 0;
+
+	for(int i = allocations.size()-1; i >= 0; i--) {
+		Allocation* allocation = allocations[i];
+
+		/* Remove allocations that have been freed. */
+		if(!allocation->mem || allocation->mem->memory_size() == 0) {
+			allocation->device_buffer = NULL;
+			allocation->size = 0;
+
+			allocations.erase(allocations.begin()+i);
+
+			need_realloc = true;
+
+			continue;
+		}
+
+		/* Get actual size for allocation. */
+		size_t alloc_size = align_up(allocation->mem->memory_size(), 16);
+
+		if(allocation->size != alloc_size) {
+			/* Allocation is either new or resized. */
+			allocation->size = alloc_size;
+			allocation->needs_copy_to_device = true;
+
+			need_realloc = true;
+		}
+
+		total_size += alloc_size;
+	}
+
+	if(need_realloc) {
+		cl_ulong max_buffer_size;
+		clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
+
+		if(total_size > max_buffer_size) {
+			device->set_error("Scene too complex to fit in available memory.");
+			return;
+		}
+
+		device_memory *new_buffer = new device_memory;
+
+		new_buffer->resize(total_size);
+		device->mem_alloc(string_printf("buffer_%p", this).data(), *new_buffer, MEM_READ_ONLY);
+
+		size_t offset = 0;
+
+		foreach(Allocation* allocation, allocations) {
+			if(allocation->needs_copy_to_device) {
+				/* Copy from host to device. */
+				opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue,
+					CL_MEM_PTR(new_buffer->device_pointer),
+					CL_FALSE,
+					offset,
+					allocation->mem->memory_size(),
+					(void*)allocation->mem->data_pointer,
+					0, NULL, NULL
+				));
+
+				allocation->needs_copy_to_device = false;
+			}
+			else {
+				/* Fast copy from memory already on device. */
+				opencl_device_assert(device, clEnqueueCopyBuffer(device->cqCommandQueue,
+					CL_MEM_PTR(buffer->device_pointer),
+					CL_MEM_PTR(new_buffer->device_pointer),
+					allocation->desc.offset,
+					offset,
+					allocation->mem->memory_size(),
+					0, NULL, NULL
+				));
+			}
+
+			allocation->desc.offset = offset;
+			offset += allocation->size;
+		}
+
+		device->mem_free(*buffer);
+		delete buffer;
+
+		buffer = new_buffer;
+	}
+	else {
+		assert(total_size == buffer->data_size);
+
+		size_t offset = 0;
+
+		foreach(Allocation* allocation, allocations) {
+			if(allocation->needs_copy_to_device) {
+				/* Copy from host to device. */
+				opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue,
+					CL_MEM_PTR(buffer->device_pointer),
+					CL_FALSE,
+					offset,
+					allocation->mem->memory_size(),
+					(void*)allocation->mem->data_pointer,
+					0, NULL, NULL
+				));
+
+				allocation->needs_copy_to_device = false;
+			}
+
+			offset += allocation->size;
+		}
+	}
+
+	/* Not really necessary, but seems to improve responsiveness for some reason. */
+	clFinish(device->cqCommandQueue);
+}
+
+void MemoryManager::DeviceBuffer::free(OpenCLDeviceBase *device)
+{
+	device->mem_free(*buffer);
+}
+
+MemoryManager::DeviceBuffer* MemoryManager::smallest_device_buffer()
+{
+	DeviceBuffer* smallest = device_buffers;
+
+	foreach(DeviceBuffer& device_buffer, device_buffers) {
+		if(device_buffer.size < smallest->size) {
+			smallest = &device_buffer;
+		}
+	}
+
+	return smallest;
+}
+
+MemoryManager::MemoryManager(OpenCLDeviceBase *device) : device(device), need_update(false)
+{
+}
+
+void MemoryManager::free()
+{
+	foreach(DeviceBuffer& device_buffer, device_buffers) {
+		device_buffer.free(device);
+	}
+}
+
+void MemoryManager::alloc(const char *name, device_memory& mem)
+{
+	Allocation& allocation = allocations[name];
+
+	allocation.mem = &mem;
+	allocation.needs_copy_to_device = true;
+
+	if(!allocation.device_buffer) {
+		DeviceBuffer* device_buffer = smallest_device_buffer();
+		allocation.device_buffer = device_buffer;
+
+		allocation.desc.device_buffer = device_buffer - device_buffers;
+
+		device_buffer->add_allocation(allocation);
+
+		device_buffer->size += mem.memory_size();
+	}
+
+	need_update = true;
+}
+
+bool MemoryManager::free(device_memory& mem)
+{
+	foreach(AllocationsMap::value_type& value, allocations) {
+		Allocation& allocation = value.second;
+		if(allocation.mem == &mem) {
+
+			allocation.device_buffer->size -= mem.memory_size();
+
+			allocation.mem = NULL;
+			allocation.needs_copy_to_device = false;
+
+			need_update = true;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name)
+{
+	update_device_memory();
+
+	Allocation& allocation = allocations[name];
+	return allocation.desc;
+}
+
+void MemoryManager::update_device_memory()
+{
+	if(!need_update) {
+		return;
+	}
+
+	need_update = false;
+
+	foreach(DeviceBuffer& device_buffer, device_buffers) {
+		device_buffer.update_device_memory(device);
+	}
+}
+
+void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
+{
+	update_device_memory();
+
+	foreach(DeviceBuffer& device_buffer, device_buffers) {
+		if(device_buffer.buffer->device_pointer) {
+			device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer);
+		}
+		else {
+			device->kernel_set_args(kernel, (*narg)++, device->null_mem);
+		}
+	}
+}
+
+CCL_NAMESPACE_END
+
+#endif  /* WITH_OPENCL */
+
diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h
new file mode 100644
index 00000000000..3714405d026
--- /dev/null
+++ b/intern/cycles/device/opencl/memory_manager.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device.h"
+
+#include "util/util_map.h"
+#include "util/util_vector.h"
+#include "util/util_string.h"
+
+#include "clew.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OpenCLDeviceBase;
+
+class MemoryManager {
+public:
+	static const int NUM_DEVICE_BUFFERS = 8;
+
+	struct BufferDescriptor {
+		uint device_buffer;
+		cl_ulong offset;
+	};
+
+private:
+	struct DeviceBuffer;
+
+	struct Allocation {
+		device_memory *mem;
+
+		DeviceBuffer *device_buffer;
+		size_t size; /* Size of actual allocation, may be larger than requested. */
+
+		BufferDescriptor desc;
+
+		bool needs_copy_to_device;
+
+		Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false)
+		{
+		}
+	};
+
+	struct DeviceBuffer {
+		device_memory *buffer;
+		vector<Allocation*> allocations;
+		size_t size; /* Size of all allocations. */
+
+		DeviceBuffer() : buffer(new device_memory), size(0)
+		{
+		}
+
+		~DeviceBuffer() {
+			delete buffer;
+			buffer = NULL;
+		}
+
+		void add_allocation(Allocation& allocation);
+
+		void update_device_memory(OpenCLDeviceBase *device);
+
+		void free(OpenCLDeviceBase *device);
+	};
+
+	OpenCLDeviceBase *device;
+
+	DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS];
+
+	typedef unordered_map<string, Allocation> AllocationsMap;
+	AllocationsMap allocations;
+
+	bool need_update;
+
+	DeviceBuffer* smallest_device_buffer();
+
+public:
+	MemoryManager(OpenCLDeviceBase *device);
+
+	void free(); /* Free all memory. */
+
+	void alloc(const char *name, device_memory& mem);
+	bool free(device_memory& mem);
+
+	BufferDescriptor get_descriptor(string name);
+
+	void update_device_memory();
+	void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
+};
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 78ca377d933..0dae9136870 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -25,6 +25,8 @@
 
 #include "clew.h"
 
+#include "device/opencl/memory_manager.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* Disable workarounds, seems to be working fine on latest drivers. */
@@ -224,6 +226,18 @@ public:
 	static string get_kernel_md5();
 };
 
+#define opencl_device_assert(device, stmt) \
+	{ \
+		cl_int err = stmt; \
+		\
+		if(err != CL_SUCCESS) { \
+			string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
+			if((device)->error_msg == "") \
+				(device)->error_msg = message; \
+			fprintf(stderr, "%s\n", message.c_str()); \
+		} \
+	} (void)0
+
 #define opencl_assert(stmt) \
 	{ \
 		cl_int err = stmt; \
@@ -344,6 +358,7 @@ public:
 	size_t global_size_round_up(int group_size, int global_size);
 	void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
 	void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
+	void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
 
 	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
 	void shader(DeviceTask& task);
@@ -525,6 +540,34 @@ protected:
 
 	virtual string build_options_for_base_program(
 	        const DeviceRequestedFeatures& /*requested_features*/);
+
+private:
+	MemoryManager memory_manager;
+	friend MemoryManager;
+
+	struct tex_info_t {
+		uint buffer, padding;
+		cl_ulong offset;
+		uint width, height, depth, options;
+	};
+	static_assert_align(tex_info_t, 16);
+
+	vector<tex_info_t> texture_descriptors;
+	device_memory texture_descriptors_buffer;
+
+	struct Texture {
+		device_memory* mem;
+		InterpolationType interpolation;
+		ExtensionType extension;
+	};
+
+	typedef map<string, Texture> TexturesMap;
+	TexturesMap textures;
+
+	bool textures_need_update;
+
+protected:
+	void flush_texture_buffers();
 };
 
 Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background);
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index 509da7a0a84..63b5e004b7d 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -63,7 +63,7 @@ void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where)
 }
 
 OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
-: Device(info, stats, background_)
+: Device(info, stats, background_), memory_manager(this)
 {
 	cpPlatform = NULL;
 	cdDevice = NULL;
@@ -71,6 +71,7 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 	cqCommandQueue = NULL;
 	null_mem = 0;
 	device_initialized = false;
+	textures_need_update = true;
 
 	vector<OpenCLPlatformDevice> usable_devices;
 	OpenCLInfo::get_usable_devices(&usable_devices);
@@ -126,6 +127,12 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 		return;
 	}
 
+	/* Allocate this right away so that texture_descriptors_buffer is placed at offset 0 in the device memory buffers */
+	texture_descriptors.resize(1);
+	texture_descriptors_buffer.resize(1);
+	texture_descriptors_buffer.data_pointer = (device_ptr)&texture_descriptors[0];
+	memory_manager.alloc("texture_descriptors", texture_descriptors_buffer);
+
 	fprintf(stderr, "Device init success\n");
 	device_initialized = true;
 }
@@ -134,6 +141,8 @@ OpenCLDeviceBase::~OpenCLDeviceBase()
 {
 	task_pool.stop();
 
+	memory_manager.free();
+
 	if(null_mem)
 		clReleaseMemObject(CL_MEM_PTR(null_mem));
 
@@ -493,29 +502,31 @@ void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
 
 void OpenCLDeviceBase::tex_alloc(const char *name,
                device_memory& mem,
-               InterpolationType /*interpolation*/,
-               ExtensionType /*extension*/)
+               InterpolationType interpolation,
+               ExtensionType extension)
 {
 	VLOG(1) << "Texture allocate: " << name << ", "
 	        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 	        << string_human_readable_size(mem.memory_size()) << ")";
-	mem_alloc(NULL, mem, MEM_READ_ONLY);
-	mem_copy_to(mem);
-	assert(mem_map.find(name) == mem_map.end());
-	mem_map.insert(MemMap::value_type(name, mem.device_pointer));
+
+	memory_manager.alloc(name, mem);
+
+	textures[name] = {&mem, interpolation, extension};
+
+	textures_need_update = true;
 }
 
 void OpenCLDeviceBase::tex_free(device_memory& mem)
 {
-	if(mem.device_pointer) {
-		foreach(const MemMap::value_type& value, mem_map) {
-			if(value.second == mem.device_pointer) {
-				mem_map.erase(value.first);
-				break;
-			}
-		}
+	if(memory_manager.free(mem)) {
+		textures_need_update = true;
+	}
 
-		mem_free(mem);
+	foreach(TexturesMap::value_type& value, textures) {
+		if(value.second.mem == &mem) {
+			textures.erase(value.first);
+			break;
+		}
 	}
 }
 
@@ -581,6 +592,104 @@ void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const
 	opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
 }
 
+void OpenCLDeviceBase::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
+{
+	flush_texture_buffers();
+
+	memory_manager.set_kernel_arg_buffers(kernel, narg);
+}
+
+void OpenCLDeviceBase::flush_texture_buffers()
+{
+	if(!textures_need_update) {
+		return;
+	}
+	textures_need_update = false;
+
+	/* Setup slots for textures. */
+	int num_slots = 0;
+
+	struct texture_slot_t {
+		string name;
+		int slot;
+	};
+
+	vector<texture_slot_t> texture_slots;
+
+#define KERNEL_TEX(type, ttype, name) \
+	if(textures.find(#name) != textures.end()) { \
+		texture_slots.push_back({#name, num_slots}); \
+	} \
+	num_slots++;
+#include "kernel/kernel_textures.h"
+
+	int num_data_slots = num_slots;
+
+	foreach(TexturesMap::value_type& tex, textures) {
+		string name = tex.first;
+
+		if(string_startswith(name, "__tex_image")) {
+			int pos = name.rfind("_");
+			int id = atoi(name.data() + pos + 1);
+
+			texture_slots.push_back({name, num_data_slots + id});
+
+			num_slots = max(num_slots, num_data_slots + id + 1);
+		}
+	}
+
+	/* Realloc texture descriptors buffer. */
+	memory_manager.free(texture_descriptors_buffer);
+
+	texture_descriptors.resize(num_slots);
+	texture_descriptors_buffer.resize(num_slots * sizeof(tex_info_t));
+	texture_descriptors_buffer.data_pointer = (device_ptr)&texture_descriptors[0];
+
+	memory_manager.alloc("texture_descriptors", texture_descriptors_buffer);
+
+	/* Fill in descriptors */
+	foreach(texture_slot_t& slot, texture_slots) {
+		Texture& tex = textures[slot.name];
+
+		tex_info_t& info = texture_descriptors[slot.slot];
+
+		MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
+
+		info.offset = desc.offset;
+		info.buffer = desc.device_buffer;
+
+		if(string_startswith(slot.name, "__tex_image")) {
+			info.width = tex.mem->data_width;
+			info.height = tex.mem->data_height;
+			info.depth = tex.mem->data_depth;
+
+			info.options = 0;
+
+			if(tex.interpolation == INTERPOLATION_CLOSEST) {
+				info.options |= (1 << 0);
+			}
+
+			switch(tex.extension) {
+				case EXTENSION_REPEAT:
+					info.options |= (1 << 1);
+					break;
+				case EXTENSION_EXTEND:
+					info.options |= (1 << 2);
+					break;
+				case EXTENSION_CLIP:
+					info.options |= (1 << 3);
+					break;
+				default:
+					break;
+			}
+		}
+	}
+
+	/* Force write of descriptors. */
+	memory_manager.free(texture_descriptors_buffer);
+	memory_manager.alloc("texture_descriptors", texture_descriptors_buffer);
+}
+
 void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
 {
 	/* cast arguments to cl types */
@@ -605,10 +714,7 @@ void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_
 		                d_rgba,
 		                d_buffer);
 
-#define KERNEL_TEX(type, ttype, name) \
-set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
-#include "kernel/kernel_textures.h"
-#undef KERNEL_TEX
+	set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
 
 	start_arg_index += kernel_set_args(ckFilmConvertKernel,
 	                                   start_arg_index,
@@ -1030,10 +1136,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task)
 		                                   d_output_luma);
 	}
 
-#define KERNEL_TEX(type, ttype, name) \
-	set_kernel_arg_mem(kernel, &start_arg_index, #name);
-#include "kernel/kernel_textures.h"
-#undef KERNEL_TEX
+	set_kernel_arg_buffers(kernel, &start_arg_index);
 
 	start_arg_index += kernel_set_args(kernel,
 	                                   start_arg_index,
diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp
index 06c15bcf401..ec47fdafa3d 100644
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -82,10 +82,7 @@ public:
 			                d_buffer,
 			                d_rng_state);
 
-#define KERNEL_TEX(type, ttype, name) \
-		set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
-#include "kernel/kernel_textures.h"
-#undef KERNEL_TEX
+		set_kernel_arg_buffers(ckPathTraceKernel, &start_arg_index);
 
 		start_arg_index += kernel_set_args(ckPathTraceKernel,
 		                                   start_arg_index,
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 76d9983e9a2..df7c064a24f 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -99,6 +99,8 @@ public:
 
 	void thread_run(DeviceTask *task)
 	{
+		flush_texture_buffers();
+
 		if(task->type == DeviceTask::FILM_CONVERT) {
 			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
 		}
@@ -113,10 +115,19 @@ public:
 			 */
 			typedef struct KernelGlobals {
 				ccl_constant KernelData *data;
+				ccl_global char *buffers[8];
+
+				typedef struct _tex_info_t {
+					uint buffer, padding;
+					ulong offset;
+					uint width, height, depth, options;
+				} _tex_info_t;
+
 #define KERNEL_TEX(type, ttype, name) \
-				ccl_global type *name;
+				_tex_info_t name;
 #include "kernel/kernel_textures.h"
 #undef KERNEL_TEX
+
 				SplitData split_data;
 				SplitParams split_param_data;
 			} KernelGlobals;
@@ -217,11 +228,7 @@ public:
 					            *cached_memory.ray_state,
 					            *cached_memory.rng_state);
 
-/* TODO(sergey): Avoid map lookup here. */
-#define KERNEL_TEX(type, ttype, name) \
-				device->set_kernel_arg_mem(program(), &start_arg_index, #name);
-#include "kernel/kernel_textures.h"
-#undef KERNEL_TEX
+				device->set_kernel_arg_buffers(program(), &start_arg_index);
 
 			start_arg_index +=
 				device->kernel_set_args(program(),
@@ -352,11 +359,7 @@ public:
 			                ray_state,
 			                rtile.rng_state);
 
-/* TODO(sergey): Avoid map lookup here. */
-#define KERNEL_TEX(type, ttype, name) \
-	device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name);
-#include "kernel/kernel_textures.h"
-#undef KERNEL_TEX
+			device->set_kernel_arg_buffers(device->program_data_init(), &start_arg_index);
 
 		start_arg_index +=
 			device->kernel_set_args(device->program_data_init(),
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index ece99b4313a..21eba971688 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -142,7 +142,7 @@
 
 /* data lookup defines */
 #define kernel_data (*kg->data)
-#define kernel_tex_fetch(t, index) kg->t[index]
+#define kernel_tex_fetch(tex, index) ((ccl_global tex##_t*)(kg->buffers[kg->tex.buffer] + kg->tex.offset))[(index)]
 
 /* define NULL */
 #define NULL 0
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index f95f0d98c52..c078f09e1d7 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -23,6 +23,10 @@
 #  include "util/util_vector.h"
 #endif
 
+#ifdef __KERNEL_OPENCL__
+#  include "util/util_atomic.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
@@ -109,11 +113,22 @@ typedef struct KernelGlobals {
 
 #ifdef __KERNEL_OPENCL__
 
+#  define KERNEL_TEX(type, ttype, name) \
+typedef type name##_t;
+#  include "kernel/kernel_textures.h"
+
+typedef struct tex_info_t {
+	uint buffer, padding;
+	ulong offset;
+	uint width, height, depth, options;
+} tex_info_t;
+
 typedef ccl_addr_space struct KernelGlobals {
 	ccl_constant KernelData *data;
+	ccl_global char *buffers[8];
 
 #  define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name;
+	tex_info_t name;
 #  include "kernel/kernel_textures.h"
 
 #  ifdef __SPLIT_KERNEL__
@@ -122,6 +137,57 @@ typedef ccl_addr_space struct KernelGlobals {
 #  endif
 } KernelGlobals;
 
+#define KERNEL_BUFFER_PARAMS \
+	ccl_global char *buffer0, \
+	ccl_global char *buffer1, \
+	ccl_global char *buffer2, \
+	ccl_global char *buffer3, \
+	ccl_global char *buffer4, \
+	ccl_global char *buffer5, \
+	ccl_global char *buffer6, \
+	ccl_global char *buffer7
+
+#define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7
+
+ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS)
+{
+#ifdef __SPLIT_KERNEL__
+	if(ccl_local_id(0) + ccl_local_id(1) == 0)
+#endif
+	{
+		kg->buffers[0] = buffer0;
+		kg->buffers[1] = buffer1;
+		kg->buffers[2] = buffer2;
+		kg->buffers[3] = buffer3;
+		kg->buffers[4] = buffer4;
+		kg->buffers[5] = buffer5;
+		kg->buffers[6] = buffer6;
+		kg->buffers[7] = buffer7;
+	}
+
+#  ifdef __SPLIT_KERNEL__
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+#  endif
+}
+
+ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg)
+{
+#  ifdef __SPLIT_KERNEL__
+	if(ccl_local_id(0) + ccl_local_id(1) == 0)
+#  endif
+	{
+		ccl_global tex_info_t *info = (ccl_global tex_info_t*)kg->buffers[0];
+
+#  define KERNEL_TEX(type, ttype, name) \
+		kg->name = *(info++);
+#  include "kernel/kernel_textures.h"
+	}
+
+#  ifdef __SPLIT_KERNEL__
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+#  endif
+}
+
 #endif  /* __KERNEL_OPENCL__ */
 
 /* Interpolated lookup table access */
diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h
index 90747e09357..9e3373432ec 100644
--- a/intern/cycles/kernel/kernel_image_opencl.h
+++ b/intern/cycles/kernel/kernel_image_opencl.h
@@ -15,30 +15,42 @@
  */
 
 
-/* For OpenCL all images are packed in a single array, and we do manual lookup
- * and interpolation. */
+/* For OpenCL we do manual lookup and interpolation. */
+
+ccl_device_inline ccl_global tex_info_t* kernel_tex_info(KernelGlobals *kg, uint id) {
+	const uint tex_offset = id
+#define KERNEL_TEX(type, ttype, name) + 1
+#include "kernel/kernel_textures.h"
+	;
+
+	return &((ccl_global tex_info_t*)kg->buffers[0])[tex_offset];
+}
+
+#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->buffer] + info->offset))[(index)]
 
 ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
 {
+	const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
 	const int texture_type = kernel_tex_type(id);
+
 	/* Float4 */
 	if(texture_type == IMAGE_DATA_TYPE_FLOAT4) {
-		return kernel_tex_fetch(__tex_image_float4_packed, offset);
+		return tex_fetch(float4, info, offset);
 	}
 	/* Byte4 */
 	else if(texture_type == IMAGE_DATA_TYPE_BYTE4) {
-		uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset);
+		uchar4 r = tex_fetch(uchar4, info, offset);
 		float f = 1.0f/255.0f;
 		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
 	}
 	/* Float */
 	else if(texture_type == IMAGE_DATA_TYPE_FLOAT) {
-		float f = kernel_tex_fetch(__tex_image_float_packed, offset);
+		float f = tex_fetch(float, info, offset);
 		return make_float4(f, f, f, 1.0f);
 	}
 	/* Byte */
 	else {
-		uchar r = kernel_tex_fetch(__tex_image_byte_packed, offset);
+		uchar r = tex_fetch(uchar, info, offset);
 		float f = r * (1.0f/255.0f);
 		return make_float4(f, f, f, 1.0f);
 	}
@@ -64,17 +76,17 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix)
 	return x - (float)i;
 }
 
-ccl_device_inline uint kernel_decode_image_interpolation(uint4 info)
+ccl_device_inline uint kernel_decode_image_interpolation(uint info)
 {
-	return (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
+	return (info & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
 }
 
-ccl_device_inline uint kernel_decode_image_extension(uint4 info)
+ccl_device_inline uint kernel_decode_image_extension(uint info)
 {
-	if(info.w & (1 << 1)) {
+	if(info & (1 << 1)) {
 		return EXTENSION_REPEAT;
 	}
-	else if(info.w & (1 << 2)) {
+	else if(info & (1 << 2)) {
 		return EXTENSION_EXTEND;
 	}
 	else {
@@ -84,13 +96,16 @@ ccl_device_inline uint kernel_decode_image_extension(uint4 info)
 
 ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
 {
-	uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);
-	uint width = info.x;
-	uint height = info.y;
-	uint offset = info.z;
+	const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
+
+	uint width = info->width;
+	uint height = info->height;
+	uint offset = 0;
+
 	/* Decode image options. */
-	uint interpolation = kernel_decode_image_interpolation(info);
-	uint extension = kernel_decode_image_extension(info);
+	uint interpolation = kernel_decode_image_interpolation(info->options);
+	uint extension = kernel_decode_image_extension(info->options);
+
 	/* Actual sampling. */
 	float4 r;
 	int ix, iy, nix, niy;
@@ -150,14 +165,17 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
 
 ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z)
 {
-	uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);
-	uint width = info.x;
-	uint height = info.y;
-	uint offset = info.z;
-	uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x;
+	const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
+
+	uint width = info->width;
+	uint height = info->height;
+	uint offset = 0;
+	uint depth = info->depth;
+
 	/* Decode image options. */
-	uint interpolation = kernel_decode_image_interpolation(info);
-	uint extension = kernel_decode_image_extension(info);
+	uint interpolation = kernel_decode_image_interpolation(info->options);
+	uint extension = kernel_decode_image_extension(info->options);
+
 	/* Actual sampling. */
 	float4 r;
 	int ix, iy, iz, nix, niy, niz;
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index aa5b32803a5..dc6bbbb9924 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -184,15 +184,8 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665)
 #  else
 /* bindless textures */
 KERNEL_TEX(uint, texture_uint, __bindless_mapping)
-#  endif
-#endif
-
-/* packed image (opencl) */
-KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed)
-KERNEL_TEX(float4, texture_float4, __tex_image_float4_packed)
-KERNEL_TEX(uchar, texture_uchar, __tex_image_byte_packed)
-KERNEL_TEX(float, texture_float, __tex_image_float_packed)
-KERNEL_TEX(uint4, texture_uint4, __tex_image_packed_info)
+#  endif  /* __CUDA_ARCH__ */
+#endif  /* __KERNEL_CUDA__ */
 
 #undef KERNEL_TEX
 #undef KERNEL_IMAGE_TEX
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index 078acc1631e..83d63b4fba3 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -52,9 +52,7 @@ __kernel void kernel_ocl_path_trace(
 	ccl_global float *buffer,
 	ccl_global uint *rng_state,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel/kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	int sample,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -63,9 +61,8 @@ __kernel void kernel_ocl_path_trace(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel/kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
 	int x = sx + ccl_global_id(0);
 	int y = sy + ccl_global_id(1);
@@ -82,9 +79,7 @@ __kernel void kernel_ocl_shader(
 	ccl_global float4 *output,
 	ccl_global float *output_luma,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel/kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	int type, int sx, int sw, int offset, int sample)
 {
@@ -92,9 +87,8 @@ __kernel void kernel_ocl_shader(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel/kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
 	int x = sx + ccl_global_id(0);
 
@@ -114,9 +108,7 @@ __kernel void kernel_ocl_bake(
 	ccl_global uint4 *input,
 	ccl_global float4 *output,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel/kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	int type, int filter, int sx, int sw, int offset, int sample)
 {
@@ -124,9 +116,8 @@ __kernel void kernel_ocl_bake(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel/kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
 	int x = sx + ccl_global_id(0);
 
@@ -144,9 +135,7 @@ __kernel void kernel_ocl_convert_to_byte(
 	ccl_global uchar4 *rgba,
 	ccl_global float *buffer,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel/kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	float sample_scale,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -155,9 +144,8 @@ __kernel void kernel_ocl_convert_to_byte(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel/kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
 	int x = sx + ccl_global_id(0);
 	int y = sy + ccl_global_id(1);
@@ -171,9 +159,7 @@ __kernel void kernel_ocl_convert_to_half_float(
 	ccl_global uchar4 *rgba,
 	ccl_global float *buffer,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel/kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	float sample_scale,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -182,9 +168,8 @@ __kernel void kernel_ocl_convert_to_half_float(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel/kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
 	int x = sx + ccl_global_id(0);
 	int y = sy + ccl_global_id(1);
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
index 8b85d362f8a..95b35e40a45 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -25,11 +25,7 @@ __kernel void kernel_ocl_path_trace_data_init(
         int num_elements,
         ccl_global char *ray_state,
         ccl_global uint *rng_state,
-
-#define KERNEL_TEX(type, ttype, name)                                   \
-        ccl_global type *name,
-#include "kernel/kernel_textures.h"
-
+		KERNEL_BUFFER_PARAMS,
         int start_sample,
         int end_sample,
         int sx, int sy, int sw, int sh, int offset, int stride,
@@ -46,10 +42,7 @@ __kernel void kernel_ocl_path_trace_data_init(
 	                 num_elements,
 	                 ray_state,
 	                 rng_state,
-
-#define KERNEL_TEX(type, ttype, name) name,
-#include "kernel/kernel_textures.h"
-
+	                 KERNEL_BUFFER_ARGS,
 	                 start_sample,
 	                 end_sample,
 	                 sx, sy, sw, sh, offset, stride,
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
index f1e914a70d4..591c3846ef2 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
@@ -25,9 +25,7 @@ __kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
 		ccl_global char *ray_state,
 		ccl_global uint *rng_state,
 
-#define KERNEL_TEX(type, ttype, name) \
-		ccl_global type *name,
-#include "kernel/kernel_textures.h"
+		KERNEL_BUFFER_PARAMS,
 
 		ccl_global int *queue_index,
 		ccl_global char *use_queues_flag,
@@ -52,12 +50,9 @@ __kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
 
 		split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state);
 
-#define KERNEL_TEX(type, ttype, name) \
-		kg->name = name;
-#include "kernel/kernel_textures.h"
 	}
 
-	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
 
 	KERNEL_NAME_EVAL(kernel, KERNEL_NAME)(
 			kg
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index e4545d66eff..6f3297de342 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -52,9 +52,7 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
         ccl_global uint *rng_state,
 
 #ifdef __KERNEL_OPENCL__
-#define KERNEL_TEX(type, ttype, name)                                   \
-        ccl_global type *name,
-#include "kernel/kernel_textures.h"
+		KERNEL_BUFFER_PARAMS,
 #endif
 
         int start_sample,
@@ -100,9 +98,8 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
 	split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
 
 #ifdef __KERNEL_OPENCL__
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel/kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 #endif
 
 	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index a490f10aee4..80ec77f8b4a 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -43,7 +43,6 @@ static bool isfinite(half /*value*/)
 ImageManager::ImageManager(const DeviceInfo& info)
 {
 	need_update = true;
-	pack_images = false;
 	osl_texture_system = NULL;
 	animation_frame = 0;
 
@@ -87,11 +86,6 @@ ImageManager::~ImageManager()
 	}
 }
 
-void ImageManager::set_pack_images(bool pack_images_)
-{
-	pack_images = pack_images_;
-}
-
 void ImageManager::set_osl_texture_system(void *texture_system)
 {
 	osl_texture_system = texture_system;
@@ -742,7 +736,7 @@ void ImageManager::device_load_image(Device *device,
 			pixels[3] = TEX_IMAGE_MISSING_A;
 		}
 
-		if(!pack_images) {
+		{
 			thread_scoped_lock device_lock(device_mutex);
 			device->tex_alloc(name.c_str(),
 			                  tex_img,
@@ -771,7 +765,7 @@ void ImageManager::device_load_image(Device *device,
 			pixels[0] = TEX_IMAGE_MISSING_R;
 		}
 
-		if(!pack_images) {
+		{
 			thread_scoped_lock device_lock(device_mutex);
 			device->tex_alloc(name.c_str(),
 			                  tex_img,
@@ -803,7 +797,7 @@ void ImageManager::device_load_image(Device *device,
 			pixels[3] = (TEX_IMAGE_MISSING_A * 255);
 		}
 
-		if(!pack_images) {
+		{
 			thread_scoped_lock device_lock(device_mutex);
 			device->tex_alloc(name.c_str(),
 			                  tex_img,
@@ -831,7 +825,7 @@ void ImageManager::device_load_image(Device *device,
 			pixels[0] = (TEX_IMAGE_MISSING_R * 255);
 		}
 
-		if(!pack_images) {
+		{
 			thread_scoped_lock device_lock(device_mutex);
 			device->tex_alloc(name.c_str(),
 			                  tex_img,
@@ -862,7 +856,7 @@ void ImageManager::device_load_image(Device *device,
 			pixels[3] = TEX_IMAGE_MISSING_A;
 		}
 
-		if(!pack_images) {
+		{
 			thread_scoped_lock device_lock(device_mutex);
 			device->tex_alloc(name.c_str(),
 			                  tex_img,
@@ -890,7 +884,7 @@ void ImageManager::device_load_image(Device *device,
 			pixels[0] = TEX_IMAGE_MISSING_R;
 		}
 
-		if(!pack_images) {
+		{
 			thread_scoped_lock device_lock(device_mutex);
 			device->tex_alloc(name.c_str(),
 			                  tex_img,
@@ -1047,9 +1041,6 @@ void ImageManager::device_update(Device *device,
 
 	pool.wait_work();
 
-	if(pack_images)
-		device_pack_images(device, dscene, progress);
-
 	need_update = false;
 }
 
@@ -1079,141 +1070,6 @@ void ImageManager::device_update_slot(Device *device,
 	}
 }
 
-uint8_t ImageManager::pack_image_options(ImageDataType type, size_t slot)
-{
-	uint8_t options = 0;
-	/* Image Options are packed into one uint:
-	 * bit 0 -> Interpolation
-	 * bit 1 + 2 + 3 -> Extension
-	 */
-	if(images[type][slot]->interpolation == INTERPOLATION_CLOSEST) {
-		options |= (1 << 0);
-	}
-	if(images[type][slot]->extension == EXTENSION_REPEAT) {
-		options |= (1 << 1);
-	}
-	else if(images[type][slot]->extension == EXTENSION_EXTEND) {
-		options |= (1 << 2);
-	}
-	else /* EXTENSION_CLIP */ {
-		options |= (1 << 3);
-	}
-	return options;
-}
-
-template<typename T>
-void ImageManager::device_pack_images_type(
-        ImageDataType type,
-        const vector<device_vector<T>*>& cpu_textures,
-        device_vector<T> *device_image,
-        uint4 *info)
-{
-	size_t size = 0, offset = 0;
-	/* First step is to calculate size of the texture we need. */
-	for(size_t slot = 0; slot < images[type].size(); slot++) {
-		if(images[type][slot] == NULL) {
-			continue;
-		}
-		device_vector<T>& tex_img = *cpu_textures[slot];
-		size += tex_img.size();
-	}
-	/* Now we know how much memory we need, so we can allocate and fill. */
-	T *pixels = device_image->resize(size);
-	for(size_t slot = 0; slot < images[type].size(); slot++) {
-		if(images[type][slot] == NULL) {
-			continue;
-		}
-		device_vector<T>& tex_img = *cpu_textures[slot];
-		uint8_t options = pack_image_options(type, slot);
-		const int index = type_index_to_flattened_slot(slot, type) * 2;
-		info[index] = make_uint4(tex_img.data_width,
-		                         tex_img.data_height,
-		                         offset,
-		                         options);
-		info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
-		memcpy(pixels + offset,
-		       (void*)tex_img.data_pointer,
-		       tex_img.memory_size());
-		offset += tex_img.size();
-	}
-}
-
-void ImageManager::device_pack_images(Device *device,
-                                      DeviceScene *dscene,
-                                      Progress& /*progess*/)
-{
-	/* For OpenCL, we pack all image textures into a single large texture, and
-	 * do our own interpolation in the kernel.
-	 */
-
-	/* TODO(sergey): This will over-allocate a bit, but this is constant memory
-	 * so should be fine for a short term.
-	 */
-	const size_t info_size = max4(max_flattened_slot(IMAGE_DATA_TYPE_FLOAT4),
-	                              max_flattened_slot(IMAGE_DATA_TYPE_BYTE4),
-	                              max_flattened_slot(IMAGE_DATA_TYPE_FLOAT),
-	                              max_flattened_slot(IMAGE_DATA_TYPE_BYTE));
-	uint4 *info = dscene->tex_image_packed_info.resize(info_size*2);
-
-	/* Pack byte4 textures. */
-	device_pack_images_type(IMAGE_DATA_TYPE_BYTE4,
-	                        dscene->tex_byte4_image,
-	                        &dscene->tex_image_byte4_packed,
-	                        info);
-	/* Pack float4 textures. */
-	device_pack_images_type(IMAGE_DATA_TYPE_FLOAT4,
-	                        dscene->tex_float4_image,
-	                        &dscene->tex_image_float4_packed,
-	                        info);
-	/* Pack byte textures. */
-	device_pack_images_type(IMAGE_DATA_TYPE_BYTE,
-	                        dscene->tex_byte_image,
-	                        &dscene->tex_image_byte_packed,
-	                        info);
-	/* Pack float textures. */
-	device_pack_images_type(IMAGE_DATA_TYPE_FLOAT,
-	                        dscene->tex_float_image,
-	                        &dscene->tex_image_float_packed,
-	                        info);
-
-	/* Push textures to the device. */
-	if(dscene->tex_image_byte4_packed.size()) {
-		if(dscene->tex_image_byte4_packed.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_byte4_packed);
-		}
-		device->tex_alloc("__tex_image_byte4_packed", dscene->tex_image_byte4_packed);
-	}
-	if(dscene->tex_image_float4_packed.size()) {
-		if(dscene->tex_image_float4_packed.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_float4_packed);
-		}
-		device->tex_alloc("__tex_image_float4_packed", dscene->tex_image_float4_packed);
-	}
-	if(dscene->tex_image_byte_packed.size()) {
-		if(dscene->tex_image_byte_packed.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_byte_packed);
-		}
-		device->tex_alloc("__tex_image_byte_packed", dscene->tex_image_byte_packed);
-	}
-	if(dscene->tex_image_float_packed.size()) {
-		if(dscene->tex_image_float_packed.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_float_packed);
-		}
-		device->tex_alloc("__tex_image_float_packed", dscene->tex_image_float_packed);
-	}
-	if(dscene->tex_image_packed_info.size()) {
-		if(dscene->tex_image_packed_info.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_packed_info);
-		}
-		device->tex_alloc("__tex_image_packed_info", dscene->tex_image_packed_info);
-	}
-}
-
 void ImageManager::device_free_builtin(Device *device, DeviceScene *dscene)
 {
 	for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
@@ -1239,18 +1095,6 @@ void ImageManager::device_free(Device *device, DeviceScene *dscene)
 	dscene->tex_float_image.clear();
 	dscene->tex_byte_image.clear();
 	dscene->tex_half_image.clear();
-
-	device->tex_free(dscene->tex_image_float4_packed);
-	device->tex_free(dscene->tex_image_byte4_packed);
-	device->tex_free(dscene->tex_image_float_packed);
-	device->tex_free(dscene->tex_image_byte_packed);
-	device->tex_free(dscene->tex_image_packed_info);
-
-	dscene->tex_image_float4_packed.clear();
-	dscene->tex_image_byte4_packed.clear();
-	dscene->tex_image_float_packed.clear();
-	dscene->tex_image_byte_packed.clear();
-	dscene->tex_image_packed_info.clear();
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index db7e28a5e44..c86d1cbedbf 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -76,7 +76,6 @@ public:
 	void device_free_builtin(Device *device, DeviceScene *dscene);
 
 	void set_osl_texture_system(void *texture_system);
-	void set_pack_images(bool pack_images_);
 	bool set_animation_frame_update(int frame);
 
 	bool need_update;
@@ -130,7 +129,6 @@ private:
 
 	vector<Image*> images[IMAGE_DATA_NUM_TYPES];
 	void *osl_texture_system;
-	bool pack_images;
 
 	bool file_load_image_generic(Image *img,
 	                             ImageInput **in,
@@ -152,8 +150,6 @@ private:
 	int flattened_slot_to_type_index(int flat_slot, ImageDataType *type);
 	string name_from_type(int type);
 
-	uint8_t pack_image_options(ImageDataType type, size_t slot);
-
 	void device_load_image(Device *device,
 	                       DeviceScene *dscene,
 	                       Scene *scene,
@@ -164,17 +160,6 @@ private:
 	                       DeviceScene *dscene,
 	                       ImageDataType type,
 	                       int slot);
-
-	template<typename T>
-	void device_pack_images_type(
-	        ImageDataType type,
-	        const vector<device_vector<T>*>& cpu_textures,
-	        device_vector<T> *device_image,
-	        uint4 *info);
-
-	void device_pack_images(Device *device,
-	                        DeviceScene *dscene,
-	                        Progress& progess);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 03825f780e0..84537bf5993 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -1925,16 +1925,7 @@ void MeshManager::device_update_displacement_images(Device *device,
 					if(node->special_type != SHADER_SPECIAL_TYPE_IMAGE_SLOT) {
 						continue;
 					}
-					if(device->info.pack_images) {
-						/* If device requires packed images we need to update all
-						 * images now, even if they're not used for displacement.
-						 */
-						image_manager->device_update(device,
-						                             dscene,
-						                             scene,
-						                             progress);
-						return;
-					}
+
 					ImageSlotTextureNode *image_node = static_cast<ImageSlotTextureNode*>(node);
 					int slot = image_node->slot;
 					if(slot != -1) {
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 4db20338744..c59a5d97df5 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -148,8 +148,6 @@ void Scene::device_update(Device *device_, Progress& progress)
 	 * - Film needs light manager to run for use_light_visibility
 	 * - Lookup tables are done a second time to handle film tables
 	 */
-	
-	image_manager->set_pack_images(device->info.pack_images);
 
 	progress.set_status("Updating Shaders");
 	shader_manager->device_update(device, &dscene, this, progress);
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 4c2c4f5fcc3..0194327f567 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -121,13 +121,6 @@ public:
 	vector<device_vector<uchar>* > tex_byte_image;
 	vector<device_vector<half>* > tex_half_image;
 
-	/* opencl images */
-	device_vector<float4> tex_image_float4_packed;
-	device_vector<uchar4> tex_image_byte4_packed;
-	device_vector<float> tex_image_float_packed;
-	device_vector<uchar> tex_image_byte_packed;
-	device_vector<uint4> tex_image_packed_info;
-
 	KernelData data;
 };
 
-- 
cgit v1.2.3


From 12834fe0f00daf766e515102a3ff071423b66abe Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Tue, 8 Aug 2017 13:58:07 +0200
Subject: Update CUDA wrangler to latest version

Brings new declarations from toolkit version 8.0, also fixes some
pointers used in function declarations.
---
 extern/cuew/README.blender |   2 +-
 extern/cuew/include/cuew.h | 198 ++++++++++++++++++++++++++++++++++++++-------
 extern/cuew/src/cuew.c     |  25 ++++++
 3 files changed, 196 insertions(+), 29 deletions(-)

diff --git a/extern/cuew/README.blender b/extern/cuew/README.blender
index 7b77935d750..ef36c110e3f 100644
--- a/extern/cuew/README.blender
+++ b/extern/cuew/README.blender
@@ -1,5 +1,5 @@
 Project: Cuda Wrangler
 URL: https://github.com/CudaWrangler/cuew
 License: Apache 2.0
-Upstream version: 63d2a0f
+Upstream version: 3dd0b01
 Local modifications: None
diff --git a/extern/cuew/include/cuew.h b/extern/cuew/include/cuew.h
index 4cce29d38ab..c90ab39601a 100644
--- a/extern/cuew/include/cuew.h
+++ b/extern/cuew/include/cuew.h
@@ -27,7 +27,7 @@ extern "C" {
 #define CUEW_VERSION_MAJOR 1
 #define CUEW_VERSION_MINOR 2
 
-#define CUDA_VERSION 7050
+#define CUDA_VERSION 8000
 #define CU_IPC_HANDLE_SIZE 64
 #define CU_STREAM_LEGACY ((CUstream)0x1)
 #define CU_STREAM_PER_THREAD ((CUstream)0x2)
@@ -51,6 +51,8 @@ extern "C" {
 #define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
 #define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
 #define CU_PARAM_TR_DEFAULT -1
+#define CU_DEVICE_CPU ((CUdevice)-1)
+#define CU_DEVICE_INVALID ((CUdevice)-2)
 
 /* Functions which changed 3.1 -> 3.2 for 64 bit stuff,
  * the cuda library has both the old ones for compatibility and new
@@ -120,6 +122,45 @@ typedef unsigned long long CUdeviceptr;
 typedef unsigned int CUdeviceptr;
 #endif
 
+
+#ifdef _WIN32
+#  define CUDAAPI __stdcall
+#  define CUDA_CB __stdcall
+#else
+#  define CUDAAPI
+#  define CUDA_CB
+#endif
+
+typedef signed char int8_t;
+typedef short int int16_t;
+typedef int int32_t;
+typedef long int int64_t;
+typedef unsigned char uint8_t;
+typedef unsigned short int uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long int uint64_t;
+typedef signed char int_least8_t;
+typedef short int int_least16_t;
+typedef int int_least32_t;
+typedef long int int_least64_t;
+typedef unsigned char uint_least8_t;
+typedef unsigned short int uint_least16_t;
+typedef unsigned int uint_least32_t;
+typedef unsigned long int uint_least64_t;
+typedef signed char int_fast8_t;
+typedef long int int_fast16_t;
+typedef long int int_fast32_t;
+typedef long int int_fast64_t;
+typedef unsigned char uint_fast8_t;
+typedef unsigned long int uint_fast16_t;
+typedef unsigned long int uint_fast32_t;
+typedef unsigned long int uint_fast64_t;
+typedef long int intptr_t;
+typedef unsigned long int uintptr_t;
+typedef long int intmax_t;
+typedef unsigned long int uintmax_t;
+typedef uint32_t cuuint32_t;
+typedef uint64_t cuuint64_t;
 typedef int CUdevice;
 typedef struct CUctx_st* CUcontext;
 typedef struct CUmod_st* CUmodule;
@@ -180,6 +221,53 @@ typedef enum CUevent_flags_enum {
   CU_EVENT_INTERPROCESS = 0x4,
 } CUevent_flags;
 
+typedef enum CUstreamWaitValue_flags_enum {
+  CU_STREAM_WAIT_VALUE_GEQ = 0x0,
+  CU_STREAM_WAIT_VALUE_EQ = 0x1,
+  CU_STREAM_WAIT_VALUE_AND = 0x2,
+  CU_STREAM_WAIT_VALUE_FLUSH = (1 << 30),
+} CUstreamWaitValue_flags;
+
+typedef enum CUstreamWriteValue_flags_enum {
+  CU_STREAM_WRITE_VALUE_DEFAULT = 0x0,
+  CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1,
+} CUstreamWriteValue_flags;
+
+typedef enum CUstreamBatchMemOpType_enum {
+  CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1,
+  CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2,
+  CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3,
+} CUstreamBatchMemOpType;
+
+typedef union CUstreamBatchMemOpParams_union {
+  CUstreamBatchMemOpType operation;
+  struct CUstreamMemOpWaitValueParams_st {
+    CUstreamBatchMemOpType operation;
+    CUdeviceptr address;
+    union {
+      cuuint32_t value;
+      cuuint64_t pad;
+    };
+    unsigned int flags;
+    CUdeviceptr alias;
+  } waitValue;
+  struct CUstreamMemOpWriteValueParams_st {
+    CUstreamBatchMemOpType operation;
+    CUdeviceptr address;
+    union {
+      cuuint32_t value;
+      cuuint64_t pad;
+    };
+    unsigned int flags;
+    CUdeviceptr alias;
+  } writeValue;
+  struct CUstreamMemOpFlushRemoteWritesParams_st {
+    CUstreamBatchMemOpType operation;
+    unsigned int flags;
+  } flushRemoteWrites;
+  cuuint64_t pad[6];
+} CUstreamBatchMemOpParams;
+
 typedef enum CUoccupancy_flags_enum {
   CU_OCCUPANCY_DEFAULT = 0x0,
   CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1,
@@ -299,6 +387,12 @@ typedef enum CUdevice_attribute_enum {
   CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,
   CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84,
   CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,
+  CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,
+  CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,
+  CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,
+  CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,
+  CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,
+  CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91,
   CU_DEVICE_ATTRIBUTE_MAX,
 } CUdevice_attribute;
 
@@ -360,11 +454,26 @@ typedef enum CUmemorytype_enum {
 
 typedef enum CUcomputemode_enum {
   CU_COMPUTEMODE_DEFAULT = 0,
-  CU_COMPUTEMODE_EXCLUSIVE = 1,
   CU_COMPUTEMODE_PROHIBITED = 2,
   CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3,
 } CUcomputemode;
 
+typedef enum CUmem_advise_enum {
+  CU_MEM_ADVISE_SET_READ_MOSTLY = 1,
+  CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2,
+  CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3,
+  CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4,
+  CU_MEM_ADVISE_SET_ACCESSED_BY = 5,
+  CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6,
+} CUmem_advise;
+
+typedef enum CUmem_range_attribute_enum {
+  CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1,
+  CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2,
+  CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3,
+  CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4,
+} CUmem_range_attribute;
+
 typedef enum CUjit_option_enum {
   CU_JIT_MAX_REGISTERS = 0,
   CU_JIT_THREADS_PER_BLOCK,
@@ -381,6 +490,8 @@ typedef enum CUjit_option_enum {
   CU_JIT_LOG_VERBOSE,
   CU_JIT_GENERATE_LINE_INFO,
   CU_JIT_CACHE_MODE,
+  CU_JIT_NEW_SM3X_OPT,
+  CU_JIT_FAST_COMPILE,
   CU_JIT_NUM_OPTIONS,
 } CUjit_option;
 
@@ -397,6 +508,10 @@ typedef enum CUjit_target_enum {
   CU_TARGET_COMPUTE_37 = 37,
   CU_TARGET_COMPUTE_50 = 50,
   CU_TARGET_COMPUTE_52 = 52,
+  CU_TARGET_COMPUTE_53 = 53,
+  CU_TARGET_COMPUTE_60 = 60,
+  CU_TARGET_COMPUTE_61 = 61,
+  CU_TARGET_COMPUTE_62 = 62,
 } CUjit_target;
 
 typedef enum CUjit_fallback_enum {
@@ -490,6 +605,7 @@ typedef enum cudaError_enum {
   CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217,
   CUDA_ERROR_INVALID_PTX = 218,
   CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219,
+  CUDA_ERROR_NVLINK_UNCORRECTABLE = 220,
   CUDA_ERROR_INVALID_SOURCE = 300,
   CUDA_ERROR_FILE_NOT_FOUND = 301,
   CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
@@ -521,8 +637,14 @@ typedef enum cudaError_enum {
   CUDA_ERROR_UNKNOWN = 999,
 } CUresult;
 
-typedef void* CUstreamCallback;
-typedef size_t* CUoccupancyB2DSize;
+typedef enum CUdevice_P2PAttribute_enum {
+  CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01,
+  CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02,
+  CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03,
+} CUdevice_P2PAttribute;
+
+typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void* userData);
+typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize);
 
 typedef struct CUDA_MEMCPY2D_st {
   size_t srcXInBytes;
@@ -654,7 +776,8 @@ typedef struct CUDA_TEXTURE_DESC_st {
   float mipmapLevelBias;
   float minMipmapLevelClamp;
   float maxMipmapLevelClamp;
-  int reserved[16];
+  float borderColor[4];
+  int reserved[12];
 } CUDA_TEXTURE_DESC;
 
 typedef enum CUresourceViewFormat_enum {
@@ -736,21 +859,16 @@ typedef enum  {
   NVRTC_ERROR_INVALID_OPTION = 5,
   NVRTC_ERROR_COMPILATION = 6,
   NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+  NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+  NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+  NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
+  NVRTC_ERROR_INTERNAL_ERROR = 11,
 } nvrtcResult;
 
 typedef struct _nvrtcProgram* nvrtcProgram;
-
-#ifdef _WIN32
-#  define CUDAAPI __stdcall
-#  define CUDA_CB __stdcall
-#else
-#  define CUDAAPI
-#  define CUDA_CB
-#endif
-
 /* Function types. */
-typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char* pStr);
-typedef CUresult CUDAAPI tcuGetErrorName(CUresult error, const char* pStr);
+typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char** pStr);
+typedef CUresult CUDAAPI tcuGetErrorName(CUresult error, const char** pStr);
 typedef CUresult CUDAAPI tcuInit(unsigned int Flags);
 typedef CUresult CUDAAPI tcuDriverGetVersion(int* driverVersion);
 typedef CUresult CUDAAPI tcuDeviceGet(CUdevice* device, int ordinal);
@@ -786,26 +904,26 @@ typedef CUresult CUDAAPI tcuCtxAttach(CUcontext* pctx, unsigned int flags);
 typedef CUresult CUDAAPI tcuCtxDetach(CUcontext ctx);
 typedef CUresult CUDAAPI tcuModuleLoad(CUmodule* module, const char* fname);
 typedef CUresult CUDAAPI tcuModuleLoadData(CUmodule* module, const void* image);
-typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule* module, const void* image, unsigned int numOptions, CUjit_option* options, void* optionValues);
+typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule* module, const void* image, unsigned int numOptions, CUjit_option* options, void** optionValues);
 typedef CUresult CUDAAPI tcuModuleLoadFatBinary(CUmodule* module, const void* fatCubin);
 typedef CUresult CUDAAPI tcuModuleUnload(CUmodule hmod);
 typedef CUresult CUDAAPI tcuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name);
 typedef CUresult CUDAAPI tcuModuleGetGlobal_v2(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, const char* name);
 typedef CUresult CUDAAPI tcuModuleGetTexRef(CUtexref* pTexRef, CUmodule hmod, const char* name);
 typedef CUresult CUDAAPI tcuModuleGetSurfRef(CUsurfref* pSurfRef, CUmodule hmod, const char* name);
-typedef CUresult CUDAAPI tcuLinkCreate_v2(unsigned int numOptions, CUjit_option* options, void* optionValues, CUlinkState* stateOut);
-typedef CUresult CUDAAPI tcuLinkAddData_v2(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name, unsigned int numOptions, CUjit_option* options, void* optionValues);
-typedef CUresult CUDAAPI tcuLinkAddFile_v2(CUlinkState state, CUjitInputType type, const char* path, unsigned int numOptions, CUjit_option* options, void* optionValues);
-typedef CUresult CUDAAPI tcuLinkComplete(CUlinkState state, void* cubinOut, size_t* sizeOut);
+typedef CUresult CUDAAPI tcuLinkCreate_v2(unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut);
+typedef CUresult CUDAAPI tcuLinkAddData_v2(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name, unsigned int numOptions, CUjit_option* options, void** optionValues);
+typedef CUresult CUDAAPI tcuLinkAddFile_v2(CUlinkState state, CUjitInputType type, const char* path, unsigned int numOptions, CUjit_option* options, void** optionValues);
+typedef CUresult CUDAAPI tcuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut);
 typedef CUresult CUDAAPI tcuLinkDestroy(CUlinkState state);
 typedef CUresult CUDAAPI tcuMemGetInfo_v2(size_t* free, size_t* total);
 typedef CUresult CUDAAPI tcuMemAlloc_v2(CUdeviceptr* dptr, size_t bytesize);
 typedef CUresult CUDAAPI tcuMemAllocPitch_v2(CUdeviceptr* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
 typedef CUresult CUDAAPI tcuMemFree_v2(CUdeviceptr dptr);
 typedef CUresult CUDAAPI tcuMemGetAddressRange_v2(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr);
-typedef CUresult CUDAAPI tcuMemAllocHost_v2(void* pp, size_t bytesize);
+typedef CUresult CUDAAPI tcuMemAllocHost_v2(void** pp, size_t bytesize);
 typedef CUresult CUDAAPI tcuMemFreeHost(void* p);
-typedef CUresult CUDAAPI tcuMemHostAlloc(void* pp, size_t bytesize, unsigned int Flags);
+typedef CUresult CUDAAPI tcuMemHostAlloc(void** pp, size_t bytesize, unsigned int Flags);
 typedef CUresult CUDAAPI tcuMemHostGetDevicePointer_v2(CUdeviceptr* pdptr, void* p, unsigned int Flags);
 typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int* pFlags, void* p);
 typedef CUresult CUDAAPI tcuMemAllocManaged(CUdeviceptr* dptr, size_t bytesize, unsigned int flags);
@@ -863,8 +981,12 @@ typedef CUresult CUDAAPI tcuMipmappedArrayCreate(CUmipmappedArray* pHandle, cons
 typedef CUresult CUDAAPI tcuMipmappedArrayGetLevel(CUarray* pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
 typedef CUresult CUDAAPI tcuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
 typedef CUresult CUDAAPI tcuPointerGetAttribute(void* data, CUpointer_attribute attribute, CUdeviceptr ptr);
+typedef CUresult CUDAAPI tcuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device);
+typedef CUresult CUDAAPI tcuMemRangeGetAttribute(void* data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count);
+typedef CUresult CUDAAPI tcuMemRangeGetAttributes(void** data, size_t* dataSizes, CUmem_range_attribute* attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count);
 typedef CUresult CUDAAPI tcuPointerSetAttribute(const void* value, CUpointer_attribute attribute, CUdeviceptr ptr);
-typedef CUresult CUDAAPI tcuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute* attributes, void* data, CUdeviceptr ptr);
+typedef CUresult CUDAAPI tcuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute* attributes, void** data, CUdeviceptr ptr);
 typedef CUresult CUDAAPI tcuStreamCreate(CUstream* phStream, unsigned int Flags);
 typedef CUresult CUDAAPI tcuStreamCreateWithPriority(CUstream* phStream, unsigned int flags, int priority);
 typedef CUresult CUDAAPI tcuStreamGetPriority(CUstream hStream, int* priority);
@@ -881,10 +1003,13 @@ typedef CUresult CUDAAPI tcuEventQuery(CUevent hEvent);
 typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent);
 typedef CUresult CUDAAPI tcuEventDestroy_v2(CUevent hEvent);
 typedef CUresult CUDAAPI tcuEventElapsedTime(float* pMilliseconds, CUevent hStart, CUevent hEnd);
+typedef CUresult CUDAAPI tcuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+typedef CUresult CUDAAPI tcuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+typedef CUresult CUDAAPI tcuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams* paramArray, unsigned int flags);
 typedef CUresult CUDAAPI tcuFuncGetAttribute(int* pi, CUfunction_attribute attrib, CUfunction hfunc);
 typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
 typedef CUresult CUDAAPI tcuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);
-typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void* kernelParams, void* extra);
+typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams, void** extra);
 typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
 typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
 typedef CUresult CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes);
@@ -910,6 +1035,7 @@ typedef CUresult CUDAAPI tcuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter
 typedef CUresult CUDAAPI tcuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias);
 typedef CUresult CUDAAPI tcuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
 typedef CUresult CUDAAPI tcuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso);
+typedef CUresult CUDAAPI tcuTexRefSetBorderColor(CUtexref hTexRef, float* pBorderColor);
 typedef CUresult CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
 typedef CUresult CUDAAPI tcuTexRefGetAddress_v2(CUdeviceptr* pdptr, CUtexref hTexRef);
 typedef CUresult CUDAAPI tcuTexRefGetArray(CUarray* phArray, CUtexref hTexRef);
@@ -921,6 +1047,7 @@ typedef CUresult CUDAAPI tcuTexRefGetMipmapFilterMode(CUfilter_mode* pfm, CUtexr
 typedef CUresult CUDAAPI tcuTexRefGetMipmapLevelBias(float* pbias, CUtexref hTexRef);
 typedef CUresult CUDAAPI tcuTexRefGetMipmapLevelClamp(float* pminMipmapLevelClamp, float* pmaxMipmapLevelClamp, CUtexref hTexRef);
 typedef CUresult CUDAAPI tcuTexRefGetMaxAnisotropy(int* pmaxAniso, CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuTexRefGetBorderColor(float* pBorderColor, CUtexref hTexRef);
 typedef CUresult CUDAAPI tcuTexRefGetFlags(unsigned int* pFlags, CUtexref hTexRef);
 typedef CUresult CUDAAPI tcuTexRefCreate(CUtexref* pTexRef);
 typedef CUresult CUDAAPI tcuTexRefDestroy(CUtexref hTexRef);
@@ -935,6 +1062,7 @@ typedef CUresult CUDAAPI tcuSurfObjectCreate(CUsurfObject* pSurfObject, const CU
 typedef CUresult CUDAAPI tcuSurfObjectDestroy(CUsurfObject surfObject);
 typedef CUresult CUDAAPI tcuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC* pResDesc, CUsurfObject surfObject);
 typedef CUresult CUDAAPI tcuDeviceCanAccessPeer(int* canAccessPeer, CUdevice dev, CUdevice peerDev);
+typedef CUresult CUDAAPI tcuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice);
 typedef CUresult CUDAAPI tcuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags);
 typedef CUresult CUDAAPI tcuCtxDisablePeerAccess(CUcontext peerContext);
 typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
@@ -944,7 +1072,7 @@ typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer_v2(CUdeviceptr* pDe
 typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags_v2(CUgraphicsResource resource, unsigned int flags);
 typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream);
 typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream);
-typedef CUresult CUDAAPI tcuGetExportTable(const void* ppExportTable, const CUuuid* pExportTableId);
+typedef CUresult CUDAAPI tcuGetExportTable(const void** ppExportTable, const CUuuid* pExportTableId);
 
 typedef CUresult CUDAAPI tcuGraphicsGLRegisterBuffer(CUgraphicsResource* pCudaResource, GLuint buffer, unsigned int Flags);
 typedef CUresult CUDAAPI tcuGraphicsGLRegisterImage(CUgraphicsResource* pCudaResource, GLuint image, GLenum target, unsigned int Flags);
@@ -961,13 +1089,15 @@ typedef CUresult CUDAAPI tcuGLUnmapBufferObjectAsync(GLuint buffer, CUstream hSt
 
 typedef const char* CUDAAPI tnvrtcGetErrorString(nvrtcResult result);
 typedef nvrtcResult CUDAAPI tnvrtcVersion(int* major, int* minor);
-typedef nvrtcResult CUDAAPI tnvrtcCreateProgram(nvrtcProgram* prog, const char* src, const char* name, int numHeaders, const char* headers, const char* includeNames);
+typedef nvrtcResult CUDAAPI tnvrtcCreateProgram(nvrtcProgram* prog, const char* src, const char* name, int numHeaders, const char** headers, const char** includeNames);
 typedef nvrtcResult CUDAAPI tnvrtcDestroyProgram(nvrtcProgram* prog);
-typedef nvrtcResult CUDAAPI tnvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char* options);
+typedef nvrtcResult CUDAAPI tnvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char** options);
 typedef nvrtcResult CUDAAPI tnvrtcGetPTXSize(nvrtcProgram prog, size_t* ptxSizeRet);
 typedef nvrtcResult CUDAAPI tnvrtcGetPTX(nvrtcProgram prog, char* ptx);
 typedef nvrtcResult CUDAAPI tnvrtcGetProgramLogSize(nvrtcProgram prog, size_t* logSizeRet);
 typedef nvrtcResult CUDAAPI tnvrtcGetProgramLog(nvrtcProgram prog, char* log);
+typedef nvrtcResult CUDAAPI tnvrtcAddNameExpression(nvrtcProgram prog, const char* name_expression);
+typedef nvrtcResult CUDAAPI tnvrtcGetLoweredName(nvrtcProgram prog, const char* name_expression, const char** lowered_name);
 
 
 /* Function declarations. */
@@ -1085,6 +1215,10 @@ extern tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
 extern tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
 extern tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;
 extern tcuPointerGetAttribute *cuPointerGetAttribute;
+extern tcuMemPrefetchAsync *cuMemPrefetchAsync;
+extern tcuMemAdvise *cuMemAdvise;
+extern tcuMemRangeGetAttribute *cuMemRangeGetAttribute;
+extern tcuMemRangeGetAttributes *cuMemRangeGetAttributes;
 extern tcuPointerSetAttribute *cuPointerSetAttribute;
 extern tcuPointerGetAttributes *cuPointerGetAttributes;
 extern tcuStreamCreate *cuStreamCreate;
@@ -1103,6 +1237,9 @@ extern tcuEventQuery *cuEventQuery;
 extern tcuEventSynchronize *cuEventSynchronize;
 extern tcuEventDestroy_v2 *cuEventDestroy_v2;
 extern tcuEventElapsedTime *cuEventElapsedTime;
+extern tcuStreamWaitValue32 *cuStreamWaitValue32;
+extern tcuStreamWriteValue32 *cuStreamWriteValue32;
+extern tcuStreamBatchMemOp *cuStreamBatchMemOp;
 extern tcuFuncGetAttribute *cuFuncGetAttribute;
 extern tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
 extern tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
@@ -1132,6 +1269,7 @@ extern tcuTexRefSetMipmapFilterMode *cuTexRefSetMipmapFilterMode;
 extern tcuTexRefSetMipmapLevelBias *cuTexRefSetMipmapLevelBias;
 extern tcuTexRefSetMipmapLevelClamp *cuTexRefSetMipmapLevelClamp;
 extern tcuTexRefSetMaxAnisotropy *cuTexRefSetMaxAnisotropy;
+extern tcuTexRefSetBorderColor *cuTexRefSetBorderColor;
 extern tcuTexRefSetFlags *cuTexRefSetFlags;
 extern tcuTexRefGetAddress_v2 *cuTexRefGetAddress_v2;
 extern tcuTexRefGetArray *cuTexRefGetArray;
@@ -1143,6 +1281,7 @@ extern tcuTexRefGetMipmapFilterMode *cuTexRefGetMipmapFilterMode;
 extern tcuTexRefGetMipmapLevelBias *cuTexRefGetMipmapLevelBias;
 extern tcuTexRefGetMipmapLevelClamp *cuTexRefGetMipmapLevelClamp;
 extern tcuTexRefGetMaxAnisotropy *cuTexRefGetMaxAnisotropy;
+extern tcuTexRefGetBorderColor *cuTexRefGetBorderColor;
 extern tcuTexRefGetFlags *cuTexRefGetFlags;
 extern tcuTexRefCreate *cuTexRefCreate;
 extern tcuTexRefDestroy *cuTexRefDestroy;
@@ -1157,6 +1296,7 @@ extern tcuSurfObjectCreate *cuSurfObjectCreate;
 extern tcuSurfObjectDestroy *cuSurfObjectDestroy;
 extern tcuSurfObjectGetResourceDesc *cuSurfObjectGetResourceDesc;
 extern tcuDeviceCanAccessPeer *cuDeviceCanAccessPeer;
+extern tcuDeviceGetP2PAttribute *cuDeviceGetP2PAttribute;
 extern tcuCtxEnablePeerAccess *cuCtxEnablePeerAccess;
 extern tcuCtxDisablePeerAccess *cuCtxDisablePeerAccess;
 extern tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
@@ -1190,6 +1330,8 @@ extern tnvrtcGetPTXSize *nvrtcGetPTXSize;
 extern tnvrtcGetPTX *nvrtcGetPTX;
 extern tnvrtcGetProgramLogSize *nvrtcGetProgramLogSize;
 extern tnvrtcGetProgramLog *nvrtcGetProgramLog;
+extern tnvrtcAddNameExpression *nvrtcAddNameExpression;
+extern tnvrtcGetLoweredName *nvrtcGetLoweredName;
 
 
 enum {
diff --git a/extern/cuew/src/cuew.c b/extern/cuew/src/cuew.c
index c96ea2c1959..962059bfcce 100644
--- a/extern/cuew/src/cuew.c
+++ b/extern/cuew/src/cuew.c
@@ -184,6 +184,10 @@ tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
 tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
 tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;
 tcuPointerGetAttribute *cuPointerGetAttribute;
+tcuMemPrefetchAsync *cuMemPrefetchAsync;
+tcuMemAdvise *cuMemAdvise;
+tcuMemRangeGetAttribute *cuMemRangeGetAttribute;
+tcuMemRangeGetAttributes *cuMemRangeGetAttributes;
 tcuPointerSetAttribute *cuPointerSetAttribute;
 tcuPointerGetAttributes *cuPointerGetAttributes;
 tcuStreamCreate *cuStreamCreate;
@@ -202,6 +206,9 @@ tcuEventQuery *cuEventQuery;
 tcuEventSynchronize *cuEventSynchronize;
 tcuEventDestroy_v2 *cuEventDestroy_v2;
 tcuEventElapsedTime *cuEventElapsedTime;
+tcuStreamWaitValue32 *cuStreamWaitValue32;
+tcuStreamWriteValue32 *cuStreamWriteValue32;
+tcuStreamBatchMemOp *cuStreamBatchMemOp;
 tcuFuncGetAttribute *cuFuncGetAttribute;
 tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
 tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
@@ -231,6 +238,7 @@ tcuTexRefSetMipmapFilterMode *cuTexRefSetMipmapFilterMode;
 tcuTexRefSetMipmapLevelBias *cuTexRefSetMipmapLevelBias;
 tcuTexRefSetMipmapLevelClamp *cuTexRefSetMipmapLevelClamp;
 tcuTexRefSetMaxAnisotropy *cuTexRefSetMaxAnisotropy;
+tcuTexRefSetBorderColor *cuTexRefSetBorderColor;
 tcuTexRefSetFlags *cuTexRefSetFlags;
 tcuTexRefGetAddress_v2 *cuTexRefGetAddress_v2;
 tcuTexRefGetArray *cuTexRefGetArray;
@@ -242,6 +250,7 @@ tcuTexRefGetMipmapFilterMode *cuTexRefGetMipmapFilterMode;
 tcuTexRefGetMipmapLevelBias *cuTexRefGetMipmapLevelBias;
 tcuTexRefGetMipmapLevelClamp *cuTexRefGetMipmapLevelClamp;
 tcuTexRefGetMaxAnisotropy *cuTexRefGetMaxAnisotropy;
+tcuTexRefGetBorderColor *cuTexRefGetBorderColor;
 tcuTexRefGetFlags *cuTexRefGetFlags;
 tcuTexRefCreate *cuTexRefCreate;
 tcuTexRefDestroy *cuTexRefDestroy;
@@ -256,6 +265,7 @@ tcuSurfObjectCreate *cuSurfObjectCreate;
 tcuSurfObjectDestroy *cuSurfObjectDestroy;
 tcuSurfObjectGetResourceDesc *cuSurfObjectGetResourceDesc;
 tcuDeviceCanAccessPeer *cuDeviceCanAccessPeer;
+tcuDeviceGetP2PAttribute *cuDeviceGetP2PAttribute;
 tcuCtxEnablePeerAccess *cuCtxEnablePeerAccess;
 tcuCtxDisablePeerAccess *cuCtxDisablePeerAccess;
 tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
@@ -289,6 +299,8 @@ tnvrtcGetPTXSize *nvrtcGetPTXSize;
 tnvrtcGetPTX *nvrtcGetPTX;
 tnvrtcGetProgramLogSize *nvrtcGetProgramLogSize;
 tnvrtcGetProgramLog *nvrtcGetProgramLog;
+tnvrtcAddNameExpression *nvrtcAddNameExpression;
+tnvrtcGetLoweredName *nvrtcGetLoweredName;
 
 
 static DynamicLibrary dynamic_library_open_find(const char **paths) {
@@ -486,6 +498,10 @@ int cuewInit(void) {
   CUDA_LIBRARY_FIND(cuMipmappedArrayGetLevel);
   CUDA_LIBRARY_FIND(cuMipmappedArrayDestroy);
   CUDA_LIBRARY_FIND(cuPointerGetAttribute);
+  CUDA_LIBRARY_FIND(cuMemPrefetchAsync);
+  CUDA_LIBRARY_FIND(cuMemAdvise);
+  CUDA_LIBRARY_FIND(cuMemRangeGetAttribute);
+  CUDA_LIBRARY_FIND(cuMemRangeGetAttributes);
   CUDA_LIBRARY_FIND(cuPointerSetAttribute);
   CUDA_LIBRARY_FIND(cuPointerGetAttributes);
   CUDA_LIBRARY_FIND(cuStreamCreate);
@@ -504,6 +520,9 @@ int cuewInit(void) {
   CUDA_LIBRARY_FIND(cuEventSynchronize);
   CUDA_LIBRARY_FIND(cuEventDestroy_v2);
   CUDA_LIBRARY_FIND(cuEventElapsedTime);
+  CUDA_LIBRARY_FIND(cuStreamWaitValue32);
+  CUDA_LIBRARY_FIND(cuStreamWriteValue32);
+  CUDA_LIBRARY_FIND(cuStreamBatchMemOp);
   CUDA_LIBRARY_FIND(cuFuncGetAttribute);
   CUDA_LIBRARY_FIND(cuFuncSetCacheConfig);
   CUDA_LIBRARY_FIND(cuFuncSetSharedMemConfig);
@@ -533,6 +552,7 @@ int cuewInit(void) {
   CUDA_LIBRARY_FIND(cuTexRefSetMipmapLevelBias);
   CUDA_LIBRARY_FIND(cuTexRefSetMipmapLevelClamp);
   CUDA_LIBRARY_FIND(cuTexRefSetMaxAnisotropy);
+  CUDA_LIBRARY_FIND(cuTexRefSetBorderColor);
   CUDA_LIBRARY_FIND(cuTexRefSetFlags);
   CUDA_LIBRARY_FIND(cuTexRefGetAddress_v2);
   CUDA_LIBRARY_FIND(cuTexRefGetArray);
@@ -544,6 +564,7 @@ int cuewInit(void) {
   CUDA_LIBRARY_FIND(cuTexRefGetMipmapLevelBias);
   CUDA_LIBRARY_FIND(cuTexRefGetMipmapLevelClamp);
   CUDA_LIBRARY_FIND(cuTexRefGetMaxAnisotropy);
+  CUDA_LIBRARY_FIND(cuTexRefGetBorderColor);
   CUDA_LIBRARY_FIND(cuTexRefGetFlags);
   CUDA_LIBRARY_FIND(cuTexRefCreate);
   CUDA_LIBRARY_FIND(cuTexRefDestroy);
@@ -558,6 +579,7 @@ int cuewInit(void) {
   CUDA_LIBRARY_FIND(cuSurfObjectDestroy);
   CUDA_LIBRARY_FIND(cuSurfObjectGetResourceDesc);
   CUDA_LIBRARY_FIND(cuDeviceCanAccessPeer);
+  CUDA_LIBRARY_FIND(cuDeviceGetP2PAttribute);
   CUDA_LIBRARY_FIND(cuCtxEnablePeerAccess);
   CUDA_LIBRARY_FIND(cuCtxDisablePeerAccess);
   CUDA_LIBRARY_FIND(cuGraphicsUnregisterResource);
@@ -593,6 +615,8 @@ int cuewInit(void) {
     NVRTC_LIBRARY_FIND(nvrtcGetPTX);
     NVRTC_LIBRARY_FIND(nvrtcGetProgramLogSize);
     NVRTC_LIBRARY_FIND(nvrtcGetProgramLog);
+    NVRTC_LIBRARY_FIND(nvrtcAddNameExpression);
+    NVRTC_LIBRARY_FIND(nvrtcGetLoweredName);
   }
 
   result = CUEW_SUCCESS;
@@ -630,6 +654,7 @@ const char *cuewErrorString(CUresult result) {
     case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: return "Peer access unsupported";
     case CUDA_ERROR_INVALID_PTX: return "Invalid ptx";
     case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: return "Invalid graphics context";
+    case CUDA_ERROR_NVLINK_UNCORRECTABLE: return "Nvlink uncorrectable";
     case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
     case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";
     case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve";
-- 
cgit v1.2.3


From c8a8589d4f1d38fa20d8fe5d4e006bb8d7770bbc Mon Sep 17 00:00:00 2001
From: Bastien Montagne <montagne29@wanadoo.fr>
Date: Tue, 8 Aug 2017 14:06:56 +0200
Subject: Fix T52263: Crash When Splitting and Merging Areas with Header Text
 Set.

Not a regression, but safe enough to be included in 2.79.
---
 source/blender/blenkernel/intern/screen.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/source/blender/blenkernel/intern/screen.c b/source/blender/blenkernel/intern/screen.c
index 857bd5447c8..df47b89fadc 100644
--- a/source/blender/blenkernel/intern/screen.c
+++ b/source/blender/blenkernel/intern/screen.c
@@ -181,6 +181,7 @@ ARegion *BKE_area_region_copy(SpaceType *st, ARegion *ar)
 	BLI_listbase_clear(&newar->ui_lists);
 	newar->swinid = 0;
 	newar->regiontimer = NULL;
+	newar->headerstr = NULL;
 	
 	/* use optional regiondata callback */
 	if (ar->regiondata) {
-- 
cgit v1.2.3


From d85af2aa3f70ae3b95d10931323a916bfa1b8ebe Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Tue, 8 Aug 2017 14:26:46 +0200
Subject: Update CUEW to latest version

Previous update pulled too much of system-wide typedefs.
---
 extern/cuew/README.blender |  2 +-
 extern/cuew/include/cuew.h | 39 +++++++++------------------------------
 2 files changed, 10 insertions(+), 31 deletions(-)

diff --git a/extern/cuew/README.blender b/extern/cuew/README.blender
index ef36c110e3f..a53a927c25f 100644
--- a/extern/cuew/README.blender
+++ b/extern/cuew/README.blender
@@ -1,5 +1,5 @@
 Project: Cuda Wrangler
 URL: https://github.com/CudaWrangler/cuew
 License: Apache 2.0
-Upstream version: 3dd0b01
+Upstream version: cbf465b
 Local modifications: None
diff --git a/extern/cuew/include/cuew.h b/extern/cuew/include/cuew.h
index c90ab39601a..0eace96bc3f 100644
--- a/extern/cuew/include/cuew.h
+++ b/extern/cuew/include/cuew.h
@@ -116,6 +116,15 @@ extern "C" {
 #define cuGLGetDevices cuGLGetDevices_v2
 
 /* Types. */
+#ifdef _MSC_VER
+typedef unsigned __int32 cuuint32_t;
+typedef unsigned __int64 cuuint64_t;
+#else
+#include <stdint.h>
+typedef uint32_t cuuint32_t;
+typedef uint64_t cuuint64_t;
+#endif
+
 #if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined (__aarch64__)
 typedef unsigned long long CUdeviceptr;
 #else
@@ -131,36 +140,6 @@ typedef unsigned int CUdeviceptr;
 #  define CUDA_CB
 #endif
 
-typedef signed char int8_t;
-typedef short int int16_t;
-typedef int int32_t;
-typedef long int int64_t;
-typedef unsigned char uint8_t;
-typedef unsigned short int uint16_t;
-typedef unsigned int uint32_t;
-typedef unsigned long int uint64_t;
-typedef signed char int_least8_t;
-typedef short int int_least16_t;
-typedef int int_least32_t;
-typedef long int int_least64_t;
-typedef unsigned char uint_least8_t;
-typedef unsigned short int uint_least16_t;
-typedef unsigned int uint_least32_t;
-typedef unsigned long int uint_least64_t;
-typedef signed char int_fast8_t;
-typedef long int int_fast16_t;
-typedef long int int_fast32_t;
-typedef long int int_fast64_t;
-typedef unsigned char uint_fast8_t;
-typedef unsigned long int uint_fast16_t;
-typedef unsigned long int uint_fast32_t;
-typedef unsigned long int uint_fast64_t;
-typedef long int intptr_t;
-typedef unsigned long int uintptr_t;
-typedef long int intmax_t;
-typedef unsigned long int uintmax_t;
-typedef uint32_t cuuint32_t;
-typedef uint64_t cuuint64_t;
 typedef int CUdevice;
 typedef struct CUctx_st* CUcontext;
 typedef struct CUmod_st* CUmodule;
-- 
cgit v1.2.3


From ed4707be470b61edef3a14c239838f91d8624269 Mon Sep 17 00:00:00 2001
From: Aleksandr Zinovev <roaoao@gmail.com>
Date: Tue, 8 Aug 2017 15:52:14 +0300
Subject: Fix width estimation for empty layouts in pie menus

---
 source/blender/editors/interface/interface_layout.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/source/blender/editors/interface/interface_layout.c b/source/blender/editors/interface/interface_layout.c
index c57faf97188..dc2b97acf65 100644
--- a/source/blender/editors/interface/interface_layout.c
+++ b/source/blender/editors/interface/interface_layout.c
@@ -3137,8 +3137,11 @@ static void ui_item_estimate(uiItem *item)
 		for (subitem = litem->items.first; subitem; subitem = subitem->next)
 			ui_item_estimate(subitem);
 
-		if (BLI_listbase_is_empty(&litem->items))
+		if (BLI_listbase_is_empty(&litem->items)) {
+			litem->w = 0;
+			litem->h = 0;
 			return;
+		}
 
 		if (litem->scale[0] != 0.0f || litem->scale[1] != 0.0f)
 			ui_item_scale(litem, litem->scale);
-- 
cgit v1.2.3


From 01ee88563b64e8f381bbbe9f54894ad27541f050 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sybren=20A=2E=20St=C3=BCvel?= <sybren@stuvel.eu>
Date: Tue, 8 Aug 2017 15:13:54 +0200
Subject: Fix T46329: scene_update_{pre,post} doc needs clarification

The documentation for the bpy.app.handlers.scene_update_{pre,post}
handlers states that they're called "on updating the scenes data".
However, they're called even when the data hasn't changed. Of course
such handlers are useful, but the documentation should reflect the
current behaviour.

Reviewers: mont29, sergey

Subscribers: Blendify

Maniphest Tasks: T46329

Differential Revision: https://developer.blender.org/D1535
---
 source/blender/python/intern/bpy_app_handlers.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/source/blender/python/intern/bpy_app_handlers.c b/source/blender/python/intern/bpy_app_handlers.c
index fdc2371c259..90aa22de5bf 100644
--- a/source/blender/python/intern/bpy_app_handlers.c
+++ b/source/blender/python/intern/bpy_app_handlers.c
@@ -59,8 +59,12 @@ static PyStructSequence_Field app_cb_info_fields[] = {
 	{(char *)"load_post",         (char *)"on loading a new blend file (after)"},
 	{(char *)"save_pre",          (char *)"on saving a blend file (before)"},
 	{(char *)"save_post",         (char *)"on saving a blend file (after)"},
-	{(char *)"scene_update_pre",  (char *)"on updating the scenes data (before)"},
-	{(char *)"scene_update_post", (char *)"on updating the scenes data (after)"},
+	{(char *)"scene_update_pre",  (char *)"on every scene data update. Does not imply that anything changed in the "
+                                          "scene, just that the dependency graph is about to be reevaluated, and the "
+                                          "scene is about to be updated by Blender's animation system."},
+	{(char *)"scene_update_post",  (char *)"on every scene data update. Does not imply that anything changed in the "
+                                           "scene, just that the dependency graph was reevaluated, and the scene was "
+                                           "possibly updated by Blender's animation system."},
 	{(char *)"game_pre",          (char *)"on starting the game engine"},
 	{(char *)"game_post",         (char *)"on ending the game engine"},
 	{(char *)"version_update",    (char *)"on ending the versioning code"},
-- 
cgit v1.2.3


From fd397a7d28666f1c3ee9cd85de5760825d62d28e Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Tue, 8 Aug 2017 14:34:59 +0200
Subject: Cycles: Add utility macro ccl_ref

It is defined to & for CPU side compilation, and defined to an empty for any GPU
platform. The idea here is to use this macro instead of #ifdef block with bunch
of duplicated lines just to make it so CPU code is efficient.

Eventually we might switch to references on CUDA as well, but that would require
some intensive testing.
---
 intern/cycles/kernel/kernel_compat_cuda.h   | 4 ++++
 intern/cycles/kernel/kernel_compat_opencl.h | 1 +
 intern/cycles/util/util_defines.h           | 1 +
 3 files changed, 6 insertions(+)

diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 38708f7ff0b..6d1cf055f2c 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -53,6 +53,10 @@
 #define ccl_may_alias
 #define ccl_addr_space
 #define ccl_restrict __restrict__
+/* TODO(sergey): In theory we might use references with CUDA, however
+ * performance impact yet to be investigated.
+ */
+#define ccl_ref
 #define ccl_align(n) __align__(n)
 
 #define ATTR_FALLTHROUGH
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 21eba971688..36d6031d042 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -42,6 +42,7 @@
 #define ccl_local_param __local
 #define ccl_private __private
 #define ccl_restrict restrict
+#define ccl_ref
 #define ccl_align(n) __attribute__((aligned(n)))
 
 #ifdef __SPLIT_KERNEL__
diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h
index d0d87e74332..ae654092c87 100644
--- a/intern/cycles/util/util_defines.h
+++ b/intern/cycles/util/util_defines.h
@@ -35,6 +35,7 @@
 #  define ccl_local_param
 #  define ccl_private
 #  define ccl_restrict __restrict
+#  define ccl_ref &
 #  define __KERNEL_WITH_SSE_ALIGN__
 
 #  if defined(_WIN32) && !defined(FREE_WINDOWS)
-- 
cgit v1.2.3


From 19d19add1eb8697fef24171d1170bdce98a5e6cd Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Tue, 8 Aug 2017 14:39:00 +0200
Subject: Cycles: Cleanup, de-duplicate function parameter list

Was only needed to sue const reference on CPU. Now it is done using ccl_ref.
---
 intern/cycles/kernel/geom/geom_curve_intersect.h | 42 ++++++++----------------
 1 file changed, 14 insertions(+), 28 deletions(-)

diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
index 8378e002506..e9a149ea1ab 100644
--- a/intern/cycles/kernel/geom/geom_curve_intersect.h
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -31,34 +31,20 @@ ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
 }
 #endif
 
-#ifdef __KERNEL_SSE2__
-/* Pass P and dir by reference to aligned vector */
-ccl_device_curveintersect bool cardinal_curve_intersect(KernelGlobals *kg,
-                                                        Intersection *isect,
-                                                        const float3 &P,
-                                                        const float3 &dir,
-                                                        uint visibility,
-                                                        int object,
-                                                        int curveAddr,
-                                                        float time,
-                                                        int type,
-                                                        uint *lcg_state,
-                                                        float difl,
-                                                        float extmax)
-#else
-ccl_device_curveintersect bool cardinal_curve_intersect(KernelGlobals *kg,
-                                                        Intersection *isect,
-                                                        float3 P,
-                                                        float3 dir,
-                                                        uint visibility,
-                                                        int object,
-                                                        int curveAddr,
-                                                        float time,
-                                                        int type,
-                                                        uint *lcg_state,
-                                                        float difl,
-                                                        float extmax)
-#endif
+/* On CPU pass P and dir by reference to aligned vector. */
+ccl_device_curveintersect bool cardinal_curve_intersect(
+        KernelGlobals *kg,
+        Intersection *isect,
+        const float3 ccl_ref P,
+        const float3 ccl_ref dir,
+        uint visibility,
+        int object,
+        int curveAddr,
+        float time,
+        int type,
+        uint *lcg_state,
+        float difl,
+        float extmax)
 {
 	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
 
-- 
cgit v1.2.3


From 126830b56cbb8e270c3bcffd14ee9d1ef2cecb0d Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Tue, 8 Aug 2017 15:24:35 +0200
Subject: Depsgraph: Cleanup, make it easier to debug on laptop

---
 .../intern/builder/deg_builder_relations.cc        | 145 +++++++++++++--------
 1 file changed, 92 insertions(+), 53 deletions(-)

diff --git a/source/blender/depsgraph/intern/builder/deg_builder_relations.cc b/source/blender/depsgraph/intern/builder/deg_builder_relations.cc
index a1abcb96411..0726035b04e 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder_relations.cc
+++ b/source/blender/depsgraph/intern/builder/deg_builder_relations.cc
@@ -927,16 +927,20 @@ void DepsgraphRelationBuilder::build_driver(ID *id, FCurve *fcu)
 
 	const char *rna_path = fcu->rna_path ? fcu->rna_path : "";
 
-	/* create dependency between driver and data affected by it */
+	/* Create dependency between driver and data affected by it. */
 	/* - direct property relationship... */
 	//RNAPathKey affected_key(id, fcu->rna_path);
 	//add_relation(driver_key, affected_key, "[Driver -> Data] DepsRel");
 
-	/* driver -> data components (for interleaved evaluation - bones/constraints/modifiers) */
-	// XXX: this probably should probably be moved out into a separate function
+	/* Driver -> data components (for interleaved evaluation
+	 * bones/constraints/modifiers).
+	 */
+	// XXX: this probably should probably be moved out into a separate function.
 	if (strstr(rna_path, "pose.bones[") != NULL) {
 		/* interleaved drivers during bone eval */
-		// TODO: ideally, if this is for a constraint, it goes to said constraint
+		/* TODO: ideally, if this is for a constraint, it goes to said
+		 * constraint.
+		 */
 		Object *ob = (Object *)id;
 		char *bone_name;
 
@@ -949,7 +953,10 @@ void DepsgraphRelationBuilder::build_driver(ID *id, FCurve *fcu)
 		}
 
 		if (pchan) {
-			OperationKey bone_key(id, DEG_NODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_LOCAL);
+			OperationKey bone_key(id,
+			                      DEG_NODE_TYPE_BONE,
+			                      pchan->name,
+			                      DEG_OPCODE_BONE_LOCAL);
 			add_relation(driver_key, bone_key, "[Driver -> Bone]");
 		}
 		else {
@@ -959,30 +966,36 @@ void DepsgraphRelationBuilder::build_driver(ID *id, FCurve *fcu)
 		}
 	}
 	else if (GS(id->name) == ID_AR && strstr(rna_path, "bones[")) {
-		/* drivers on armature-level bone settings (i.e. bbone stuff),
-		 * which will affect the evaluation of corresponding pose bones
+		/* Drivers on armature-level bone settings (i.e. bbone stuff),
+		 * which will affect the evaluation of corresponding pose bones.
 		 */
 		IDDepsNode *arm_node = m_graph->find_id_node(id);
 		char *bone_name = BLI_str_quoted_substrN(rna_path, "bones[");
 
 		if (arm_node && bone_name) {
-			/* find objects which use this, and make their eval callbacks depend on this */
+			/* Find objects which use this, and make their eval callbacks
+			 * depend on this.
+			 */
 			foreach (DepsRelation *rel, arm_node->outlinks) {
 				IDDepsNode *to_node = (IDDepsNode *)rel->to;
-
-				/* we only care about objects with pose data which use this... */
+				/* We only care about objects with pose data which use this. */
 				if (GS(to_node->id->name) == ID_OB) {
 					Object *ob = (Object *)to_node->id;
-					bPoseChannel *pchan = BKE_pose_channel_find_name(ob->pose, bone_name); // NOTE: ob->pose may be NULL
-
-					if (pchan) {
-						OperationKey bone_key(&ob->id, DEG_NODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_LOCAL);
-						add_relation(driver_key, bone_key, "[Arm Bone -> Driver -> Bone]");
+					/* NOTE: ob->pose may be NULL. */
+					bPoseChannel *pchan = BKE_pose_channel_find_name(
+					        ob->pose, bone_name);
+					if (pchan != NULL) {
+						OperationKey bone_key(&ob->id,
+						                      DEG_NODE_TYPE_BONE,
+						                      pchan->name,
+						                      DEG_OPCODE_BONE_LOCAL);
+						add_relation(driver_key,
+						             bone_key,
+						             "[Arm Bone -> Driver -> Bone]");
 					}
 				}
 			}
-
-			/* free temp data */
+			/* Free temp data. */
 			MEM_freeN(bone_name);
 			bone_name = NULL;
 		}
@@ -993,7 +1006,9 @@ void DepsgraphRelationBuilder::build_driver(ID *id, FCurve *fcu)
 		}
 	}
 	else if (GS(id->name) == ID_OB && strstr(rna_path, "modifiers[")) {
-		OperationKey modifier_key(id, DEG_NODE_TYPE_GEOMETRY, DEG_OPCODE_GEOMETRY_UBEREVAL);
+		OperationKey modifier_key(id,
+		                          DEG_NODE_TYPE_GEOMETRY,
+		                          DEG_OPCODE_GEOMETRY_UBEREVAL);
 		if (has_node(modifier_key)) {
 			add_relation(driver_key, modifier_key, "[Driver -> Modifier]");
 		}
@@ -1002,7 +1017,7 @@ void DepsgraphRelationBuilder::build_driver(ID *id, FCurve *fcu)
 		}
 	}
 	else if (GS(id->name) == ID_KE && strstr(rna_path, "key_blocks[")) {
-		/* shape key driver - hook into the base geometry operation */
+		/* Shape key driver - hook into the base geometry operation. */
 		// XXX: double check where this points
 		Key *shape_key = (Key *)id;
 
@@ -1016,33 +1031,44 @@ void DepsgraphRelationBuilder::build_driver(ID *id, FCurve *fcu)
 	else {
 		if (GS(id->name) == ID_OB) {
 			/* assume that driver affects a transform... */
-			OperationKey local_transform_key(id, DEG_NODE_TYPE_TRANSFORM, DEG_OPCODE_TRANSFORM_LOCAL);
-			add_relation(driver_key, local_transform_key, "[Driver -> Transform]");
+			OperationKey local_transform_key(id,
+			                                 DEG_NODE_TYPE_TRANSFORM,
+			                                 DEG_OPCODE_TRANSFORM_LOCAL);
+			add_relation(driver_key,
+			             local_transform_key,
+			             "[Driver -> Transform]");
 		}
 		else if (GS(id->name) == ID_KE) {
 			ComponentKey geometry_key(id, DEG_NODE_TYPE_GEOMETRY);
-			add_relation(driver_key, geometry_key, "[Driver -> Shapekey Geometry]");
+			add_relation(driver_key,
+			             geometry_key,
+			             "[Driver -> Shapekey Geometry]");
 		}
 	}
-
-	/* ensure that affected prop's update callbacks will be triggered once done */
-	// TODO: implement this once the functionality to add these links exists in RNA
-	// XXX: the data itself could also set this, if it were to be truly initialised later?
-
-	/* loop over variables to get the target relationships */
+	/* Ensure that affected prop's update callbacks will be triggered once
+	 * done.
+	 */
+	/* TODO: Implement this once the functionality to add these links exists
+	 * RNA.
+	 */
+	/* XXX: the data itself could also set this, if it were to be truly
+	 * initialised later?
+	 */
+	/* Loop over variables to get the target relationships. */
 	LINKLIST_FOREACH (DriverVar *, dvar, &driver->variables) {
-		/* only used targets */
+		/* Only used targets. */
 		DRIVER_TARGETS_USED_LOOPER(dvar)
 		{
-			if (dtar->id == NULL)
+			if (dtar->id == NULL) {
 				continue;
-
-			/* special handling for directly-named bones */
+			}
+			/* Special handling for directly-named bones. */
 			if ((dtar->flag & DTAR_FLAG_STRUCT_REF) && (dtar->pchan_name[0])) {
 				Object *ob = (Object *)dtar->id;
-				bPoseChannel *target_pchan = BKE_pose_channel_find_name(ob->pose, dtar->pchan_name);
+				bPoseChannel *target_pchan =
+				        BKE_pose_channel_find_name(ob->pose, dtar->pchan_name);
 				if (target_pchan != NULL) {
-					/* get node associated with bone */
+					/* Get node associated with bone. */
 					// XXX: watch the space!
 					/* Some cases can't use final bone transform, for example:
 					 * - Driving the bone with itself (addressed here)
@@ -1054,55 +1080,68 @@ void DepsgraphRelationBuilder::build_driver(ID *id, FCurve *fcu)
 					{
 						continue;
 					}
-					OperationKey target_key(dtar->id, DEG_NODE_TYPE_BONE, target_pchan->name, DEG_OPCODE_BONE_DONE);
-					add_relation(target_key, driver_key, "[Bone Target -> Driver]");
+					OperationKey target_key(dtar->id,
+					                        DEG_NODE_TYPE_BONE,
+					                        target_pchan->name,
+					                        DEG_OPCODE_BONE_DONE);
+					add_relation(target_key,
+					             driver_key,
+					             "[Bone Target -> Driver]");
 				}
 			}
 			else if (dtar->flag & DTAR_FLAG_STRUCT_REF) {
-				/* get node associated with the object's transforms */
-				OperationKey target_key(dtar->id, DEG_NODE_TYPE_TRANSFORM, DEG_OPCODE_TRANSFORM_FINAL);
+				/* Get node associated with the object's transforms. */
+				OperationKey target_key(dtar->id,
+				                        DEG_NODE_TYPE_TRANSFORM,
+				                        DEG_OPCODE_TRANSFORM_FINAL);
 				add_relation(target_key, driver_key, "[Target -> Driver]");
 			}
 			else if (dtar->rna_path && strstr(dtar->rna_path, "pose.bones[")) {
-				/* workaround for ensuring that local bone transforms don't end up
-				 * having to wait for pose eval to finish (to prevent cycles)
+				/* Workaround for ensuring that local bone transforms don't end
+				 * up having to wait for pose eval to finish (to prevent cycles).
 				 */
 				Object *ob = (Object *)dtar->id;
-				char *bone_name = BLI_str_quoted_substrN(dtar->rna_path, "pose.bones[");
-				bPoseChannel *target_pchan = BKE_pose_channel_find_name(ob->pose, bone_name);
-				if (bone_name) {
+				char *bone_name = BLI_str_quoted_substrN(dtar->rna_path,
+				                                         "pose.bones[");
+				bPoseChannel *target_pchan =
+				        BKE_pose_channel_find_name(ob->pose, bone_name);
+				if (bone_name != NULL) {
 					MEM_freeN(bone_name);
 					bone_name = NULL;
 				}
-				if (target_pchan) {
+				if (target_pchan != NULL) {
 					if (dtar->id == id &&
 					    pchan != NULL &&
 					    STREQ(pchan->name, target_pchan->name))
 					{
 						continue;
 					}
-					OperationKey bone_key(dtar->id, DEG_NODE_TYPE_BONE, target_pchan->name, DEG_OPCODE_BONE_LOCAL);
+					OperationKey bone_key(dtar->id,
+					                      DEG_NODE_TYPE_BONE,
+					                      target_pchan->name,
+					                      DEG_OPCODE_BONE_LOCAL);
 					add_relation(bone_key, driver_key, "[RNA Bone -> Driver]");
 				}
 			}
 			else {
 				if (dtar->id == id) {
-					/* Ignore input dependency if we're driving properties of the same ID,
-					 * otherwise we'll be ending up in a cyclic dependency here.
+					/* Ignore input dependency if we're driving properties of
+					 * the same ID, otherwise we'll be ending up in a cyclic
+					 * dependency here.
 					 */
 					continue;
 				}
-				/* resolve path to get node */
-				RNAPathKey target_key(dtar->id, dtar->rna_path ? dtar->rna_path : "");
+				/* Resolve path to get node. */
+				RNAPathKey target_key(dtar->id,
+				                      dtar->rna_path ? dtar->rna_path : "");
 				add_relation(target_key, driver_key, "[RNA Target -> Driver]");
 			}
 		}
 		DRIVER_TARGETS_LOOPER_END
 	}
-
-	/* It's quite tricky to detect if the driver actually depends on time or not,
-	 * so for now we'll be quite conservative here about optimization and consider
-	 * all python drivers to be depending on time.
+	/* It's quite tricky to detect if the driver actually depends on time or
+	 * not, so for now we'll be quite conservative here about optimization and
+	 * consider all python drivers to be depending on time.
 	 */
 	if ((driver->type == DRIVER_TYPE_PYTHON) &&
 	    python_driver_depends_on_time(driver))
-- 
cgit v1.2.3


From 90b3dba5151d1297929ec08baf3800755e9e3494 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Tue, 8 Aug 2017 15:27:11 +0200
Subject: Fix T52255: New Depsgraph - Constraint and Drivers not working
 together when the driver references itself

---
 source/blender/depsgraph/intern/builder/deg_builder_relations.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/source/blender/depsgraph/intern/builder/deg_builder_relations.cc b/source/blender/depsgraph/intern/builder/deg_builder_relations.cc
index 0726035b04e..bb8e2a710ef 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder_relations.cc
+++ b/source/blender/depsgraph/intern/builder/deg_builder_relations.cc
@@ -1091,6 +1091,13 @@ void DepsgraphRelationBuilder::build_driver(ID *id, FCurve *fcu)
 			}
 			else if (dtar->flag & DTAR_FLAG_STRUCT_REF) {
 				/* Get node associated with the object's transforms. */
+				if (dtar->id == id) {
+					/* Ignore input dependency if we're driving properties of
+					 * the same ID, otherwise we'll be ending up in a cyclic
+					 * dependency here.
+					 */
+					continue;
+				}
 				OperationKey target_key(dtar->id,
 				                        DEG_NODE_TYPE_TRANSFORM,
 				                        DEG_OPCODE_TRANSFORM_FINAL);
-- 
cgit v1.2.3


From a78b596272711c6f591178ac05bd191dac449e4f Mon Sep 17 00:00:00 2001
From: Bastien Montagne <montagne29@wanadoo.fr>
Date: Tue, 8 Aug 2017 16:01:48 +0200
Subject: Fix T52260: Blender 2.79 Objects made duplicates real still refer
 armature proxy.

New code was handling correctly ID's internal references to self, but
not references between 'made real' different objects...

Regression, to be backported in 2.79.
---
 source/blender/editors/object/object_add.c | 75 ++++++++++++------------------
 1 file changed, 31 insertions(+), 44 deletions(-)

diff --git a/source/blender/editors/object/object_add.c b/source/blender/editors/object/object_add.c
index 4ed1e85fb48..a925abaecbc 100644
--- a/source/blender/editors/object/object_add.c
+++ b/source/blender/editors/object/object_add.c
@@ -1354,15 +1354,13 @@ static void make_object_duplilist_real(bContext *C, Scene *scene, Base *base,
 
 	lb = object_duplilist(bmain->eval_ctx, scene, base->object);
 
-	if (use_hierarchy || use_base_parent) {
-		dupli_gh = BLI_ghash_ptr_new(__func__);
-		if (use_hierarchy) {
-			if (base->object->transflag & OB_DUPLIGROUP) {
-				parent_gh = BLI_ghash_new(dupliobject_group_hash, dupliobject_group_cmp, __func__);
-			}
-			else {
-				parent_gh = BLI_ghash_new(dupliobject_hash, dupliobject_cmp, __func__);
-			}
+	dupli_gh = BLI_ghash_ptr_new(__func__);
+	if (use_hierarchy) {
+		if (base->object->transflag & OB_DUPLIGROUP) {
+			parent_gh = BLI_ghash_new(dupliobject_group_hash, dupliobject_group_cmp, __func__);
+		}
+		else {
+			parent_gh = BLI_ghash_new(dupliobject_hash, dupliobject_cmp, __func__);
 		}
 	}
 
@@ -1400,9 +1398,7 @@ static void make_object_duplilist_real(bContext *C, Scene *scene, Base *base,
 		copy_m4_m4(ob->obmat, dob->mat);
 		BKE_object_apply_mat4(ob, ob->obmat, false, false);
 
-		if (dupli_gh) {
-			BLI_ghash_insert(dupli_gh, dob, ob);
-		}
+		BLI_ghash_insert(dupli_gh, dob, ob);
 		if (parent_gh) {
 			void **val;
 			/* Due to nature of hash/comparison of this ghash, a lot of duplis may be considered as 'the same',
@@ -1411,22 +1407,21 @@ static void make_object_duplilist_real(bContext *C, Scene *scene, Base *base,
 				*val = ob;
 			}
 		}
+	}
+
+	for (dob = lb->first; dob; dob = dob->next) {
+		Object *ob_src = dob->ob;
+		Object *ob_dst = BLI_ghash_lookup(dupli_gh, dob);
 
 		/* Remap new object to itself, and clear again newid pointer of orig object. */
-		BKE_libblock_relink_to_newid(&ob->id);
-		set_sca_new_poins_ob(ob);
-		BKE_id_clear_newpoin(&dob->ob->id);
+		BKE_libblock_relink_to_newid(&ob_dst->id);
+		set_sca_new_poins_ob(ob_dst);
 
-		DAG_id_tag_update(&ob->id, OB_RECALC_DATA);
-	}
+		DAG_id_tag_update(&ob_dst->id, OB_RECALC_DATA);
 
-	if (use_hierarchy) {
-		for (dob = lb->first; dob; dob = dob->next) {
+		if (use_hierarchy) {
 			/* original parents */
-			Object *ob_src =     dob->ob;
 			Object *ob_src_par = ob_src->parent;
-
-			Object *ob_dst =     BLI_ghash_lookup(dupli_gh, dob);
 			Object *ob_dst_par = NULL;
 
 			/* find parent that was also made real */
@@ -1437,8 +1432,8 @@ static void make_object_duplilist_real(bContext *C, Scene *scene, Base *base,
 				dob_key.ob = ob_src_par;
 				if (base->object->transflag & OB_DUPLIGROUP) {
 					memcpy(&dob_key.persistent_id[1],
-						   &dob->persistent_id[1],
-						   sizeof(dob->persistent_id[1]) * (MAX_DUPLI_RECUR - 1));
+					       &dob->persistent_id[1],
+					       sizeof(dob->persistent_id[1]) * (MAX_DUPLI_RECUR - 1));
 				}
 				else {
 					dob_key.persistent_id[0] = dob->persistent_id[0];
@@ -1462,29 +1457,20 @@ static void make_object_duplilist_real(bContext *C, Scene *scene, Base *base,
 				ob_dst->parent = base->object;
 				ob_dst->partype = PAROBJECT;
 			}
-
-			if (ob_dst->parent) {
-				/* note, this may be the parent of other objects, but it should
-				 * still work out ok */
-				BKE_object_apply_mat4(ob_dst, dob->mat, false, true);
-
-				/* to set ob_dst->orig and in case theres any other discrepicies */
-				DAG_id_tag_update(&ob_dst->id, OB_RECALC_OB);
-			}
 		}
-	}
-	else if (use_base_parent) {
-		/* since we are ignoring the internal hierarchy - parent all to the
-		 * base object */
-		for (dob = lb->first; dob; dob = dob->next) {
-			/* original parents */
-			Object *ob_dst = BLI_ghash_lookup(dupli_gh, dob);
-
+		else if (use_base_parent) {
+			/* since we are ignoring the internal hierarchy - parent all to the
+			 * base object */
 			ob_dst->parent = base->object;
 			ob_dst->partype = PAROBJECT;
+		}
 
-			/* similer to the code above, see comments */
+		if (ob_dst->parent) {
+			/* note, this may be the parent of other objects, but it should
+			 * still work out ok */
 			BKE_object_apply_mat4(ob_dst, dob->mat, false, true);
+
+			/* to set ob_dst->orig and in case theres any other discrepicies */
 			DAG_id_tag_update(&ob_dst->id, OB_RECALC_OB);
 		}
 	}
@@ -1499,13 +1485,14 @@ static void make_object_duplilist_real(bContext *C, Scene *scene, Base *base,
 		}
 	}
 
-	if (dupli_gh)
-		BLI_ghash_free(dupli_gh, NULL, NULL);
+	BLI_ghash_free(dupli_gh, NULL, NULL);
 	if (parent_gh)
 		BLI_ghash_free(parent_gh, NULL, NULL);
 
 	free_object_duplilist(lb);
 
+	BKE_main_id_clear_newpoins(bmain);
+
 	base->object->transflag &= ~OB_DUPLI;
 }
 
-- 
cgit v1.2.3


From ddfd57c0d275e6222f9622379b4c716e868029ce Mon Sep 17 00:00:00 2001
From: Bastien Montagne <montagne29@wanadoo.fr>
Date: Tue, 8 Aug 2017 16:08:24 +0200
Subject: Cleanup: mostly namings in `make_object_duplilist_real` code.

---
 source/blender/editors/object/object_add.c | 78 ++++++++++++++++--------------
 1 file changed, 41 insertions(+), 37 deletions(-)

diff --git a/source/blender/editors/object/object_add.c b/source/blender/editors/object/object_add.c
index a925abaecbc..e4f6d0da38f 100644
--- a/source/blender/editors/object/object_add.c
+++ b/source/blender/editors/object/object_add.c
@@ -1344,15 +1344,15 @@ static void make_object_duplilist_real(bContext *C, Scene *scene, Base *base,
                                        const bool use_hierarchy)
 {
 	Main *bmain = CTX_data_main(C);
-	ListBase *lb;
+	ListBase *lb_duplis;
 	DupliObject *dob;
-	GHash *dupli_gh = NULL, *parent_gh = NULL;
-	Object *object;
+	GHash *dupli_gh, *parent_gh = NULL;
 
-	if (!(base->object->transflag & OB_DUPLI))
+	if (!(base->object->transflag & OB_DUPLI)) {
 		return;
+	}
 
-	lb = object_duplilist(bmain->eval_ctx, scene, base->object);
+	lb_duplis = object_duplilist(bmain->eval_ctx, scene, base->object);
 
 	dupli_gh = BLI_ghash_ptr_new(__func__);
 	if (use_hierarchy) {
@@ -1364,52 +1364,55 @@ static void make_object_duplilist_real(bContext *C, Scene *scene, Base *base,
 		}
 	}
 
-	for (dob = lb->first; dob; dob = dob->next) {
-		Base *basen;
-		Object *ob = ID_NEW_SET(dob->ob, BKE_object_copy(bmain, dob->ob));
+	for (dob = lb_duplis->first; dob; dob = dob->next) {
+		Object *ob_src = dob->ob;
+		Object *ob_dst = ID_NEW_SET(dob->ob, BKE_object_copy(bmain, ob_src));
+		Base *base_dst;
 
 		/* font duplis can have a totcol without material, we get them from parent
 		 * should be implemented better...
 		 */
-		if (ob->mat == NULL) ob->totcol = 0;
+		if (ob_dst->mat == NULL) {
+			ob_dst->totcol = 0;
+		}
 
-		basen = MEM_dupallocN(base);
-		basen->flag &= ~(OB_FROMDUPLI | OB_FROMGROUP);
-		ob->flag = basen->flag;
-		basen->lay = base->lay;
-		BLI_addhead(&scene->base, basen);   /* addhead: othwise eternal loop */
-		basen->object = ob;
+		base_dst = MEM_dupallocN(base);
+		base_dst->flag &= ~(OB_FROMDUPLI | OB_FROMGROUP);
+		ob_dst->flag = base_dst->flag;
+		base_dst->lay = base->lay;
+		BLI_addhead(&scene->base, base_dst);   /* addhead: othwise eternal loop */
+		base_dst->object = ob_dst;
 
 		/* make sure apply works */
-		BKE_animdata_free(&ob->id, true);
-		ob->adt = NULL;
+		BKE_animdata_free(&ob_dst->id, true);
+		ob_dst->adt = NULL;
 
 		/* Proxies are not to be copied. */
-		ob->proxy_from = NULL;
-		ob->proxy_group = NULL;
-		ob->proxy = NULL;
+		ob_dst->proxy_from = NULL;
+		ob_dst->proxy_group = NULL;
+		ob_dst->proxy = NULL;
 
-		ob->parent = NULL;
-		BKE_constraints_free(&ob->constraints);
-		ob->curve_cache = NULL;
-		ob->transflag &= ~OB_DUPLI;
-		ob->lay = base->lay;
+		ob_dst->parent = NULL;
+		BKE_constraints_free(&ob_dst->constraints);
+		ob_dst->curve_cache = NULL;
+		ob_dst->transflag &= ~OB_DUPLI;
+		ob_dst->lay = base->lay;
 
-		copy_m4_m4(ob->obmat, dob->mat);
-		BKE_object_apply_mat4(ob, ob->obmat, false, false);
+		copy_m4_m4(ob_dst->obmat, dob->mat);
+		BKE_object_apply_mat4(ob_dst, ob_dst->obmat, false, false);
 
-		BLI_ghash_insert(dupli_gh, dob, ob);
+		BLI_ghash_insert(dupli_gh, dob, ob_dst);
 		if (parent_gh) {
 			void **val;
 			/* Due to nature of hash/comparison of this ghash, a lot of duplis may be considered as 'the same',
 			 * this avoids trying to insert same key several time and raise asserts in debug builds... */
 			if (!BLI_ghash_ensure_p(parent_gh, dob, &val)) {
-				*val = ob;
+				*val = ob_dst;
 			}
 		}
 	}
 
-	for (dob = lb->first; dob; dob = dob->next) {
+	for (dob = lb_duplis->first; dob; dob = dob->next) {
 		Object *ob_src = dob->ob;
 		Object *ob_dst = BLI_ghash_lookup(dupli_gh, dob);
 
@@ -1476,20 +1479,21 @@ static void make_object_duplilist_real(bContext *C, Scene *scene, Base *base,
 	}
 
 	if (base->object->transflag & OB_DUPLIGROUP && base->object->dup_group) {
-		for (object = bmain->object.first; object; object = object->id.next) {
-			if (object->proxy_group == base->object) {
-				object->proxy = NULL;
-				object->proxy_from = NULL;
-				DAG_id_tag_update(&object->id, OB_RECALC_OB);
+		for (Object *ob = bmain->object.first; ob; ob = ob->id.next) {
+			if (ob->proxy_group == base->object) {
+				ob->proxy = NULL;
+				ob->proxy_from = NULL;
+				DAG_id_tag_update(&ob->id, OB_RECALC_OB);
 			}
 		}
 	}
 
 	BLI_ghash_free(dupli_gh, NULL, NULL);
-	if (parent_gh)
+	if (parent_gh) {
 		BLI_ghash_free(parent_gh, NULL, NULL);
+	}
 
-	free_object_duplilist(lb);
+	free_object_duplilist(lb_duplis);
 
 	BKE_main_id_clear_newpoins(bmain);
 
-- 
cgit v1.2.3