4 files changed, 83 insertions, 21 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 676b1279a80..275ee028eb4 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -213,12 +213,7 @@ public:
 				return;
 		}
 
-		KernelGlobals kg = kernel_globals;
-
-#ifdef WITH_OSL
-		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
-#endif
-
+		KernelGlobals kg = thread_kernel_globals_init();
 		RenderTile tile;
 
 		void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
@@ -289,9 +284,7 @@ public:
 			}
 		}
 
-#ifdef WITH_OSL
-		OSLShader::thread_free(&kg);
-#endif
+		thread_kernel_globals_free(&kg);
 	}
 
 	void thread_film_convert(DeviceTask& task)
@@ -481,6 +474,40 @@ public:
 	{
 		task_pool.cancel();
 	}
+
+protected:
+	inline KernelGlobals thread_kernel_globals_init()
+	{
+		KernelGlobals kg = kernel_globals;
+		kg.transparent_shadow_intersections = NULL;
+		const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
+		                            sizeof(*kg.decoupled_volume_steps);
+		for(int i = 0; i < decoupled_count; ++i) {
+			kg.decoupled_volume_steps[i] = NULL;
+		}
+		kg.decoupled_volume_steps_index = 0;
+#ifdef WITH_OSL
+		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
+#endif
+		return kg;
+	}
+
+	inline void thread_kernel_globals_free(KernelGlobals *kg)
+	{
+		if(kg->transparent_shadow_intersections != NULL) {
+			free(kg->transparent_shadow_intersections);
+		}
+		const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
+		                            sizeof(*kg->decoupled_volume_steps);
+		for(int i = 0; i < decoupled_count; ++i) {
+			if(kg->decoupled_volume_steps[i] != NULL) {
+				free(kg->decoupled_volume_steps[i]);
+			}
+		}
+#ifdef WITH_OSL
+		OSLShader::thread_free(kg);
+#endif
+	}
 };
 
 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index c44ea1b051f..7e6cdf93fb9 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -31,6 +31,9 @@ struct OSLThreadData;
 struct OSLShadingSystem;
 #  endif
 
+struct Intersection;
+struct VolumeStep;
+
 typedef struct KernelGlobals {
 	texture_image_uchar4 texture_byte4_images[TEX_NUM_BYTE4_IMAGES_CPU];
 	texture_image_float4 texture_float4_images[TEX_NUM_FLOAT4_IMAGES_CPU];
@@ -51,6 +54,14 @@ typedef struct KernelGlobals {
 	OSLThreadData *osl_tdata;
 #  endif
 
+	/* **** Run-time data ****  */
+
+	/* Heap-allocated storage for transparent shadows intersections. */
+	Intersection *transparent_shadow_intersections;
+
+	/* Storage for decoupled volume steps. */
+	VolumeStep *decoupled_volume_steps[2];
+	int decoupled_volume_steps_index;
 } KernelGlobals;
 
 #endif  /* __KERNEL_CPU__ */
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 3b1111e5069..504ac2e40bc 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -59,14 +59,20 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 		/* intersect to find an opaque surface, or record all transparent surface hits */
 		Intersection hits_stack[STACK_MAX_HITS];
 		Intersection *hits = hits_stack;
-		uint max_hits = kernel_data.integrator.transparent_max_bounce - state->transparent_bounce - 1;
+		const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+		uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
 
 		/* prefer to use stack but use dynamic allocation if too deep max hits
 		 * we need max_hits + 1 storage space due to the logic in
 		 * scene_intersect_shadow_all which will first store and then check if
 		 * the limit is exceeded */
-		if(max_hits + 1 > STACK_MAX_HITS)
-			hits = (Intersection*)malloc(sizeof(Intersection)*(max_hits + 1));
+		if(max_hits + 1 > STACK_MAX_HITS) {
+			if(kg->transparent_shadow_intersections == NULL) {
+				kg->transparent_shadow_intersections =
+				    (Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
+			}
+			hits = kg->transparent_shadow_intersections;
+		}
 
 		uint num_hits;
 		blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits);
@@ -147,14 +153,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 
 			*shadow = throughput;
 
-			if(hits != hits_stack)
-				free(hits);
 			return is_zero(throughput);
 		}
-
-		/* free dynamic storage */
-		if(hits != hits_stack)
-			free(hits);
 	}
 	else {
 		Intersection isect;
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index c499773b980..224c275b03d 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -627,12 +627,30 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 		step_size = kernel_data.integrator.volume_step_size;
 		/* compute exact steps in advance for malloc */
 		max_steps = max((int)ceilf(ray->t/step_size), 1);
+		/* NOTE: For the branched path tracing it's possible to have direct
+		 * and indirect light integration both having volume segments allocated.
+		 * We detect this using index in the pre-allocated memory. Currently we
+		 * only support two segments allocated at a time, if more needed some
+		 * modifications to the KernelGlobals will be needed.
+		 *
+		 * This gives us restrictions that decoupled record should only happen
+		 * in the stack manner, meaning if there's subsequent call of decoupled
+		 * record it'll need to free memory before it's caller frees memory.
+		 */
+		const int index = kg->decoupled_volume_steps_index;
+		assert(index < sizeof(kg->decoupled_volume_steps) /
+		               sizeof(*kg->decoupled_volume_steps));
 		if(max_steps > global_max_steps) {
 			max_steps = global_max_steps;
 			step_size = ray->t / (float)max_steps;
 		}
-		segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
+		if(kg->decoupled_volume_steps[index] == NULL) {
+			kg->decoupled_volume_steps[index] =
+			        (VolumeStep*)malloc(sizeof(VolumeStep)*global_max_steps);
+		}
+		segment->steps = kg->decoupled_volume_steps[index];
 		random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size;
+		++kg->decoupled_volume_steps_index;
 	}
 	else {
 		max_steps = 1;
@@ -745,8 +763,14 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 
 ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment)
 {
-	if(segment->steps != &segment->stack_step)
-		free(segment->steps);
+	if(segment->steps != &segment->stack_step) {
+		/* NOTE: We only allow free last allocated segment.
+		 * No random order of alloc/free is supported.
+		 */
+		assert(kg->decoupled_volume_steps_index > 0);
+		assert(segment->steps == kg->decoupled_volume_steps[kg->decoupled_volume_steps_index - 1]);
+		--kg->decoupled_volume_steps_index;
+	}
 }
 
 /* scattering for homogeneous and heterogeneous volumes, using decoupled ray