Cycles: Reduce amount of malloc() calls from the kernel

This commit makes malloc() happen only once per volume and once per transparent shadow query (per thread), improving the scalability of the code across multiple CPU cores.

It is hard to measure this with the low-end i7 currently available here, but quick tests suggest volume sampling got about a 3-5% speedup.

The idea is to store the allocated memory in kernel globals, which are already per-thread on CPU.

Reviewers: dingto, juicyfruit, lukasstockner97, maiself, brecht

Reviewed By: brecht

Subscribers: Blendify, nutel

Differential Revision: https://developer.blender.org/D1996
author: Sergey Sharybin <sergey.vfx@gmail.com> 2016-05-17 13:30:46 +0300
committer: Sergey Sharybin <sergey.vfx@gmail.com> 2016-05-18 11:14:24 +0300
commit: 7b356a856540a1affa5dc85360183418e6337a5a (patch)
tree: 9acee7019c696f694c97d504e1a2fe678a7f0cd1 /intern/cycles/device/device_cpu.cpp
parent: 2433a537fa12dad6cc8a1c323b1b73e5cad6cd4d (diff)
1 files changed, 36 insertions, 9 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 676b1279a80..275ee028eb4 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -213,12 +213,7 @@ public:
 				return;
 		}
 
-		KernelGlobals kg = kernel_globals;
-
-#ifdef WITH_OSL
-		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
-#endif
-
+		KernelGlobals kg = thread_kernel_globals_init();
 		RenderTile tile;
 
 		void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
@@ -289,9 +284,7 @@ public:
 			}
 		}
 
-#ifdef WITH_OSL
-		OSLShader::thread_free(&kg);
-#endif
+		thread_kernel_globals_free(&kg);
 	}
 
 	void thread_film_convert(DeviceTask& task)
@@ -481,6 +474,40 @@ public:
 	{
 		task_pool.cancel();
 	}
+
+protected:
+	inline KernelGlobals thread_kernel_globals_init()  // Returns, by value, a copy of kernel_globals with its scratch pointers cleared.
+	{
+		KernelGlobals kg = kernel_globals;  // Start from a copy of the device-wide globals.
+		kg.transparent_shadow_intersections = NULL;  // Cleared so thread_kernel_globals_free() can tell whether it was ever allocated.
+		const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
+		                            sizeof(*kg.decoupled_volume_steps);  // Element count of the decoupled_volume_steps array.
+		for(int i = 0; i < decoupled_count; ++i) {
+			kg.decoupled_volume_steps[i] = NULL;  // No volume-step buffer exists yet for this slot.
+		}
+		kg.decoupled_volume_steps_index = 0;
+#ifdef WITH_OSL
+		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
+#endif
+		return kg;  // Caller is expected to pass this copy to thread_kernel_globals_free() when done.
+	}
+
+	inline void thread_kernel_globals_free(KernelGlobals *kg)  // Releases the buffers held in *kg's scratch pointers, then the OSL state.
+	{
+		if(kg->transparent_shadow_intersections != NULL) {  // Still NULL if nothing was stored after init.
+			free(kg->transparent_shadow_intersections);
+		}
+		const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
+		                            sizeof(*kg->decoupled_volume_steps);  // Element count of the decoupled_volume_steps array.
+		for(int i = 0; i < decoupled_count; ++i) {
+			if(kg->decoupled_volume_steps[i] != NULL) {  // Skip slots that never received a buffer.
+				free(kg->decoupled_volume_steps[i]);
+			}
+		}
+#ifdef WITH_OSL
+		OSLShader::thread_free(kg);
+#endif
+	}  // NOTE(review): freed pointers are not reset to NULL, so calling this twice on the same kg would double-free.
 };
 
 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
author	Sergey Sharybin <sergey.vfx@gmail.com>	2016-05-17 13:30:46 +0300
committer	Sergey Sharybin <sergey.vfx@gmail.com>	2016-05-18 11:14:24 +0300
commit	7b356a856540a1affa5dc85360183418e6337a5a (patch)
tree	9acee7019c696f694c97d504e1a2fe678a7f0cd1 /intern/cycles/device/device_cpu.cpp
parent	2433a537fa12dad6cc8a1c323b1b73e5cad6cd4d (diff)