Cycles: test code for sse 4.1 kernel and alignment for some vector types.

This is mostly work towards enabling the __KERNEL_SSE__ option to start using SIMD operations for vector math operations. The SSE 4.1 kernel performs about 8% faster with that option, but overall it is still slower than without the option. WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 is the CMake flag for testing this kernel. Alignment of int3, int4, float3, float4 to 16 bytes seems to give a slight 1-2% speedup on tested systems with the current kernel already, so it is enabled now.
author: Martijn Berger <martijn.berger@gmail.com> 2013-11-22 17:16:47 +0400
committer: Brecht Van Lommel <brechtvanlommel@gmail.com> 2013-11-22 17:42:41 +0400
commit: e3a79258d17e6cdca26120eab7a2c48c7c4d4a0f (patch)
tree: 77d59694458125dd7525faf59ed56ce505533981 /intern/cycles/device
parent: 5feb0d2bfe8f6723bf48073b1760b732bc6a5ceb (diff)
2 files changed, 54 insertions, 0 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index fe2368b7ea8..920223dd8a4 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -13,6 +13,10 @@ set(INC_SYS
 	${GLEW_INCLUDE_PATH}
 )
 
+if(WITH_CYCLES_OPTIMIZED_KERNEL_SSE41)
+	 add_definitions(-DWITH_CYCLES_OPTIMIZED_KERNEL_SSE41=1)
+endif()	
+
 set(SRC
 	device.cpp
 	device_cpu.cpp
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index d04c5df82fb..85a7b9c186d 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -58,6 +58,7 @@ public:
 		/* do now to avoid thread issues */
 		system_cpu_support_sse2();
 		system_cpu_support_sse3();
+		system_cpu_support_sse41();
 	}
 
 	~CPUDevice()
@@ -164,6 +165,28 @@ public:
 			int end_sample = tile.start_sample + tile.num_samples;
 
 #ifdef WITH_OPTIMIZED_KERNEL
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
+			if(system_cpu_support_sse41()) {
+				for(int sample = start_sample; sample < end_sample; sample++) {
+					if (task.get_cancel() || task_pool.canceled()) {
+						if(task.need_finish_queue == false)
+							break;
+					}
+
+					for(int y = tile.y; y < tile.y + tile.h; y++) {
+						for(int x = tile.x; x < tile.x + tile.w; x++) {
+							kernel_cpu_sse41_path_trace(&kg, render_buffer, rng_state,
+								sample, x, y, tile.offset, tile.stride);
+						}
+					}
+
+					tile.sample = sample + 1;
+
+					task.update_progress(tile);
+				}
+			}
+			else
+#endif
 			if(system_cpu_support_sse3()) {
 				for(int sample = start_sample; sample < end_sample; sample++) {
 					if (task.get_cancel() || task_pool.canceled()) {
@@ -243,6 +266,15 @@ public:
 
 		if(task.rgba_half) {
 #ifdef WITH_OPTIMIZED_KERNEL
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
+			if(system_cpu_support_sse41()) {
+				for(int y = task.y; y < task.y + task.h; y++)
+					for(int x = task.x; x < task.x + task.w; x++)
+						kernel_cpu_sse41_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+							sample_scale, x, y, task.offset, task.stride);
+			}
+			else
+#endif				
 			if(system_cpu_support_sse3()) {
 				for(int y = task.y; y < task.y + task.h; y++)
 					for(int x = task.x; x < task.x + task.w; x++)
@@ -266,6 +298,14 @@ public:
 		}
 		else {
 #ifdef WITH_OPTIMIZED_KERNEL
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
+			if(system_cpu_support_sse41()) {
+				for(int y = task.y; y < task.y + task.h; y++)
+					for(int x = task.x; x < task.x + task.w; x++)
+						kernel_cpu_sse41_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+							sample_scale, x, y, task.offset, task.stride);
+			}
+#endif			
 			if(system_cpu_support_sse3()) {
 				for(int y = task.y; y < task.y + task.h; y++)
 					for(int x = task.x; x < task.x + task.w; x++)
@@ -298,6 +338,16 @@ public:
 #endif
 
 #ifdef WITH_OPTIMIZED_KERNEL
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
+		if(system_cpu_support_sse41()) {
+			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
+				kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+
+				if(task_pool.canceled())
+					break;
+			}
+		}
+#endif
 		if(system_cpu_support_sse3()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
 				kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
author	Martijn Berger <martijn.berger@gmail.com>	2013-11-22 17:16:47 +0400
committer	Brecht Van Lommel <brechtvanlommel@gmail.com>	2013-11-22 17:42:41 +0400
commit	e3a79258d17e6cdca26120eab7a2c48c7c4d4a0f (patch)
tree	77d59694458125dd7525faf59ed56ce505533981 /intern/cycles/device
parent	5feb0d2bfe8f6723bf48073b1760b732bc6a5ceb (diff)