Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorThomas Dinges <blender@dingto.org>2014-01-14 23:39:21 +0400
committerThomas Dinges <blender@dingto.org>2014-01-14 23:39:54 +0400
commit9351ac0d8577a2c76c238bbf2c365d811e986209 (patch)
tree51564853558f7219dfb67a3e095b5bb170bc35cb /intern
parentd980c3eccbd020a9ff7137659e7cbfbc5adb125d (diff)
Cycles: Skip the compilation of the dedicated SSE2 kernel on x86-64, we can assume SSE2 here, so just re-use the regular one. Saves 500kb in the blender binary.
Reviewed by: brecht Differential Revision: https://developer.blender.org/D199
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/CMakeLists.txt8
-rw-r--r--intern/cycles/SConscript69
-rw-r--r--intern/cycles/device/device_cpu.cpp30
-rw-r--r--intern/cycles/kernel/CMakeLists.txt6
-rw-r--r--intern/cycles/kernel/kernel.cpp5
-rw-r--r--intern/cycles/kernel/kernel.h9
-rw-r--r--intern/cycles/kernel/kernel_sse2.cpp7
-rw-r--r--intern/cycles/kernel/kernel_sse3.cpp7
-rw-r--r--intern/cycles/kernel/kernel_sse41.cpp7
-rw-r--r--intern/cycles/util/CMakeLists.txt1
-rw-r--r--intern/cycles/util/util_optimization.h31
11 files changed, 113 insertions, 67 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index fc193d99a57..6fa6260c81e 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -8,10 +8,6 @@ include(cmake/external_libs.cmake)
# Build Flags
-if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD)
- set(WITH_CYCLES_OPTIMIZED_KERNEL ON)
-endif()
-
if(WIN32 AND MSVC)
# there is no /arch:SSE3, but intrinsics are available anyway
if(CMAKE_CL_64)
@@ -54,10 +50,6 @@ add_definitions(${BOOST_DEFINITIONS} ${OPENIMAGEIO_DEFINITIONS})
add_definitions(-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {)
add_definitions(-DCCL_NAMESPACE_END=})
-if(WITH_CYCLES_OPTIMIZED_KERNEL)
- add_definitions(-DWITH_OPTIMIZED_KERNEL)
-endif()
-
if(WITH_CYCLES_NETWORK)
add_definitions(-DWITH_NETWORK)
endif()
diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript
index 448375a04ff..e31fb5bed96 100644
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@@ -75,41 +75,40 @@ if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', '
incs.append(env['BF_PTHREADS_INC'])
# optimized kernel
-if env['WITH_BF_RAYOPTIMIZATION']:
- sse2_cxxflags = Split(env['CXXFLAGS'])
- sse3_cxxflags = Split(env['CXXFLAGS'])
- sse41_cxxflags = Split(env['CXXFLAGS'])
-
- if env['OURPLATFORM'] == 'win32-vc':
- # there is no /arch:SSE3, but intrinsics are available anyway
- sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
- sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
- sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
- elif env['OURPLATFORM'] == 'win64-vc':
- sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
- sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
- sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
- else:
- sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
- sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split())
- sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split())
-
- defs.append('WITH_OPTIMIZED_KERNEL')
- optim_defs = defs[:]
-
- if env['WITH_CYCLES_OPTIMIZED_KERNEL_SSE41']:
- cycles_sse41 = cycles.Clone()
- sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')]
- cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags)
- defs.append('WITH_CYCLES_OPTIMIZED_KERNEL_SSE41')
-
- cycles_sse3 = cycles.Clone()
- sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
- cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
-
- cycles_sse2 = cycles.Clone()
- sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
- cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)
+sse2_cxxflags = Split(env['CXXFLAGS'])
+sse3_cxxflags = Split(env['CXXFLAGS'])
+sse41_cxxflags = Split(env['CXXFLAGS'])
+
+if env['OURPLATFORM'] == 'win32-vc':
+ # there is no /arch:SSE3, but intrinsics are available anyway
+ sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+ sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+ sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+elif env['OURPLATFORM'] == 'win64-vc':
+ sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+ sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+ sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+else:
+ sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
+ sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split())
+ sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split())
+
+defs.append('WITH_OPTIMIZED_KERNEL')
+optim_defs = defs[:]
+
+if env['WITH_CYCLES_OPTIMIZED_KERNEL_SSE41']:
+ cycles_sse41 = cycles.Clone()
+ sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')]
+ cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags)
+ defs.append('WITH_CYCLES_OPTIMIZED_KERNEL_SSE41')
+
+cycles_sse3 = cycles.Clone()
+sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
+cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
+
+cycles_sse2 = cycles.Clone()
+sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
+cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)
cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], cxx_compileflags=cxxflags)
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index ea632b744dc..b29d64eb454 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -166,7 +166,6 @@ public:
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
-#ifdef WITH_OPTIMIZED_KERNEL
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int sample = start_sample; sample < end_sample; sample++) {
@@ -189,6 +188,7 @@ public:
}
else
#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
for(int sample = start_sample; sample < end_sample; sample++) {
if (task.get_cancel() || task_pool.canceled()) {
@@ -208,7 +208,10 @@ public:
task.update_progress(tile);
}
}
- else if(system_cpu_support_sse2()) {
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if(system_cpu_support_sse2()) {
for(int sample = start_sample; sample < end_sample; sample++) {
if (task.get_cancel() || task_pool.canceled()) {
if(task.need_finish_queue == false)
@@ -267,7 +270,6 @@ public:
float sample_scale = 1.0f/(task.sample + 1);
if(task.rgba_half) {
-#ifdef WITH_OPTIMIZED_KERNEL
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int y = task.y; y < task.y + task.h; y++)
@@ -276,14 +278,18 @@ public:
sample_scale, x, y, task.offset, task.stride);
}
else
-#endif
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
sample_scale, x, y, task.offset, task.stride);
}
- else if(system_cpu_support_sse2()) {
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if(system_cpu_support_sse2()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
@@ -299,7 +305,6 @@ public:
}
}
else {
-#ifdef WITH_OPTIMIZED_KERNEL
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int y = task.y; y < task.y + task.h; y++)
@@ -309,13 +314,17 @@ public:
}
else
#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
sample_scale, x, y, task.offset, task.stride);
}
- else if(system_cpu_support_sse2()) {
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if(system_cpu_support_sse2()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
@@ -340,7 +349,6 @@ public:
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
-#ifdef WITH_OPTIMIZED_KERNEL
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
@@ -352,6 +360,7 @@ public:
}
else
#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
@@ -360,7 +369,10 @@ public:
break;
}
}
- else if(system_cpu_support_sse2()) {
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if(system_cpu_support_sse2()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 998d1a3540f..81499bbfda8 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -192,10 +192,8 @@ endif()
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
-if(WITH_CYCLES_OPTIMIZED_KERNEL)
- set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
- set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
-endif()
+set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
if(WITH_CYCLES_OPTIMIZED_KERNEL_SSE41)
set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernel.cpp
index 3e2727fde9a..3fe1e80890b 100644
--- a/intern/cycles/kernel/kernel.cpp
+++ b/intern/cycles/kernel/kernel.cpp
@@ -84,6 +84,11 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
assert(0);
}
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this one with SSE2 intrinsics */
+#if defined(__x86_64__) || defined(_M_X64)
+#define __KERNEL_SSE2__
+#endif
+
/* Path Tracing */
void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 105a3887da0..b6db92f26e9 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -17,9 +17,10 @@
#ifndef __KERNEL_H__
#define __KERNEL_H__
-/* CPU Kernel Interfae */
+/* CPU Kernel Interface */
#include "util_types.h"
+#include "util_optimization.h"
CCL_NAMESPACE_BEGIN
@@ -43,7 +44,7 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu
void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i);
-#ifdef WITH_OPTIMIZED_KERNEL
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
int sample, int x, int y, int offset, int stride);
void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
@@ -52,7 +53,9 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
float sample_scale, int x, int y, int offset, int stride);
void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
int sample, int x, int y, int offset, int stride);
void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
@@ -61,7 +64,9 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
float sample_scale, int x, int y, int offset, int stride);
void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
int type, int i);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
void kernel_cpu_sse41_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
int sample, int x, int y, int offset, int stride);
void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp
index 9c69e519dca..953c3e4f9c9 100644
--- a/intern/cycles/kernel/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernel_sse2.cpp
@@ -17,8 +17,10 @@
/* Optimized CPU kernel entry points. This file is compiled with SSE2
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-
-#ifdef WITH_OPTIMIZED_KERNEL
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
@@ -70,4 +72,3 @@ void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int
CCL_NAMESPACE_END
#endif
-
diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp
index 05877a41b4a..2a36c974191 100644
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernel_sse3.cpp
@@ -17,8 +17,10 @@
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-
-#ifdef WITH_OPTIMIZED_KERNEL
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
@@ -72,4 +74,3 @@ void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int
CCL_NAMESPACE_END
#endif
-
diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernel_sse41.cpp
index 0c68fd3651b..6583feaeb45 100644
--- a/intern/cycles/kernel/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernel_sse41.cpp
@@ -17,8 +17,10 @@
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-
-#ifdef WITH_OPTIMIZED_KERNEL
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
@@ -73,4 +75,3 @@ void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, in
CCL_NAMESPACE_END
#endif
-
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 389f76e6df2..df188f8236a 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -46,6 +46,7 @@ set(SRC_HEADERS
util_md5.h
util_opencl.h
util_opengl.h
+ util_optimization.h
util_param.h
util_path.h
util_progress.h
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
new file mode 100644
index 00000000000..6ffc7545335
--- /dev/null
+++ b/intern/cycles/util/util_optimization.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#if defined(__x86_64__) || defined(_M_X64)
+
+/* no SSE2 kernel on x86-64, part of regular kernel */
+#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+
+#endif
+
+#if defined(i386) || defined(_M_IX86)
+
+#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+
+#endif