From 7c9d99334705498932a272f68f74121953d4974a Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Mon, 4 Feb 2013 16:12:37 +0000 Subject: Fix cycles intersection issue with overlapping faces on windows 32 bit and CPU without SSE3 support, due to 80 bit precision float register being used for one bounding box but not the one next to it. --- intern/cycles/CMakeLists.txt | 6 ++-- intern/cycles/SConscript | 25 ++++++++----- intern/cycles/device/device_cpu.cpp | 48 +++++++++++++++++++++---- intern/cycles/kernel/CMakeLists.txt | 6 ++-- intern/cycles/kernel/kernel.h | 13 +++++-- intern/cycles/kernel/kernel_bvh.h | 24 ++++++------- intern/cycles/kernel/kernel_optimized.cpp | 60 ------------------------------- intern/cycles/kernel/kernel_sse2.cpp | 60 +++++++++++++++++++++++++++++++ intern/cycles/kernel/kernel_sse3.cpp | 60 +++++++++++++++++++++++++++++++ intern/cycles/util/util_system.cpp | 15 ++++++-- intern/cycles/util/util_system.h | 3 +- 11 files changed, 223 insertions(+), 97 deletions(-) delete mode 100644 intern/cycles/kernel/kernel_optimized.cpp create mode 100644 intern/cycles/kernel/kernel_sse2.cpp create mode 100644 intern/cycles/kernel/kernel_sse3.cpp (limited to 'intern') diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 535239a9205..226218ae512 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -13,10 +13,12 @@ if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD) endif() if(WIN32 AND MSVC) - set(CYCLES_OPTIMIZED_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc") + set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc") + set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /EHsc") elseif(CMAKE_COMPILER_IS_GNUCC) - set(CYCLES_OPTIMIZED_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mfpmath=sse") + set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 
-mfpmath=sse") + set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mfpmath=sse") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") endif() diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript index 19af7dede9f..8a8ef9cce39 100644 --- a/intern/cycles/SConscript +++ b/intern/cycles/SConscript @@ -36,7 +36,8 @@ sources = cycles.Glob('bvh/*.cpp') + cycles.Glob('device/*.cpp') + cycles.Glob(' sources.remove(path.join('util', 'util_view.cpp')) sources.remove(path.join('render', 'film_response.cpp')) -sources.remove(path.join('kernel', 'kernel_optimized.cpp')) +sources.remove(path.join('kernel', 'kernel_sse2.cpp')) +sources.remove(path.join('kernel', 'kernel_sse3.cpp')) incs = [] defs = [] @@ -73,21 +74,29 @@ if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', ' # optimized kernel if env['WITH_BF_RAYOPTIMIZATION']: - optim_cxxflags = Split(env['CXXFLAGS']) + sse2_cxxflags = Split(env['CXXFLAGS']) + sse3_cxxflags = Split(env['CXXFLAGS']) if env['OURPLATFORM'] == 'win32-vc': - optim_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split()) + sse2_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split()) + sse3_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split()) elif env['OURPLATFORM'] == 'win64-vc': - optim_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split()) + sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split()) + sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split()) else: - optim_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mfpmath=sse'.split()) + sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split()) + sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mfpmath=sse'.split()) defs.append('WITH_OPTIMIZED_KERNEL') optim_defs = defs[:] - optim_sources = [path.join('kernel', 'kernel_optimized.cpp')] - cycles_optim = cycles.Clone() - 
cycles_optim.BlenderLib('bf_intern_cycles_optimized', optim_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=optim_cxxflags) + cycles_sse3 = cycles.Clone() + sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')] + cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags) + + cycles_sse2 = cycles.Clone() + sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')] + cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags) cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], cxx_compileflags=cxxflags) diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index a1d7706a34e..1915245bb55 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -58,7 +58,8 @@ public: #endif /* do now to avoid thread issues */ - system_cpu_support_optimized(); + system_cpu_support_sse2(); + system_cpu_support_sse3(); } ~CPUDevice() @@ -170,7 +171,7 @@ public: int end_sample = tile.start_sample + tile.num_samples; #ifdef WITH_OPTIMIZED_KERNEL - if(system_cpu_support_optimized()) { + if(system_cpu_support_sse3()) { /* SSE3 implies SSE2, so test SSE3 first */ for(int sample = start_sample; sample < end_sample; sample++) { if (task.get_cancel() || task_pool.cancelled()) { if(task.need_finish_queue == false) @@ -179,7 +180,26 @@ public: for(int y = tile.y; y < tile.y + tile.h; y++) { for(int x = tile.x; x < tile.x + tile.w; x++) { - kernel_cpu_optimized_path_trace(&kg, render_buffer, rng_state, + kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state, + sample, x, y, tile.offset, tile.stride); + } + } + + tile.sample = sample + 1; + + task.update_progress(tile); + } + } + else if(system_cpu_support_sse2()) { + for(int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || task_pool.cancelled()) 
{ + if(task.need_finish_queue == false) + break; + } + + for(int y = tile.y; y < tile.y + tile.h; y++) { + for(int x = tile.x; x < tile.x + tile.w; x++) { + kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state, sample, x, y, tile.offset, tile.stride); } } @@ -227,10 +247,16 @@ public: void thread_tonemap(DeviceTask& task) { #ifdef WITH_OPTIMIZED_KERNEL - if(system_cpu_support_optimized()) { + if(system_cpu_support_sse3()) { /* SSE3 implies SSE2, so test SSE3 first */ + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_sse3_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer, + task.sample, task.resolution, x, y, task.offset, task.stride); + } + else if(system_cpu_support_sse2()) { for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_optimized_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer, + kernel_cpu_sse2_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer, task.sample, task.resolution, x, y, task.offset, task.stride); } else @@ -252,9 +278,17 @@ public: #endif #ifdef WITH_OPTIMIZED_KERNEL - if(system_cpu_support_optimized()) { + if(system_cpu_support_sse3()) { /* SSE3 implies SSE2, so test SSE3 first */ + for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { + kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + + if(task_pool.cancelled()) + break; + } + } + else if(system_cpu_support_sse2()) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { - kernel_cpu_optimized_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); if(task_pool.cancelled()) break; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 6d5b9a063a0..e83756b7c8a 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ 
b/intern/cycles/kernel/CMakeLists.txt @@ -12,7 +12,8 @@ set(INC_SYS set(SRC kernel.cpp - kernel_optimized.cpp + kernel_sse2.cpp + kernel_sse3.cpp kernel.cl kernel.cu ) @@ -149,7 +150,8 @@ include_directories(SYSTEM ${INC_SYS}) add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS}) if(WITH_CYCLES_OPTIMIZED_KERNEL) - set_source_files_properties(kernel_optimized.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_OPTIMIZED_KERNEL_FLAGS}") + set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") endif() if(WITH_CYCLES_CUDA) diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 26c0bcd6d1a..20ea5a61906 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -44,11 +44,18 @@ void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i); #ifdef WITH_OPTIMIZED_KERNEL -void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, +void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride); -void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, +void kernel_cpu_sse2_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int resolution, int x, int y, int offset, int stride); -void kernel_cpu_optimized_shader(KernelGlobals *kg, uint4 *input, float4 *output, +void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, + int type, int i); + +void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, + int sample, int x, int y, int offset, int stride); +void kernel_cpu_sse3_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, + int sample, int resolution, int x, int y, int offset, int stride); +void 
kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i); #endif diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h index 1a85b5bbefd..2b9ebf35d0c 100644 --- a/intern/cycles/kernel/kernel_bvh.h +++ b/intern/cycles/kernel/kernel_bvh.h @@ -126,21 +126,21 @@ __device_inline void bvh_node_intersect(KernelGlobals *kg, /* intersect ray against child nodes */ float3 ood = P * idir; - float c0lox = n0xy.x * idir.x - ood.x; - float c0hix = n0xy.y * idir.x - ood.x; - float c0loy = n0xy.z * idir.y - ood.y; - float c0hiy = n0xy.w * idir.y - ood.y; - float c0loz = nz.x * idir.z - ood.z; - float c0hiz = nz.y * idir.z - ood.z; + NO_EXTENDED_PRECISION float c0lox = n0xy.x * idir.x - ood.x; + NO_EXTENDED_PRECISION float c0hix = n0xy.y * idir.x - ood.x; + NO_EXTENDED_PRECISION float c0loy = n0xy.z * idir.y - ood.y; + NO_EXTENDED_PRECISION float c0hiy = n0xy.w * idir.y - ood.y; + NO_EXTENDED_PRECISION float c0loz = nz.x * idir.z - ood.z; + NO_EXTENDED_PRECISION float c0hiz = nz.y * idir.z - ood.z; NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - float c1loz = nz.z * idir.z - ood.z; - float c1hiz = nz.w * idir.z - ood.z; - float c1lox = n1xy.x * idir.x - ood.x; - float c1hix = n1xy.y * idir.x - ood.x; - float c1loy = n1xy.z * idir.y - ood.y; - float c1hiy = n1xy.w * idir.y - ood.y; + NO_EXTENDED_PRECISION float c1loz = nz.z * idir.z - ood.z; + NO_EXTENDED_PRECISION float c1hiz = nz.w * idir.z - ood.z; + NO_EXTENDED_PRECISION float c1lox = n1xy.x * idir.x - ood.x; + NO_EXTENDED_PRECISION float c1hix = n1xy.y * idir.x - ood.x; + NO_EXTENDED_PRECISION float c1loy = n1xy.z * idir.y - ood.y; + NO_EXTENDED_PRECISION float c1hiy = n1xy.w * idir.y - ood.y; NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); 
NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); diff --git a/intern/cycles/kernel/kernel_optimized.cpp b/intern/cycles/kernel/kernel_optimized.cpp deleted file mode 100644 index 0b662095133..00000000000 --- a/intern/cycles/kernel/kernel_optimized.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2011, Blender Foundation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE3 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. 
*/ - -#ifdef WITH_OPTIMIZED_KERNEL - -#include "kernel.h" -#include "kernel_compat_cpu.h" -#include "kernel_math.h" -#include "kernel_types.h" -#include "kernel_globals.h" -#include "kernel_film.h" -#include "kernel_path.h" -#include "kernel_displace.h" - -CCL_NAMESPACE_BEGIN - -/* Path Tracing */ - -void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride) -{ - kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); -} - -/* Tonemapping */ - -void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int resolution, int x, int y, int offset, int stride) -{ - kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y, offset, stride); -} - -/* Shader Evaluate */ - -void kernel_cpu_optimized_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) -{ - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); -} - -CCL_NAMESPACE_END - -#endif - diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp new file mode 100644 index 00000000000..7947107a43c --- /dev/null +++ b/intern/cycles/kernel/kernel_sse2.cpp @@ -0,0 +1,60 @@ +/* + * Copyright 2011, Blender Foundation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without them for other CPUs. */ + +#ifdef WITH_OPTIMIZED_KERNEL + +#include "kernel.h" +#include "kernel_compat_cpu.h" +#include "kernel_math.h" +#include "kernel_types.h" +#include "kernel_globals.h" +#include "kernel_film.h" +#include "kernel_path.h" +#include "kernel_displace.h" + +CCL_NAMESPACE_BEGIN + +/* Path Tracing */ + +void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride) +{ + kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); +} + +/* Tonemapping */ + +void kernel_cpu_sse2_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int resolution, int x, int y, int offset, int stride) +{ + kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y, offset, stride); +} + +/* Shader Evaluate */ + +void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +{ + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); +} + +CCL_NAMESPACE_END + +#endif + diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp new file mode 100644 index 00000000000..9a8b389cf68 --- /dev/null +++ b/intern/cycles/kernel/kernel_sse3.cpp @@ -0,0 +1,60 @@ +/* + * Copyright 2011, Blender Foundation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#ifdef WITH_OPTIMIZED_KERNEL + +#include "kernel.h" +#include "kernel_compat_cpu.h" +#include "kernel_math.h" +#include "kernel_types.h" +#include "kernel_globals.h" +#include "kernel_film.h" +#include "kernel_path.h" +#include "kernel_displace.h" + +CCL_NAMESPACE_BEGIN + +/* Path Tracing */ + +void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride) +{ + kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); +} + +/* Tonemapping */ + +void kernel_cpu_sse3_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int resolution, int x, int y, int offset, int stride) +{ + kernel_film_tonemap(kg, rgba, buffer, sample, resolution, x, y, offset, stride); +} + +/* Shader Evaluate */ + +void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +{ + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); +} + +CCL_NAMESPACE_END + +#endif + diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp index 2d9f0fffae6..4fda090e09e 100644 --- a/intern/cycles/util/util_system.cpp +++ b/intern/cycles/util/util_system.cpp @@ -136,7 +136,7 @@ struct CPUCapabilities { bool fma4; }; -bool system_cpu_support_optimized() +static 
CPUCapabilities& system_cpu_capabilities() { static CPUCapabilities caps; static bool caps_init = false; @@ -182,7 +182,18 @@ bool system_cpu_support_optimized() caps_init = true; } - /* optimization flags use these */ + return caps; +} + +bool system_cpu_support_sse2() +{ + CPUCapabilities& caps = system_cpu_capabilities(); + return caps.sse && caps.sse2; +} + +bool system_cpu_support_sse3() +{ + CPUCapabilities& caps = system_cpu_capabilities(); return caps.sse && caps.sse2 && caps.sse3; } diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h index f25e009a250..257112883d1 100644 --- a/intern/cycles/util/util_system.h +++ b/intern/cycles/util/util_system.h @@ -26,7 +26,8 @@ CCL_NAMESPACE_BEGIN int system_cpu_thread_count(); string system_cpu_brand_string(); int system_cpu_bits(); -bool system_cpu_support_optimized(); +bool system_cpu_support_sse2(); +bool system_cpu_support_sse3(); CCL_NAMESPACE_END -- cgit v1.2.3