git.blender.org/blender.git

author     Sergey Sharybin <sergey.vfx@gmail.com>   2018-02-14 13:23:30 +0300
committer  Sergey Sharybin <sergey.vfx@gmail.com>   2018-08-29 16:03:09 +0300
commit     73f20560529457ea177cb93e8e8eaaf44a589643 (patch)
tree       45ea2ebad9adabcedd7833629421909ede9f6fb5 /intern/cycles/util
parent     66f8a4c07e8a5fc166579101933264b8425a7cd1 (diff)

Cycles: Add BVH8 and packeted triangle intersection

This is an initial implementation of the BVH8 optimization structure and
packeted triangle intersection. The aim is to get faster ray-to-scene
intersection checks. Render times (mm:ss):

    Scene                  BVH4       BVH8
    barbershop_interior    10:24.94   10:10.74
    bmw27                  02:41.25   02:38.83
    classroom              08:16.49   07:56.15
    fishy_cat              04:24.56   04:17.29
    koro                   06:03.06   06:01.45
    pavillon_barcelona     09:21.26   09:02.98
    victor                 23:39.65   22:53.71

Peak memory usage rises by about 4.7% in complex scenes.

Note that BVH8 is disabled when using OSL, because the OSL kernel does not
get per-microarchitecture optimizations and hence always assumes BVH4 is
used.

Original BVH8 patch from Anton Gavrikov.
Batched triangle intersection from Victoria Zhislina.
Extra work, tests and fixes from Maxym Dmytrychenko.
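
As a rough illustration only (not code from this patch): the gain from testing
triangles in packets of eight is that a single AVX comparison plus a blend can
update eight candidate hit distances at once. A minimal sketch of that idiom
with plain AVX intrinsics, independent of the Cycles types added below:

    #include <immintrin.h>

    /* Illustration only: keep the nearer of eight candidate hit distances,
     * which is the core idiom behind a packeted triangle test. */
    static inline __m256 keep_nearest(__m256 t_candidate, __m256 t_best)
    {
        /* Per-lane mask, all bits set where the candidate is closer. */
        const __m256 closer = _mm256_cmp_ps(t_candidate, t_best, _CMP_LT_OS);
        /* Per-lane select: candidate where closer, previous best otherwise. */
        return _mm256_blendv_ps(t_best, t_candidate, closer);
    }
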
Diffstat (limited to 'intern/cycles/util')
 intern/cycles/util/CMakeLists.txt            |   5
 intern/cycles/util/util_avxb.h               | 192
 intern/cycles/util/util_avxf.h               |  51
 intern/cycles/util/util_debug.cpp            |  14
 intern/cycles/util/util_types.h              |   5
 intern/cycles/util/util_types_float8.h       |  70
 intern/cycles/util/util_types_float8_impl.h  | 113
 7 files changed, 446 insertions(+), 4 deletions(-)
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 508f44e7c4d..291f9a9fcae 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -78,6 +78,7 @@ set(SRC_HEADERS
util_sky_model.h
util_sky_model_data.h
util_avxf.h
+ util_avxb.h
util_sseb.h
util_ssef.h
util_ssei.h
@@ -98,7 +99,9 @@ set(SRC_HEADERS
util_types_float3_impl.h
util_types_float4.h
util_types_float4_impl.h
- util_types_int2.h
+ util_types_float8.h
+ util_types_float8_impl.h
+ util_types_int2.h
util_types_int2_impl.h
util_types_int3.h
util_types_int3_impl.h
diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h
new file mode 100644
index 00000000000..865549be283
--- /dev/null
+++ b/intern/cycles/util/util_avxb.h
@@ -0,0 +1,192 @@
+/*
+ * Copyright 2011-2013 Intel Corporation
+ * Modifications Copyright 2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0(the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_AVXB_H__
+#define __UTIL_AVXB_H__
+
+CCL_NAMESPACE_BEGIN
+
+struct avxf;
+
+/*! 8-wide AVX bool type. */
+struct avxb
+{
+ typedef avxb Mask; // mask type
+ typedef avxf Float; // float type
+
+ enum { size = 8 }; // number of SIMD elements
+ union { __m256 m256; int32_t v[8]; }; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxb ( ) {}
+ __forceinline avxb ( const avxb& other ) { m256 = other.m256; }
+ __forceinline avxb& operator=( const avxb& other ) { m256 = other.m256; return *this; }
+
+ __forceinline avxb( const __m256 input ) : m256(input) {}
+ __forceinline operator const __m256&( void ) const { return m256; }
+ __forceinline operator const __m256i( void ) const { return _mm256_castps_si256(m256); }
+ __forceinline operator const __m256d( void ) const { return _mm256_castps_pd(m256); }
+
+ //__forceinline avxb ( bool a )
+ // : m256(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
+ //__forceinline avxb ( bool a, bool b)
+ // : m256(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+ //__forceinline avxb ( bool a, bool b, bool c, bool d)
+ // : m256(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+ //__forceinline avxb(int mask) {
+ // assert(mask >= 0 && mask < 16);
+ // m128 = _mm_lookupmask_ps[mask];
+ //}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxb( FalseTy ) : m256(_mm256_setzero_ps()) {}
+ __forceinline avxb( TrueTy ) : m256(_mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_setzero_si256(), _mm256_setzero_si256()))) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator []( const size_t i ) const { assert(i < 8); return (_mm256_movemask_ps(m256) >> i) & 1; }
+ __forceinline int32_t& operator []( const size_t i ) { assert(i < 8); return v[i]; }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator !( const avxb& a ) { return _mm256_xor_ps(a, avxb(True)); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator &( const avxb& a, const avxb& b ) { return _mm256_and_ps(a, b); }
+__forceinline const avxb operator |( const avxb& a, const avxb& b ) { return _mm256_or_ps (a, b); }
+__forceinline const avxb operator ^( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Assignment Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator &=( avxb& a, const avxb& b ) { return a = a & b; }
+__forceinline const avxb operator |=( avxb& a, const avxb& b ) { return a = a | b; }
+__forceinline const avxb operator ^=( avxb& a, const avxb& b ) { return a = a ^ b; }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators + Select
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb operator !=( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); }
+__forceinline const avxb operator ==( const avxb& a, const avxb& b ) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); }
+
+__forceinline const avxb select( const avxb& m, const avxb& t, const avxb& f ) {
+#if defined(__KERNEL_SSE41__)
+ return _mm256_blendv_ps(f, t, m);
+#else
+ return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxb unpacklo( const avxb& a, const avxb& b ) { return _mm256_unpacklo_ps(a, b); }
+__forceinline const avxb unpackhi( const avxb& a, const avxb& b ) { return _mm256_unpackhi_ps(a, b); }
+
+#define _MM256_SHUFFLE(fp7,fp6,fp5,fp4,fp3,fp2,fp1,fp0) (((fp7) << 14) | ((fp6) << 12) | ((fp5) << 10) | ((fp4) << 8) | \
+ ((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+template<size_t i0, size_t i1, size_t i2, size_t i3, size_t i4, size_t i5, size_t i6, size_t i7>
+__forceinline const avxb shuffle( const avxb& a ) {
+ return _mm256_cvtepi32_ps(_mm256_shuffle_epi32(a, _MM256_SHUFFLE(i7, i6, i5, i4, i3, i2, i1, i0)));
+}
+
+/*
+template<> __forceinline const avxb shuffle<0, 1, 0, 1, 0, 1, 0, 1>( const avxb& a ) {
+ return _mm_movelh_ps(a, a);
+}
+
+template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a ) {
+ return _mm_movehl_ps(a, a);
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a, const sseb& b ) {
+ return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a, const sseb& b ) {
+ return _mm_movelh_ps(a, b);
+}
+
+template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a, const sseb& b ) {
+ return _mm_movehl_ps(b, a);
+}
+
+#if defined(__KERNEL_SSE3__)
+template<> __forceinline const sseb shuffle<0, 0, 2, 2>( const sseb& a ) { return _mm_moveldup_ps(a); }
+template<> __forceinline const sseb shuffle<1, 1, 3, 3>( const sseb& a ) { return _mm_movehdup_ps(a); }
+#endif
+
+#if defined(__KERNEL_SSE41__)
+template<size_t dst, size_t src, size_t clr> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
+template<size_t dst, size_t src> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return insert<dst, src, 0>(a, b); }
+template<size_t dst> __forceinline const sseb insert( const sseb& a, const bool b ) { return insert<dst,0>(a, sseb(b)); }
+#endif
+*/
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reduction Operations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_SSE41__)
+__forceinline size_t popcnt( const avxb& a ) { return __popcnt(_mm256_movemask_ps(a)); }
+#else
+__forceinline size_t popcnt( const avxb& a ) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3])+bool(a[4])+
+ bool(a[5])+bool(a[6])+bool(a[7]); }
+#endif
+
+__forceinline bool reduce_and( const avxb& a ) { return _mm256_movemask_ps(a) == 0xff; }
+__forceinline bool reduce_or ( const avxb& a ) { return _mm256_movemask_ps(a) != 0x0; }
+__forceinline bool all ( const avxb& b ) { return _mm256_movemask_ps(b) == 0xff; }
+__forceinline bool any ( const avxb& b ) { return _mm256_movemask_ps(b) != 0x0; }
+__forceinline bool none ( const avxb& b ) { return _mm256_movemask_ps(b) == 0x0; }
+
+__forceinline size_t movemask( const avxb& a ) { return _mm256_movemask_ps(a); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Debug Functions
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_avxb(const char *label, const avxb &a)
+{
+ printf("%s: %d %d %d %d %d %d %d %d\n",
+ label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
+//#endif
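
For context only (not part of this commit): avxb is essentially a thin wrapper
over an 8-bit lane mask, and the any()/all()/popcnt() reductions above boil
down to _mm256_movemask_ps. A standalone sketch of that idiom with raw
intrinsics, assuming a GCC/Clang build with AVX enabled:

    #include <immintrin.h>
    #include <cstdio>

    /* Illustration only: the movemask-based reductions that avxb wraps. */
    int main()
    {
        const __m256 a = _mm256_set_ps(8, 7, 6, 5, 4, 3, 2, 1);
        const __m256 b = _mm256_set1_ps(4.5f);
        const __m256 m = _mm256_cmp_ps(a, b, _CMP_LT_OS); /* lanes where a < b */
        const int bits = _mm256_movemask_ps(m);           /* one bit per lane */
        printf("any=%d all=%d count=%d\n",
               bits != 0, bits == 0xff, __builtin_popcount(bits));
        return 0;
    }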
+
diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h
index 2451213963a..81365f96c94 100644
--- a/intern/cycles/util/util_avxf.h
+++ b/intern/cycles/util/util_avxf.h
@@ -19,7 +19,8 @@
CCL_NAMESPACE_BEGIN
-#ifdef __KERNEL_AVX__
+struct avxb;
+
struct avxf
{
typedef avxf Float;
@@ -53,6 +54,9 @@ struct avxf
__forceinline avxf(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) :
m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0)) {}
+ __forceinline avxf(float3 a) :
+ m256(_mm256_set_ps(a.w, a.z, a.y, a.x, a.w, a.z, a.y, a.x)) {}
+
__forceinline avxf(int a3, int a2, int a1, int a0)
{
@@ -73,8 +77,24 @@ struct avxf
m256 = _mm256_insertf128_ps(foo, b, 1);
}
+ __forceinline const float& operator [](const size_t i) const { assert(i < 8); return f[i]; }
+ __forceinline float& operator [](const size_t i) { assert(i < 8); return f[i]; }
};
+__forceinline avxf cross(const avxf& a, const avxf& b)
+{
+ avxf r(0.0, a[4]*b[5] - a[5]*b[4], a[6]*b[4] - a[4]*b[6], a[5]*b[6] - a[6]*b[5],
+ 0.0, a[0]*b[1] - a[1]*b[0], a[2]*b[0] - a[0]*b[2], a[1]*b[2] - a[2]*b[1]);
+ return r;
+}
+
+__forceinline void dot3(const avxf& a, const avxf& b, float &den, float &den2)
+{
+ const avxf t = _mm256_mul_ps(a.m256, b.m256);
+ den = ((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2];
+ den2 = ((float*)&t)[4] + ((float*)&t)[5] + ((float*)&t)[6];
+}
+
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
@@ -107,6 +127,9 @@ __forceinline const avxf operator^(const avxf& a, const avxf& b) { return _mm256
__forceinline const avxf operator&(const avxf& a, const avxf& b) { return _mm256_and_ps(a.m256,b.m256); }
+__forceinline const avxf max(const avxf& a, const avxf& b) { return _mm256_max_ps(a.m256, b.m256); }
+__forceinline const avxf min(const avxf& a, const avxf& b) { return _mm256_min_ps(a.m256, b.m256); }
+
////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
@@ -160,6 +183,18 @@ ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
return blend<S0,S1,S2,S3,S0,S1,S2,S3>(a,b);
}
+//#if defined(__KERNEL_SSE41__)
+__forceinline avxf maxi(const avxf& a, const avxf& b) {
+ const avxf ci = _mm256_max_ps(a, b);
+ return ci;
+}
+
+__forceinline avxf mini(const avxf& a, const avxf& b) {
+ const avxf ci = _mm256_min_ps(a, b);
+ return ci;
+}
+//#endif
+
////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
@@ -178,6 +213,19 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
return c-(a*b);
#endif
}
+__forceinline const avxf msub(const avxf& a, const avxf& b, const avxf& c) {
+ return _mm256_fmsub_ps(a, b, c);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators
+////////////////////////////////////////////////////////////////////////////////
+#ifdef __KERNEL_AVX2__
+__forceinline const avxb operator <=(const avxf& a, const avxf& b) {
+ return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
+}
+#endif
+
#endif
#ifndef _mm256_set_m128
@@ -190,4 +238,3 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
CCL_NAMESPACE_END
-#endif
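
For context only (not part of this commit): cross() and dot3() above treat an
avxf as two float3 values packed into lanes 0-2 and 4-6, so one 8-wide multiply
serves two 3-component dot products. A standalone sketch of the same idea with
raw intrinsics:

    #include <immintrin.h>

    /* Illustration only: two independent float3 dot products from one 8-wide
     * multiply, mirroring what dot3() computes for avxf. */
    static inline void dot3_pair(__m256 a, __m256 b, float *dot_lo, float *dot_hi)
    {
        float f[8];
        _mm256_storeu_ps(f, _mm256_mul_ps(a, b));
        *dot_lo = f[0] + f[1] + f[2]; /* first float3: lanes 0..2 */
        *dot_hi = f[4] + f[5] + f[6]; /* second float3: lanes 4..6 */
    }
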
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 9a66a372822..a761c9c46de 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -57,7 +57,19 @@ void DebugFlags::CPU::reset()
#undef STRINGIFY
#undef CHECK_CPU_FLAGS
- bvh_layout = BVH_LAYOUT_DEFAULT;
+ if (getenv("CYCLES_BVH2") != NULL) {
+ bvh_layout = BVH_LAYOUT_BVH2;
+ }
+ else if (getenv("CYCLES_BVH4") != NULL) {
+ bvh_layout = BVH_LAYOUT_BVH4;
+ }
+ else if (getenv("CYCLES_BVH8") != NULL) {
+ bvh_layout = BVH_LAYOUT_BVH8;
+ }
+ else {
+ bvh_layout = BVH_LAYOUT_DEFAULT;
+ }
+
split_kernel = false;
}
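
The hunk above adds a debug-only override: setting CYCLES_BVH2, CYCLES_BVH4 or
CYCLES_BVH8 in the environment (to any value) before starting Blender forces
that layout, with BVH2 taking precedence if several are set. A minimal
standalone sketch of the same precedence logic, for illustration only:

    #include <cstdio>
    #include <cstdlib>

    /* Illustration only: mirrors the precedence of the getenv() checks above. */
    static const char *bvh_layout_override()
    {
        if (getenv("CYCLES_BVH2") != NULL) return "BVH2";
        if (getenv("CYCLES_BVH4") != NULL) return "BVH4";
        if (getenv("CYCLES_BVH8") != NULL) return "BVH8";
        return "DEFAULT";
    }

    int main()
    {
        printf("BVH layout: %s\n", bvh_layout_override());
        return 0;
    }
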
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index dfe755a8789..96c549b9be5 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -121,6 +121,7 @@ CCL_NAMESPACE_END
#include "util/util_types_float2.h"
#include "util/util_types_float3.h"
#include "util/util_types_float4.h"
+#include "util/util_types_float8.h"
#include "util/util_types_vector3.h"
@@ -140,6 +141,7 @@ CCL_NAMESPACE_END
#include "util/util_types_float2_impl.h"
#include "util/util_types_float3_impl.h"
#include "util/util_types_float4_impl.h"
+#include "util/util_types_float8_impl.h"
#include "util/util_types_vector3_impl.h"
@@ -148,7 +150,10 @@ CCL_NAMESPACE_END
# include "util/util_sseb.h"
# include "util/util_ssei.h"
# include "util/util_ssef.h"
+#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+# include "util/util_avxb.h"
# include "util/util_avxf.h"
#endif
+#endif
#endif /* __UTIL_TYPES_H__ */
diff --git a/intern/cycles/util/util_types_float8.h b/intern/cycles/util/util_types_float8.h
new file mode 100644
index 00000000000..2e693d49634
--- /dev/null
+++ b/intern/cycles/util/util_types_float8.h
@@ -0,0 +1,70 @@
+/*
+Copyright (c) 2017, Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of Intel Corporation nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __UTIL_TYPES_FLOAT8_H__
+#define __UTIL_TYPES_FLOAT8_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+
+struct ccl_try_align(16) float8 {
+#ifdef __KERNEL_AVX2__
+ union {
+ __m256 m256;
+ struct { float a, b, c, d, e, f, g, h; };
+ };
+
+ __forceinline float8();
+ __forceinline float8(const float8& a);
+ __forceinline explicit float8(const __m256& a);
+
+ __forceinline operator const __m256&(void) const;
+ __forceinline operator __m256&(void);
+
+ __forceinline float8& operator =(const float8& a);
+
+#else /* __KERNEL_AVX2__ */
+ float a, b, c, d, e, f, g, h;
+#endif /* __KERNEL_AVX2__ */
+
+ __forceinline float operator[](int i) const;
+ __forceinline float& operator[](int i);
+};
+
+ccl_device_inline float8 make_float8(float f);
+ccl_device_inline float8 make_float8(float a, float b, float c, float d,
+ float e, float f, float g, float h);
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_FLOAT8_H__ */
diff --git a/intern/cycles/util/util_types_float8_impl.h b/intern/cycles/util/util_types_float8_impl.h
new file mode 100644
index 00000000000..4fac03569e9
--- /dev/null
+++ b/intern/cycles/util/util_types_float8_impl.h
@@ -0,0 +1,113 @@
+/*
+Copyright (c) 2017, Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of Intel Corporation nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __UTIL_TYPES_FLOAT8_IMPL_H__
+#define __UTIL_TYPES_FLOAT8_IMPL_H__
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util_types.h instead."
+#endif
+
+#ifndef __KERNEL_GPU__
+# include <cstdio>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+#ifdef __KERNEL_AVX2__
+__forceinline float8::float8()
+{
+}
+
+__forceinline float8::float8(const float8& f)
+ : m256(f.m256)
+{
+}
+
+__forceinline float8::float8(const __m256& f)
+ : m256(f)
+{
+}
+
+__forceinline float8::operator const __m256&(void) const
+{
+ return m256;
+}
+
+__forceinline float8::operator __m256&(void)
+{
+ return m256;
+}
+
+__forceinline float8& float8::operator =(const float8& f)
+{
+ m256 = f.m256;
+ return *this;
+}
+#endif /* __KERNEL_AVX2__ */
+
+__forceinline float float8::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 8);
+ return *(&a + i);
+}
+
+__forceinline float& float8::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 8);
+ return *(&a + i);
+}
+
+ccl_device_inline float8 make_float8(float f)
+{
+#ifdef __KERNEL_AVX2__
+ float8 r(_mm256_set1_ps(f));
+#else
+ float8 r = {f, f, f, f, f, f, f, f};
+#endif
+ return r;
+}
+
+ccl_device_inline float8 make_float8(float a, float b, float c, float d,
+ float e, float f, float g, float h)
+{
+#ifdef __KERNEL_AVX2__
+ float8 r(_mm256_set_ps(a, b, c, d, e, f, g, h));
+#else
+ float8 r = {a, b, c, d, e, f, g, h};
+#endif
+ return r;
+}
+
+#endif /* __KERNEL_GPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_FLOAT8_IMPL_H__ */
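
For illustration only (not part of the patch), a hedged sketch of how the new
float8 type behaves on the scalar (non-AVX2) fallback path, where
make_float8(a, ..., h) is a plain aggregate and operator[] indexes a..h in
order. Note that the AVX2 path builds the register with _mm256_set_ps, which
fills lanes in reverse argument order, so lane indexing differs there.

    /* Hypothetical usage sketch, assuming "util/util_types.h" is included and
     * __KERNEL_AVX2__ is not defined (scalar fallback path). */
    ccl_device_inline float sum_of_products()
    {
        const float8 ones = make_float8(1.0f);
        const float8 ramp = make_float8(0.0f, 1.0f, 2.0f, 3.0f,
                                        4.0f, 5.0f, 6.0f, 7.0f);
        float sum = 0.0f;
        for (int i = 0; i < 8; i++) {
            sum += ones[i] * ramp[i]; /* operator[] defined above */
        }
        return sum; /* 1*0 + 1*1 + ... + 1*7 == 28.0f */
    }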