Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/util/math_float8.h')
-rw-r--r--intern/cycles/util/math_float8.h483
1 files changed, 273 insertions, 210 deletions
diff --git a/intern/cycles/util/math_float8.h b/intern/cycles/util/math_float8.h
index b538cfbe70b..755a720a10b 100644
--- a/intern/cycles/util/math_float8.h
+++ b/intern/cycles/util/math_float8.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
* Copyright 2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT8_H__
@@ -10,193 +11,138 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-ccl_device_inline float8_t operator+(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator+(const float8_t a, const float f);
-ccl_device_inline float8_t operator+(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator-(const float8_t a);
-ccl_device_inline float8_t operator-(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator-(const float8_t a, const float f);
-ccl_device_inline float8_t operator-(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator*(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator*(const float8_t a, const float f);
-ccl_device_inline float8_t operator*(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator/(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator/(const float8_t a, float f);
-ccl_device_inline float8_t operator/(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator+=(float8_t a, const float8_t b);
-
-ccl_device_inline float8_t operator*=(float8_t a, const float8_t b);
-ccl_device_inline float8_t operator*=(float8_t a, float f);
-
-ccl_device_inline float8_t operator/=(float8_t a, float f);
-
-ccl_device_inline bool operator==(const float8_t a, const float8_t b);
-
-ccl_device_inline float8_t rcp(const float8_t a);
-ccl_device_inline float8_t sqrt(const float8_t a);
-ccl_device_inline float8_t sqr(const float8_t a);
-ccl_device_inline bool is_zero(const float8_t a);
-ccl_device_inline float average(const float8_t a);
-ccl_device_inline float8_t min(const float8_t a, const float8_t b);
-ccl_device_inline float8_t max(const float8_t a, const float8_t b);
-ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx);
-ccl_device_inline float8_t fabs(const float8_t a);
-ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t);
-ccl_device_inline float8_t saturate(const float8_t a);
-
-ccl_device_inline float8_t safe_divide(const float8_t a, const float b);
-ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b);
-
-ccl_device_inline float reduce_min(const float8_t a);
-ccl_device_inline float reduce_max(const float8_t a);
-ccl_device_inline float reduce_add(const float8_t a);
-
-ccl_device_inline bool isequal(const float8_t a, const float8_t b);
-
-/*******************************************************************************
- * Definition.
- */
-
-ccl_device_inline float8_t zero_float8_t()
+ccl_device_inline vfloat8 zero_vfloat8()
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_setzero_ps());
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_setzero_ps());
#else
- return make_float8_t(0.0f);
+ return make_vfloat8(0.0f);
#endif
}
-ccl_device_inline float8_t one_float8_t()
+ccl_device_inline vfloat8 one_vfloat8()
{
- return make_float8_t(1.0f);
+ return make_vfloat8(1.0f);
}
-ccl_device_inline float8_t operator+(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator+(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_add_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_add_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
#endif
}
-ccl_device_inline float8_t operator+(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator+(const vfloat8 a, const float f)
{
- return a + make_float8_t(f);
+ return a + make_vfloat8(f);
}
-ccl_device_inline float8_t operator+(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator+(const float f, const vfloat8 a)
{
- return make_float8_t(f) + a;
+ return make_vfloat8(f) + a;
}
-ccl_device_inline float8_t operator-(const float8_t a)
+ccl_device_inline vfloat8 operator-(const vfloat8 a)
{
-#ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX__
__m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
- return float8_t(_mm256_xor_ps(a.m256, mask));
+ return vfloat8(_mm256_xor_ps(a.m256, mask));
#else
- return make_float8_t(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
+ return make_vfloat8(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
#endif
}
-ccl_device_inline float8_t operator-(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator-(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_sub_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_sub_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
#endif
}
-ccl_device_inline float8_t operator-(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator-(const vfloat8 a, const float f)
{
- return a - make_float8_t(f);
+ return a - make_vfloat8(f);
}
-ccl_device_inline float8_t operator-(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator-(const float f, const vfloat8 a)
{
- return make_float8_t(f) - a;
+ return make_vfloat8(f) - a;
}
-ccl_device_inline float8_t operator*(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator*(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_mul_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_mul_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a * b.a, a.b * b.b, a.c * b.c, a.d * b.d, a.e * b.e, a.f * b.f, a.g * b.g, a.h * b.h);
#endif
}
-ccl_device_inline float8_t operator*(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator*(const vfloat8 a, const float f)
{
- return a * make_float8_t(f);
+ return a * make_vfloat8(f);
}
-ccl_device_inline float8_t operator*(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator*(const float f, const vfloat8 a)
{
- return make_float8_t(f) * a;
+ return make_vfloat8(f) * a;
}
-ccl_device_inline float8_t operator/(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator/(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_div_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_div_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a / b.a, a.b / b.b, a.c / b.c, a.d / b.d, a.e / b.e, a.f / b.f, a.g / b.g, a.h / b.h);
#endif
}
-ccl_device_inline float8_t operator/(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator/(const vfloat8 a, const float f)
{
- return a / make_float8_t(f);
+ return a / make_vfloat8(f);
}
-ccl_device_inline float8_t operator/(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator/(const float f, const vfloat8 a)
{
- return make_float8_t(f) / a;
+ return make_vfloat8(f) / a;
}
-ccl_device_inline float8_t operator+=(float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator+=(vfloat8 a, const vfloat8 b)
{
return a = a + b;
}
-ccl_device_inline float8_t operator-=(float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator-=(vfloat8 a, const vfloat8 b)
{
return a = a - b;
}
-ccl_device_inline float8_t operator*=(float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator*=(vfloat8 a, const vfloat8 b)
{
return a = a * b;
}
-ccl_device_inline float8_t operator*=(float8_t a, float f)
+ccl_device_inline vfloat8 operator*=(vfloat8 a, float f)
{
return a = a * f;
}
-ccl_device_inline float8_t operator/=(float8_t a, float f)
+ccl_device_inline vfloat8 operator/=(vfloat8 a, float f)
{
return a = a / f;
}
-ccl_device_inline bool operator==(const float8_t a, const float8_t b)
+ccl_device_inline bool operator==(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX__
return (_mm256_movemask_ps(_mm256_castsi256_ps(
_mm256_cmpeq_epi32(_mm256_castps_si256(a.m256), _mm256_castps_si256(b.m256)))) &
0b11111111) == 0b11111111;
@@ -206,132 +152,180 @@ ccl_device_inline bool operator==(const float8_t a, const float8_t b)
#endif
}
-ccl_device_inline float8_t rcp(const float8_t a)
+ccl_device_inline const vfloat8 operator^(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_rcp_ps(a.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_xor_ps(a.m256, b.m256));
#else
- return make_float8_t(1.0f / a.a,
- 1.0f / a.b,
- 1.0f / a.c,
- 1.0f / a.d,
- 1.0f / a.e,
- 1.0f / a.f,
- 1.0f / a.g,
- 1.0f / a.h);
+ return make_vfloat8(__uint_as_float(__float_as_uint(a.a) ^ __float_as_uint(b.a)),
+ __uint_as_float(__float_as_uint(a.b) ^ __float_as_uint(b.b)),
+ __uint_as_float(__float_as_uint(a.c) ^ __float_as_uint(b.c)),
+ __uint_as_float(__float_as_uint(a.d) ^ __float_as_uint(b.d)),
+ __uint_as_float(__float_as_uint(a.e) ^ __float_as_uint(b.e)),
+ __uint_as_float(__float_as_uint(a.f) ^ __float_as_uint(b.f)),
+ __uint_as_float(__float_as_uint(a.g) ^ __float_as_uint(b.g)),
+ __uint_as_float(__float_as_uint(a.h) ^ __float_as_uint(b.h)));
#endif
}
-ccl_device_inline float8_t sqrt(const float8_t a)
+ccl_device_inline vfloat8 rcp(const vfloat8 a)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_sqrt_ps(a.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_rcp_ps(a.m256));
#else
- return make_float8_t(sqrtf(a.a),
- sqrtf(a.b),
- sqrtf(a.c),
- sqrtf(a.d),
- sqrtf(a.e),
- sqrtf(a.f),
- sqrtf(a.g),
- sqrtf(a.h));
+ return make_vfloat8(1.0f / a.a,
+ 1.0f / a.b,
+ 1.0f / a.c,
+ 1.0f / a.d,
+ 1.0f / a.e,
+ 1.0f / a.f,
+ 1.0f / a.g,
+ 1.0f / a.h);
#endif
}
-ccl_device_inline float8_t sqr(const float8_t a)
+ccl_device_inline vfloat8 sqrt(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_sqrt_ps(a.m256));
+#else
+ return make_vfloat8(sqrtf(a.a),
+ sqrtf(a.b),
+ sqrtf(a.c),
+ sqrtf(a.d),
+ sqrtf(a.e),
+ sqrtf(a.f),
+ sqrtf(a.g),
+ sqrtf(a.h));
+#endif
+}
+
+ccl_device_inline vfloat8 sqr(const vfloat8 a)
{
return a * a;
}
-ccl_device_inline bool is_zero(const float8_t a)
+ccl_device_inline bool is_zero(const vfloat8 a)
{
- return a == make_float8_t(0.0f);
+ return a == make_vfloat8(0.0f);
}
-ccl_device_inline float average(const float8_t a)
+ccl_device_inline float reduce_add(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ vfloat8 b(_mm256_hadd_ps(a.m256, a.m256));
+ vfloat8 h(_mm256_hadd_ps(b.m256, b.m256));
+ return h[0] + h[4];
+#else
+ return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
+#endif
+}
+
+ccl_device_inline float average(const vfloat8 a)
{
return reduce_add(a) / 8.0f;
}
-ccl_device_inline float8_t min(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 min(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_min_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_min_ps(a.m256, b.m256));
#else
- return make_float8_t(min(a.a, b.a),
- min(a.b, b.b),
- min(a.c, b.c),
- min(a.d, b.d),
- min(a.e, b.e),
- min(a.f, b.f),
- min(a.g, b.g),
- min(a.h, b.h));
+ return make_vfloat8(min(a.a, b.a),
+ min(a.b, b.b),
+ min(a.c, b.c),
+ min(a.d, b.d),
+ min(a.e, b.e),
+ min(a.f, b.f),
+ min(a.g, b.g),
+ min(a.h, b.h));
#endif
}
-ccl_device_inline float8_t max(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 max(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_max_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_max_ps(a.m256, b.m256));
#else
- return make_float8_t(max(a.a, b.a),
- max(a.b, b.b),
- max(a.c, b.c),
- max(a.d, b.d),
- max(a.e, b.e),
- max(a.f, b.f),
- max(a.g, b.g),
- max(a.h, b.h));
+ return make_vfloat8(max(a.a, b.a),
+ max(a.b, b.b),
+ max(a.c, b.c),
+ max(a.d, b.d),
+ max(a.e, b.e),
+ max(a.f, b.f),
+ max(a.g, b.g),
+ max(a.h, b.h));
#endif
}
-ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx)
+ccl_device_inline vfloat8 clamp(const vfloat8 a, const vfloat8 mn, const vfloat8 mx)
{
return min(max(a, mn), mx);
}
-ccl_device_inline float8_t fabs(const float8_t a)
+ccl_device_inline vfloat8 select(const vint8 mask, const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_blendv_ps(b, a, _mm256_castsi256_ps(mask)));
#else
- return make_float8_t(fabsf(a.a),
- fabsf(a.b),
- fabsf(a.c),
- fabsf(a.d),
- fabsf(a.e),
- fabsf(a.f),
- fabsf(a.g),
- fabsf(a.h));
+ return make_vfloat8((mask.a) ? a.a : b.a,
+ (mask.b) ? a.b : b.b,
+ (mask.c) ? a.c : b.c,
+ (mask.d) ? a.d : b.d,
+ (mask.e) ? a.e : b.e,
+ (mask.f) ? a.f : b.f,
+ (mask.g) ? a.g : b.g,
+ (mask.h) ? a.h : b.h);
#endif
}
-ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t)
+ccl_device_inline vfloat8 fabs(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+#else
+ return make_vfloat8(fabsf(a.a),
+ fabsf(a.b),
+ fabsf(a.c),
+ fabsf(a.d),
+ fabsf(a.e),
+ fabsf(a.f),
+ fabsf(a.g),
+ fabsf(a.h));
+#endif
+}
+
+ccl_device_inline vfloat8 mix(const vfloat8 a, const vfloat8 b, float t)
+{
+ return a + t * (b - a);
+}
+
+ccl_device_inline vfloat8 mix(const vfloat8 a, const vfloat8 b, vfloat8 t)
{
return a + t * (b - a);
}
-ccl_device_inline float8_t saturate(const float8_t a)
+ccl_device_inline vfloat8 saturate(const vfloat8 a)
{
- return clamp(a, make_float8_t(0.0f), make_float8_t(1.0f));
+ return clamp(a, make_vfloat8(0.0f), make_vfloat8(1.0f));
}
-ccl_device_inline float8_t exp(float8_t v)
+ccl_device_inline vfloat8 exp(vfloat8 v)
{
- return make_float8_t(
+ return make_vfloat8(
expf(v.a), expf(v.b), expf(v.c), expf(v.d), expf(v.e), expf(v.f), expf(v.g), expf(v.h));
}
-ccl_device_inline float8_t log(float8_t v)
+ccl_device_inline vfloat8 log(vfloat8 v)
{
- return make_float8_t(
+ return make_vfloat8(
logf(v.a), logf(v.b), logf(v.c), logf(v.d), logf(v.e), logf(v.f), logf(v.g), logf(v.h));
}
-ccl_device_inline float dot(const float8_t a, const float8_t b)
+ccl_device_inline float dot(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- float8_t t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
+#ifdef __KERNEL_AVX__
+ vfloat8 t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
return t[0] + t[4];
#else
return (a.a * b.a) + (a.b * b.b) + (a.c * b.c) + (a.d * b.d) + (a.e * b.e) + (a.f * b.f) +
@@ -339,62 +333,51 @@ ccl_device_inline float dot(const float8_t a, const float8_t b)
#endif
}
-ccl_device_inline float8_t pow(float8_t v, float e)
+ccl_device_inline vfloat8 pow(vfloat8 v, float e)
{
- return make_float8_t(powf(v.a, e),
- powf(v.b, e),
- powf(v.c, e),
- powf(v.d, e),
- powf(v.e, e),
- powf(v.f, e),
- powf(v.g, e),
- powf(v.h, e));
+ return make_vfloat8(powf(v.a, e),
+ powf(v.b, e),
+ powf(v.c, e),
+ powf(v.d, e),
+ powf(v.e, e),
+ powf(v.f, e),
+ powf(v.g, e),
+ powf(v.h, e));
}
-ccl_device_inline float reduce_min(const float8_t a)
+ccl_device_inline float reduce_min(const vfloat8 a)
{
return min(min(min(a.a, a.b), min(a.c, a.d)), min(min(a.e, a.f), min(a.g, a.h)));
}
-ccl_device_inline float reduce_max(const float8_t a)
+ccl_device_inline float reduce_max(const vfloat8 a)
{
return max(max(max(a.a, a.b), max(a.c, a.d)), max(max(a.e, a.f), max(a.g, a.h)));
}
-ccl_device_inline float reduce_add(const float8_t a)
-{
-#ifdef __KERNEL_AVX2__
- float8_t b(_mm256_hadd_ps(a.m256, a.m256));
- float8_t h(_mm256_hadd_ps(b.m256, b.m256));
- return h[0] + h[4];
-#else
- return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
-#endif
-}
-
-ccl_device_inline bool isequal(const float8_t a, const float8_t b)
+ccl_device_inline bool isequal(const vfloat8 a, const vfloat8 b)
{
return a == b;
}
-ccl_device_inline float8_t safe_divide(const float8_t a, const float b)
+ccl_device_inline vfloat8 safe_divide(const vfloat8 a, const float b)
{
- return (b != 0.0f) ? a / b : make_float8_t(0.0f);
+ return (b != 0.0f) ? a / b : make_vfloat8(0.0f);
}
-ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 safe_divide(const vfloat8 a, const vfloat8 b)
{
- return make_float8_t((b.a != 0.0f) ? a.a / b.a : 0.0f,
- (b.b != 0.0f) ? a.b / b.b : 0.0f,
- (b.c != 0.0f) ? a.c / b.c : 0.0f,
- (b.d != 0.0f) ? a.d / b.d : 0.0f,
- (b.e != 0.0f) ? a.e / b.e : 0.0f,
- (b.f != 0.0f) ? a.f / b.f : 0.0f,
- (b.g != 0.0f) ? a.g / b.g : 0.0f,
- (b.h != 0.0f) ? a.h / b.h : 0.0f);
+ return make_vfloat8((b.a != 0.0f) ? a.a / b.a : 0.0f,
+ (b.b != 0.0f) ? a.b / b.b : 0.0f,
+ (b.c != 0.0f) ? a.c / b.c : 0.0f,
+ (b.d != 0.0f) ? a.d / b.d : 0.0f,
+ (b.e != 0.0f) ? a.e / b.e : 0.0f,
+ (b.f != 0.0f) ? a.f / b.f : 0.0f,
+ (b.g != 0.0f) ? a.g / b.g : 0.0f,
+ (b.h != 0.0f) ? a.h / b.h : 0.0f);
}
-ccl_device_inline float8_t ensure_finite(float8_t v)
+ccl_device_inline vfloat8 ensure_finite(vfloat8 v)
{
v.a = ensure_finite(v.a);
v.b = ensure_finite(v.b);
@@ -408,12 +391,92 @@ ccl_device_inline float8_t ensure_finite(float8_t v)
return v;
}
-ccl_device_inline bool isfinite_safe(float8_t v)
+ccl_device_inline bool isfinite_safe(vfloat8 v)
{
return isfinite_safe(v.a) && isfinite_safe(v.b) && isfinite_safe(v.c) && isfinite_safe(v.d) &&
isfinite_safe(v.e) && isfinite_safe(v.f) && isfinite_safe(v.g) && isfinite_safe(v.h);
}
+ccl_device_inline vint8 cast(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_castps_si256(a));
+#else
+ return make_vint8(__float_as_int(a.a),
+ __float_as_int(a.b),
+ __float_as_int(a.c),
+ __float_as_int(a.d),
+ __float_as_int(a.e),
+ __float_as_int(a.f),
+ __float_as_int(a.g),
+ __float_as_int(a.h));
+#endif
+}
+
+#ifdef __KERNEL_SSE__
+ccl_device_forceinline float4 low(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ return float4(_mm256_extractf128_ps(a.m256, 0));
+# else
+ return make_float4(a.e, a.f, a.g, a.h);
+# endif
+}
+ccl_device_forceinline float4 high(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ return float4(_mm256_extractf128_ps(a.m256, 1));
+# else
+ return make_float4(a.a, a.b, a.c, a.d);
+# endif
+}
+
+template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0)));
+# else
+ return make_vfloat8(a[i0], a[i1], a[i2], a[i3], a[i4 + 4], a[i5 + 4], a[i6 + 4], a[i7 + 4]);
+# endif
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a, const vfloat8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)));
+# else
+ return make_vfloat8(shuffle<i0, i1, i2, i3>(high(a), high(b)),
+ shuffle<i0, i1, i2, i3>(low(a), low(b)));
+# endif
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
+{
+ return shuffle<i0, i1, i2, i3>(a, a);
+}
+template<size_t i0> ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a, const vfloat8 b)
+{
+ return shuffle<i0, i0, i0, i0>(a, b);
+}
+template<size_t i0> ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
+{
+ return shuffle<i0>(a, a);
+}
+
+template<size_t i> ccl_device_forceinline float extract(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ __m256 b = shuffle<i, i, i, i>(a).m256;
+ return _mm256_cvtss_f32(b);
+# else
+ return a[i];
+# endif
+}
+#endif
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_FLOAT8_H__ */