-rw-r--r--  intern/cycles/kernel/kernel_compat_cpu.h |  15
-rw-r--r--  intern/cycles/util/CMakeLists.txt        |   1
-rw-r--r--  intern/cycles/util/util_avxf.h           | 185
-rw-r--r--  intern/cycles/util/util_simd.h           |   1
4 files changed, 202 insertions, 0 deletions
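The first hunk below adds fetch_avxf(), which keeps the kernel's existing 128-bit (ssef) block indexing but issues a single unaligned 256-bit load, so block index i returns blocks i and i+1 side by side; the assert index >= 0 && (index+1) < width guards exactly that two-block footprint. The following standalone sketch of the same access pattern is not part of the patch: it assumes only <immintrin.h>, an AVX-capable host compiler (e.g. -mavx), and a hypothetical 12-float array standing in for the texture data.

#include <immintrin.h>
#include <cstdio>

int main()
{
	/* Twelve floats = three 128-bit (4-float) blocks, mirroring how
	 * fetch_avxf() indexes the underlying ssef array. */
	alignas(16) float data[12];
	for (int i = 0; i < 12; i++) data[i] = (float)i;

	const int index = 1;                /* block index: 4 floats per block */
	const float *block = data + 4 * index;
	__m256 v = _mm256_loadu_ps(block);  /* unaligned 256-bit load of blocks 1 and 2 */

	float out[8];
	_mm256_storeu_ps(out, v);
	for (int i = 0; i < 8; i++)
		printf("%g ", out[i]);          /* prints: 4 5 6 7 8 9 10 11 */
	printf("\n");
	return 0;
}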
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 7b30df04550..9d1f3bdc918 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -71,6 +71,20 @@ template<typename T> struct texture {
 		return data[index];
 	}
 
+#ifdef __KERNEL_AVX__
+	/* Reads 256 bits but indexes in blocks of 128 bits to maintain
+	 * compatibility with existing indices and data structures.
+	 */
+	ccl_always_inline avxf fetch_avxf(const int index)
+	{
+		kernel_assert(index >= 0 && (index+1) < width);
+		ssef *ssefData = (ssef*)data;
+		ssef *ssefNodeData = &ssefData[index];
+		return _mm256_loadu_ps((float *)ssefNodeData);
+	}
+
+#endif
+
 #ifdef __KERNEL_SSE2__
 	ccl_always_inline ssef fetch_ssef(int index)
 	{
@@ -506,6 +520,7 @@ typedef texture_image<half4> texture_image_half4;
 /* Macros to handle different memory storage on different devices */
 
 #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index))
 #define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
 #define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index f5674bdc15c..02ee4cd6774 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -63,6 +63,7 @@ set(SRC_HEADERS
 	util_sky_model.cpp
 	util_sky_model.h
 	util_sky_model_data.h
+	util_avxf.h
 	util_sseb.h
 	util_ssef.h
 	util_ssei.h
diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h
new file mode 100644
index 00000000000..2db2c4dad1a
--- /dev/null
+++ b/intern/cycles/util/util_avxf.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright 2016 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_AVXF_H__
+#define __UTIL_AVXF_H__
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_AVX__
+struct avxf
+{
+	typedef avxf Float;
+
+	enum { size = 8 };  /* Number of SIMD elements. */
+
+	union {
+		__m256 m256;
+		float f[8];
+		int i[8];
+	};
+
+	__forceinline avxf() {}
+	__forceinline avxf(const avxf& other) { m256 = other.m256; }
+	__forceinline avxf& operator=(const avxf& other) { m256 = other.m256; return *this; }
+
+	__forceinline avxf(const __m256 a) : m256(a) {}
+	__forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps(a)) {}
+
+	__forceinline operator const __m256&(void) const { return m256; }
+	__forceinline operator __m256&(void) { return m256; }
+
+	__forceinline avxf(float a) : m256(_mm256_set1_ps(a)) {}
+
+	__forceinline avxf(float high32x4, float low32x4) :
+		m256(_mm256_set_ps(high32x4, high32x4, high32x4, high32x4, low32x4, low32x4, low32x4, low32x4)) {}
+
+	__forceinline avxf(float a3, float a2, float a1, float a0) :
+		m256(_mm256_set_ps(a3, a2, a1, a0, a3, a2, a1, a0)) {}
+
+	__forceinline avxf(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) :
+		m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0)) {}
+
+	__forceinline avxf(int a3, int a2, int a1, int a0)
+	{
+		const __m256i foo = _mm256_set_epi32(a3, a2, a1, a0, a3, a2, a1, a0);
+		m256 = _mm256_castsi256_ps(foo);
+	}
+
+	__forceinline avxf(int a7, int a6, int a5, int a4, int a3, int a2, int a1, int a0)
+	{
+		const __m256i foo = _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
+		m256 = _mm256_castsi256_ps(foo);
+	}
+
+	__forceinline avxf(__m128 a, __m128 b)
+	{
+		const __m256 foo = _mm256_castps128_ps256(a);
+		m256 = _mm256_insertf128_ps(foo, b, 1);
+	}
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf mm256_sqrt(const avxf& a) { return _mm256_sqrt_ps(a.m256); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf operator+(const avxf& a, const avxf& b) { return _mm256_add_ps(a.m256, b.m256); }
+__forceinline const avxf operator+(const avxf& a, const float& b) { return a + avxf(b); }
+__forceinline const avxf operator+(const float& a, const avxf& b) { return avxf(a) + b; }
+
+__forceinline const avxf operator-(const avxf& a, const avxf& b) { return _mm256_sub_ps(a.m256, b.m256); }
+__forceinline const avxf operator-(const avxf& a, const float& b) { return a - avxf(b); }
+__forceinline const avxf operator-(const float& a, const avxf& b) { return avxf(a) - b; }
+
+__forceinline const avxf operator*(const avxf& a, const avxf& b) { return _mm256_mul_ps(a.m256, b.m256); }
+__forceinline const avxf operator*(const avxf& a, const float& b) { return a * avxf(b); }
+__forceinline const avxf operator*(const float& a, const avxf& b) { return avxf(a) * b; }
+
+__forceinline const avxf operator/(const avxf& a, const avxf& b) { return _mm256_div_ps(a.m256, b.m256); }
+__forceinline const avxf operator/(const avxf& a, const float& b) { return a / avxf(b); }
+__forceinline const avxf operator/(const float& a, const avxf& b) { return avxf(a) / b; }
+
+__forceinline const avxf operator|(const avxf& a, const avxf& b) { return _mm256_or_ps(a.m256, b.m256); }
+
+__forceinline const avxf operator^(const avxf& a, const avxf& b) { return _mm256_xor_ps(a.m256, b.m256); }
+
+__forceinline const avxf operator&(const avxf& a, const avxf& b) { return _mm256_and_ps(a.m256, b.m256); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf shuffle(const avxf& a, const __m256i& shuf)
+{
+	return _mm256_permutevar_ps(a, shuf);
+}
+
+template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+__forceinline const avxf shuffle(const avxf& a)
+{
+	return _mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const avxf shuffle(const avxf& a, const avxf& b)
+{
+	return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const avxf shuffle(const avxf& a)
+{
+	return shuffle<i0, i1, i2, i3>(a, a);
+}
+template<size_t i0>
+__forceinline const avxf shuffle(const avxf& a, const avxf& b)
+{
+	return shuffle<i0, i0, i0, i0>(a, b);
+}
+template<size_t i0>
+__forceinline const avxf shuffle(const avxf& a)
+{
+	return shuffle<i0>(a, a);
+}
+
+template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+__forceinline const avxf permute(const avxf& a)
+{
+#ifdef __KERNEL_AVX2__
+	return _mm256_permutevar8x32_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
+#else
+	float temp[8];
+	_mm256_storeu_ps((float*)&temp, a);
+	return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
+#endif
+}
+
+template<int S0, int S1, int S2, int S3, int S4, int S5, int S6, int S7>
+ccl_device_inline const avxf set_sign_bit(const avxf& a)
+{
+	return a ^ avxf(S7 << 31, S6 << 31, S5 << 31, S4 << 31, S3 << 31, S2 << 31, S1 << 31, S0 << 31);
+}
+
+template<size_t S0, size_t S1, size_t S2, size_t S3, size_t S4, size_t S5, size_t S6, size_t S7>
+ccl_device_inline const avxf blend(const avxf& a, const avxf& b)
+{
+	return _mm256_blend_ps(a, b, S7 << 0 | S6 << 1 | S5 << 2 | S4 << 3 | S3 << 4 | S2 << 5 | S1 << 6 | S0 << 7);
+}
+
+template<size_t S0, size_t S1, size_t S2, size_t S3>
+ccl_device_inline const avxf blend(const avxf& a, const avxf& b)
+{
+	return blend<S0, S1, S2, S3, S0, S1, S2, S3>(a, b);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Ternary Operators
+////////////////////////////////////////////////////////////////////////////////
+__forceinline const avxf madd(const avxf& a, const avxf& b, const avxf& c)
+{
+#ifdef __KERNEL_AVX2__
+	return _mm256_fmadd_ps(a, b, c);
+#else
+	return c + (a * b);
+#endif
+}
+
+__forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c)
+{
+#ifdef __KERNEL_AVX2__
+	return _mm256_fnmadd_ps(a, b, c);
+#else
+	return c - (a * b);
+#endif
+}
+#endif
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 8d4d79068d6..f4f460d6cf6 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -455,6 +455,7 @@ CCL_NAMESPACE_END
 
 #include "util_sseb.h"
 #include "util_ssei.h"
 #include "util_ssef.h"
+#include "util_avxf.h"
 
 #endif /* __UTIL_SIMD_TYPES_H__ */
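The madd() and nmadd() helpers in util_avxf.h use a fused multiply-add only when __KERNEL_AVX2__ is defined and fall back to a separate multiply and add otherwise. Below is a standalone check of that fallback against a scalar reference, under the same assumptions as the sketch above; madd_fallback is a hypothetical stand-in for the non-AVX2 branch, not a Cycles function.

#include <immintrin.h>
#include <cstdio>
#include <cmath>

/* Mirrors the non-AVX2 branch of madd(): c + (a * b). */
static __m256 madd_fallback(__m256 a, __m256 b, __m256 c)
{
	return _mm256_add_ps(c, _mm256_mul_ps(a, b));
}

int main()
{
	const __m256 a = _mm256_set_ps(8, 7, 6, 5, 4, 3, 2, 1);
	const __m256 b = _mm256_set1_ps(0.5f);
	const __m256 c = _mm256_set1_ps(10.0f);

	float r[8], av[8];
	_mm256_storeu_ps(r, madd_fallback(a, b, c));
	_mm256_storeu_ps(av, a);

	for (int i = 0; i < 8; i++) {
		/* Scalar reference for each lane: a[i] * 0.5 + 10. */
		if (std::fabs(r[i] - (av[i] * 0.5f + 10.0f)) > 1e-6f) {
			printf("lane %d mismatch\n", i);
			return 1;
		}
	}
	printf("madd fallback matches the scalar reference\n");
	return 0;
}

Note that unlike a true _mm256_fmadd_ps, the two-instruction fallback rounds the intermediate product, so the AVX and AVX2 paths can differ in the last bit of precision.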