git.blender.org/blender.git
author     Brecht Van Lommel <brecht@blender.org>  2021-02-14 17:34:23 +0300
committer  Brecht Van Lommel <brecht@blender.org>  2021-02-17 18:26:24 +0300
commit     8119f0aad21c3ce88e82d68ed20cd5a8edc99703 (patch)
tree       38c117be872788f9858c09b96b63af6c666fe770 /intern
parent     db28411fd90b77035dddc1682bb2786da34f73e9 (diff)
Cycles: refactor intrinsic functions implementation
* Add processor independent fallbacks
* Use uint32_t and uint64_t types
* Remove unused functions
* Better comments and less indentation

Ref D8237, T78710
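For context, a minimal standalone sketch of the kind of processor-independent fallback this patch introduces (the name bitscan_fallback is illustrative, not the exact Cycles declaration): on targets without a suitable intrinsic, the lowest set bit of a uint32_t is found with a plain loop.

#include <cassert>
#include <cstdint>

/* Portable fallback: index of the lowest set bit, no compiler intrinsics required. */
inline uint32_t bitscan_fallback(uint32_t value)
{
  assert(value != 0);
  uint32_t bit = 0;
  while ((value & (1u << bit)) == 0) {
    ++bit;
  }
  return bit;
}

On x86 builds the same operation resolves to _tzcnt_u32 or a bsf instruction instead, so callers see one uint32_t-typed bitscan() regardless of target.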
Diffstat (limited to 'intern')
-rw-r--r--  intern/cycles/bvh/bvh.cpp          2
-rw-r--r--  intern/cycles/bvh/bvh_build.cpp    2
-rw-r--r--  intern/cycles/util/util_avxb.h     8
-rw-r--r--  intern/cycles/util/util_avxi.h     8
-rw-r--r--  intern/cycles/util/util_color.h    2
-rw-r--r--  intern/cycles/util/util_half.h     2
-rw-r--r--  intern/cycles/util/util_simd.h   524
-rw-r--r--  intern/cycles/util/util_sseb.h     8
-rw-r--r--  intern/cycles/util/util_ssef.h    10
-rw-r--r--  intern/cycles/util/util_ssei.h     8
-rw-r--r--  intern/cycles/util/util_types.h   21
11 files changed, 222 insertions, 373 deletions
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 256382e63ba..050e090bddf 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -69,7 +69,7 @@ BVHLayout BVHParams::best_bvh_layout(BVHLayout requested_layout, BVHLayoutMask s
allowed_layouts_mask = supported_layouts;
}
/* We get widest from allowed ones and convert mask to actual layout. */
- const BVHLayoutMask widest_allowed_layout_mask = __bsr(allowed_layouts_mask);
+ const BVHLayoutMask widest_allowed_layout_mask = __bsr((uint32_t)allowed_layouts_mask);
return (BVHLayout)(1 << widest_allowed_layout_mask);
}
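The cast keeps the call unambiguous now that __bsr() is declared for uint32_t/uint64_t rather than int/size_t. A toy illustration of the surrounding logic, not the Cycles API (highest_set_bit stands in for __bsr): the widest allowed layout is the highest set bit of the mask, converted back to a single-bit layout value.

#include <cstdint>
#include <cstdio>

/* Stand-in for __bsr(): index of the highest set bit. */
static uint32_t highest_set_bit(uint32_t mask)
{
  uint32_t bit = 0;
  while (mask >>= 1) {
    ++bit;
  }
  return bit;
}

int main()
{
  /* Hypothetical mask with two allowed layouts, bits 1 and 2. */
  const uint32_t allowed_layouts_mask = (1u << 1) | (1u << 2);
  const uint32_t widest_bit = highest_set_bit(allowed_layouts_mask); /* 2 */
  std::printf("widest layout = %u\n", 1u << widest_bit);             /* 4 */
  return 0;
}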
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index 296f9130f43..ec85cef0851 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -851,7 +851,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
for (int i = 0; i < range.size(); i++) {
const BVHReference &ref = references[range.start() + i];
if (ref.prim_index() != -1) {
- int type_index = bitscan(ref.prim_type() & PRIMITIVE_ALL);
+ uint32_t type_index = bitscan((uint32_t)(ref.prim_type() & PRIMITIVE_ALL));
p_ref[type_index].push_back(ref);
p_type[type_index].push_back(ref.prim_type());
p_index[type_index].push_back(ref.prim_index());
diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h
index 34fafd188de..17d505c077a 100644
--- a/intern/cycles/util/util_avxb.h
+++ b/intern/cycles/util/util_avxb.h
@@ -191,12 +191,12 @@ __forceinline const avxb unpackhi(const avxb &a, const avxb &b)
////////////////////////////////////////////////////////////////////////////////
#if defined(__KERNEL_SSE41__)
-__forceinline size_t popcnt(const avxb &a)
+__forceinline uint32_t popcnt(const avxb &a)
{
- return __popcnt(_mm256_movemask_ps(a));
+ return _mm_popcnt_u32(_mm256_movemask_ps(a));
}
#else
-__forceinline size_t popcnt(const avxb &a)
+__forceinline uint32_t popcnt(const avxb &a)
{
return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) +
bool(a[7]);
@@ -224,7 +224,7 @@ __forceinline bool none(const avxb &b)
return _mm256_movemask_ps(b) == 0x0;
}
-__forceinline size_t movemask(const avxb &a)
+__forceinline uint32_t movemask(const avxb &a)
{
return _mm256_movemask_ps(a);
}
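The SSE4.1 path now calls the standard _mm_popcnt_u32 intrinsic on the lane mask instead of the removed __popcnt wrapper, and returns uint32_t. A small self-contained sketch of the movemask-plus-popcount pattern (assumes an AVX- and POPCNT-capable build, e.g. -mavx -mpopcnt):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
  /* Count how many of the 8 float lanes compare greater than 3.5. */
  const __m256 a = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);
  const __m256 gt = _mm256_cmp_ps(a, _mm256_set1_ps(3.5f), _CMP_GT_OQ);
  const uint32_t active = _mm_popcnt_u32(_mm256_movemask_ps(gt));
  std::printf("%u lanes active\n", active); /* prints 4 */
  return 0;
}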
diff --git a/intern/cycles/util/util_avxi.h b/intern/cycles/util/util_avxi.h
index e658a4f848f..3db646e61f4 100644
--- a/intern/cycles/util/util_avxi.h
+++ b/intern/cycles/util/util_avxi.h
@@ -711,21 +711,21 @@ __forceinline int reduce_add(const avxi &v)
return extract<0>(extract<0>(vreduce_add(v)));
}
-__forceinline size_t select_min(const avxi &v)
+__forceinline uint32_t select_min(const avxi &v)
{
return __bsf(movemask(v == vreduce_min(v)));
}
-__forceinline size_t select_max(const avxi &v)
+__forceinline uint32_t select_max(const avxi &v)
{
return __bsf(movemask(v == vreduce_max(v)));
}
-__forceinline size_t select_min(const avxb &valid, const avxi &v)
+__forceinline uint32_t select_min(const avxb &valid, const avxi &v)
{
const avxi a = select(valid, v, avxi(pos_inf));
return __bsf(movemask(valid & (a == vreduce_min(a))));
}
-__forceinline size_t select_max(const avxb &valid, const avxi &v)
+__forceinline uint32_t select_max(const avxb &valid, const avxi &v)
{
const avxi a = select(valid, v, avxi(neg_inf));
return __bsf(movemask(valid & (a == vreduce_max(a))));
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index c6937ca78fe..1b493d0ed5e 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -20,7 +20,7 @@
#include "util/util_math.h"
#include "util/util_types.h"
-#ifdef __KERNEL_SSE2__
+#if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__)
# include "util/util_simd.h"
#endif
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 3bac7008905..a8d4ee75e20 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -20,7 +20,7 @@
#include "util/util_math.h"
#include "util/util_types.h"
-#ifdef __KERNEL_SSE2__
+#if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__)
# include "util/util_simd.h"
#endif
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index de0e3c39f30..3a6761c6a2f 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -18,49 +18,41 @@
#ifndef __UTIL_SIMD_TYPES_H__
#define __UTIL_SIMD_TYPES_H__
-#ifndef __KERNEL_GPU__
+#include <limits>
+#include <stdint.h>
-# include <limits>
-
-# include "util/util_defines.h"
+#include "util/util_defines.h"
/* SSE Intrinsics includes
*
- * We assume __KERNEL_SSEX__ flags to have been defined at this point */
-
-/* SSE intrinsics headers */
-# ifndef FREE_WINDOWS64
-
-# ifdef _MSC_VER
-# include <intrin.h>
-# elif (defined(__x86_64__) || defined(__i386__))
-# include <x86intrin.h>
-# endif
-
-# else
-
-/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
+ * We assume __KERNEL_SSEX__ flags to have been defined at this point.
+ *
+ * MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
* Since we can't avoid including <windows.h>, better only include that */
-# include "util/util_windows.h"
-
-# endif
-
-# if defined(__x86_64__) || defined(_M_X64)
-# define SIMD_SET_FLUSH_TO_ZERO \
- _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
- _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-# else
-# define SIMD_SET_FLUSH_TO_ZERO
-# endif
+#if defined(FREE_WINDOWS64)
+# include "util/util_windows.h"
+#elif defined(_MSC_VER)
+# include <intrin.h>
+#elif (defined(__x86_64__) || defined(__i386__))
+# include <x86intrin.h>
+#endif
+
+/* Floating Point Control, for Embree. */
+#if defined(__x86_64__) || defined(_M_X64)
+# define SIMD_SET_FLUSH_TO_ZERO \
+ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
+ _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#else
+# define SIMD_SET_FLUSH_TO_ZERO
+#endif
CCL_NAMESPACE_BEGIN
-# ifdef __KERNEL_SSE2__
+/* Data structures used by SSE classes. */
+#ifdef __KERNEL_SSE2__
extern const __m128 _mm_lookupmask_ps[16];
-/* Special Types */
-
static struct TrueTy {
__forceinline operator bool() const
{
@@ -122,377 +114,281 @@ static struct PosInfTy {
static struct StepTy {
} step ccl_maybe_unused;
-/* Intrinsics Functions */
+#endif
-# if defined(__BMI__) && defined(__GNUC__)
-# ifndef _tzcnt_u32
-# define _tzcnt_u32 __tzcnt_u32
-# endif
-# ifndef _tzcnt_u64
-# define _tzcnt_u64 __tzcnt_u64
-# endif
-# endif
-
-# if defined(__LZCNT__)
-# define _lzcnt_u32 __lzcnt32
-# define _lzcnt_u64 __lzcnt64
-# endif
-
-# if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
-
-__forceinline int __popcnt(int in)
-{
- return _mm_popcnt_u32(in);
-}
+/* Intrinsics Functions
+ *
+ * For fast bit operations. */
-# if !defined(_MSC_VER)
-__forceinline unsigned int __popcnt(unsigned int in)
-{
- return _mm_popcnt_u32(in);
-}
-# endif
+#if defined(__BMI__) && defined(__GNUC__)
+# ifndef _tzcnt_u32
+# define _tzcnt_u32 __tzcnt_u32
+# endif
+# ifndef _tzcnt_u64
+# define _tzcnt_u64 __tzcnt_u64
+# endif
+#endif
-# if defined(__KERNEL_64_BIT__)
-__forceinline long long __popcnt(long long in)
-{
- return _mm_popcnt_u64(in);
-}
-__forceinline size_t __popcnt(size_t in)
-{
- return _mm_popcnt_u64(in);
-}
-# endif
+#if defined(__LZCNT__)
+# define _lzcnt_u32 __lzcnt32
+# define _lzcnt_u64 __lzcnt64
+#endif
-__forceinline int __bsf(int v)
+#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
+/* Intrinsic functions on Windows. */
+__forceinline uint32_t __bsf(uint32_t v)
{
-# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u32(v);
-# else
+# else
unsigned long r = 0;
_BitScanForward(&r, v);
return r;
-# endif
+# endif
}
-__forceinline unsigned int __bsf(unsigned int v)
+__forceinline uint32_t __bsf(uint32_t v)
{
-# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u32(v);
-# else
+# else
unsigned long r = 0;
_BitScanForward(&r, v);
return r;
-# endif
+# endif
}
-__forceinline int __bsr(int v)
+__forceinline uint32_t __bsr(uint32_t v)
{
unsigned long r = 0;
_BitScanReverse(&r, v);
return r;
}
-__forceinline int __btc(int v, int i)
+__forceinline uint32_t __btc(uint32_t v, uint32_t i)
{
long r = v;
_bittestandcomplement(&r, i);
return r;
}
-__forceinline int __bts(int v, int i)
+__forceinline uint32_t bitscan(uint32_t v)
{
- long r = v;
- _bittestandset(&r, i);
- return r;
-}
-
-__forceinline int __btr(int v, int i)
-{
- long r = v;
- _bittestandreset(&r, i);
- return r;
-}
-
-__forceinline int bitscan(int v)
-{
-# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u32(v);
-# else
+# else
return __bsf(v);
-# endif
-}
-
-__forceinline int clz(const int x)
-{
-# if defined(__KERNEL_AVX2__)
- return _lzcnt_u32(x);
-# else
- if (UNLIKELY(x == 0))
- return 32;
- return 31 - __bsr(x);
-# endif
-}
-
-__forceinline int __bscf(int &v)
-{
- int i = __bsf(v);
- v &= v - 1;
- return i;
-}
-
-__forceinline unsigned int __bscf(unsigned int &v)
-{
- unsigned int i = __bsf(v);
- v &= v - 1;
- return i;
+# endif
}
-# if defined(__KERNEL_64_BIT__)
+# if defined(__KERNEL_64_BIT__)
-__forceinline size_t __bsf(size_t v)
+__forceinline uint64_t __bsf(uint64_t v)
{
-# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_AVX2__)
return _tzcnt_u64(v);
-# else
+# else
unsigned long r = 0;
_BitScanForward64(&r, v);
return r;
-# endif
+# endif
}
-__forceinline size_t __bsr(size_t v)
+__forceinline uint64_t __bsr(uint64_t v)
{
unsigned long r = 0;
_BitScanReverse64(&r, v);
return r;
}
-__forceinline size_t __btc(size_t v, size_t i)
+__forceinline uint64_t __btc(uint64_t v, uint64_t i)
{
- size_t r = v;
+ uint64_t r = v;
_bittestandcomplement64((__int64 *)&r, i);
return r;
}
-__forceinline size_t __bts(size_t v, size_t i)
+__forceinline uint64_t bitscan(uint64_t v)
{
- __int64 r = v;
- _bittestandset64(&r, i);
- return r;
-}
-
-__forceinline size_t __btr(size_t v, size_t i)
-{
- __int64 r = v;
- _bittestandreset64(&r, i);
- return r;
-}
-
-__forceinline size_t bitscan(size_t v)
-{
-# if defined(__KERNEL_AVX2__)
-# if defined(__KERNEL_64_BIT__)
+# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_64_BIT__)
return _tzcnt_u64(v);
-# else
+# else
return _tzcnt_u32(v);
-# endif
-# else
+# endif
+# else
return __bsf(v);
-# endif
-}
-
-__forceinline size_t __bscf(size_t &v)
-{
- size_t i = __bsf(v);
- v &= v - 1;
- return i;
+# endif
}
-# endif /* __KERNEL_64_BIT__ */
+# endif /* __KERNEL_64_BIT__ */
-# else /* _WIN32 */
-
-__forceinline unsigned int __popcnt(unsigned int in)
-{
- int r = 0;
- asm("popcnt %1,%0" : "=r"(r) : "r"(in));
- return r;
-}
+#elif (defined(__x86_64__) || defined(__i386__)) && defined(__KERNEL_SSE2__)
+/* Instrinsic functions with x86 SSE. */
-__forceinline int __bsf(int v)
+__forceinline uint32_t __bsf(const uint32_t v)
{
- int r = 0;
+ uint32_t r = 0;
asm("bsf %1,%0" : "=r"(r) : "r"(v));
return r;
}
-__forceinline int __bsr(int v)
+__forceinline uint32_t __bsr(const uint32_t v)
{
- int r = 0;
+ uint32_t r = 0;
asm("bsr %1,%0" : "=r"(r) : "r"(v));
return r;
}
-__forceinline int __btc(int v, int i)
+__forceinline uint32_t __btc(const uint32_t v, uint32_t i)
{
- int r = 0;
+ uint32_t r = 0;
asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
return r;
}
-__forceinline int __bts(int v, int i)
-{
- int r = 0;
- asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
-}
-
-__forceinline int __btr(int v, int i)
+# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
+ !(defined(__ILP32__) && defined(__x86_64__))
+__forceinline uint64_t __bsf(const uint64_t v)
{
- int r = 0;
- asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
-}
-
-# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
- !(defined(__ILP32__) && defined(__x86_64__))
-__forceinline size_t __bsf(size_t v)
-{
- size_t r = 0;
+ uint64_t r = 0;
asm("bsf %1,%0" : "=r"(r) : "r"(v));
return r;
}
-# endif
+# endif
-__forceinline unsigned int __bsf(unsigned int v)
+__forceinline uint64_t __bsr(const uint64_t v)
{
- unsigned int r = 0;
- asm("bsf %1,%0" : "=r"(r) : "r"(v));
+ uint64_t r = 0;
+ asm("bsr %1,%0" : "=r"(r) : "r"(v));
return r;
}
-__forceinline size_t __bsr(size_t v)
+__forceinline uint64_t __btc(const uint64_t v, const uint64_t i)
{
- size_t r = 0;
- asm("bsr %1,%0" : "=r"(r) : "r"(v));
+ uint64_t r = 0;
+ asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
return r;
}
-__forceinline size_t __btc(size_t v, size_t i)
+__forceinline uint32_t bitscan(uint32_t v)
{
- size_t r = 0;
- asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
+# if defined(__KERNEL_AVX2__)
+ return _tzcnt_u32(v);
+# else
+ return __bsf(v);
+# endif
}
-__forceinline size_t __bts(size_t v, size_t i)
+# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
+ !(defined(__ILP32__) && defined(__x86_64__))
+__forceinline uint64_t bitscan(uint64_t v)
{
- size_t r = 0;
- asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
+# if defined(__KERNEL_AVX2__)
+# if defined(__KERNEL_64_BIT__)
+ return _tzcnt_u64(v);
+# else
+ return _tzcnt_u32(v);
+# endif
+# else
+ return __bsf(v);
+# endif
}
+# endif
-__forceinline size_t __btr(size_t v, size_t i)
+#else
+/* Intrinsic functions fallback for arbitrary processor. */
+__forceinline uint32_t __bsf(const uint32_t x)
{
- size_t r = 0;
- asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
+ for (int i = 0; i < 32; i++) {
+ if (x & (1U << i))
+ return i;
+ }
+ return 32;
}
-__forceinline int bitscan(int v)
+__forceinline uint32_t __bsr(const uint32_t x)
{
-# if defined(__KERNEL_AVX2__)
- return _tzcnt_u32(v);
-# else
- return __bsf(v);
-# endif
+ for (int i = 0; i < 32; i++) {
+ if (x & (1U << (31 - i)))
+ return (31 - i);
+ }
+ return 32;
}
-__forceinline unsigned int bitscan(unsigned int v)
+__forceinline uint32_t __btc(const uint32_t x, const uint32_t bit)
{
-# if defined(__KERNEL_AVX2__)
- return _tzcnt_u32(v);
-# else
- return __bsf(v);
-# endif
+ uint32_t mask = 1U << bit;
+ return x & (~mask);
}
-# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
- !(defined(__ILP32__) && defined(__x86_64__))
-__forceinline size_t bitscan(size_t v)
+__forceinline uint32_t __bsf(const uint64_t x)
{
-# if defined(__KERNEL_AVX2__)
-# if defined(__KERNEL_64_BIT__)
- return _tzcnt_u64(v);
-# else
- return _tzcnt_u32(v);
-# endif
-# else
- return __bsf(v);
-# endif
+ for (int i = 0; i < 64; i++) {
+ if (x & (1UL << i))
+ return i;
+ }
+ return 64;
}
-# endif
-__forceinline int clz(const int x)
+__forceinline uint32_t __bsr(const uint64_t x)
{
-# if defined(__KERNEL_AVX2__)
- return _lzcnt_u32(x);
-# else
- if (UNLIKELY(x == 0))
- return 32;
- return 31 - __bsr(x);
-# endif
+ for (int i = 0; i < 64; i++) {
+ if (x & (1UL << (63 - i)))
+ return (63 - i);
+ }
+ return 64;
}
-__forceinline int __bscf(int &v)
+__forceinline uint64_t __btc(const uint64_t x, const uint32_t bit)
{
- int i = bitscan(v);
-# if defined(__KERNEL_AVX2__)
- v &= v - 1;
-# else
- v = __btc(v, i);
-# endif
- return i;
+ uint64_t mask = 1UL << bit;
+ return x & (~mask);
}
-__forceinline unsigned int __bscf(unsigned int &v)
+__forceinline uint32_t bitscan(uint32_t value)
{
- unsigned int i = bitscan(v);
- v &= v - 1;
- return i;
+ assert(value != 0);
+ uint32_t bit = 0;
+ while ((value & (1 << bit)) == 0) {
+ ++bit;
+ }
+ return bit;
}
-# if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
- !(defined(__ILP32__) && defined(__x86_64__))
-__forceinline size_t __bscf(size_t &v)
+__forceinline uint64_t bitscan(uint64_t value)
{
- size_t i = bitscan(v);
-# if defined(__KERNEL_AVX2__)
- v &= v - 1;
-# else
- v = __btc(v, i);
-# endif
- return i;
+ assert(value != 0);
+ uint64_t bit = 0;
+ while ((value & (1 << bit)) == 0) {
+ ++bit;
+ }
+ return bit;
}
-# endif
-# endif /* _WIN32 */
+#endif /* Intrinsics */
+
+/* SSE compatibility.
+ *
+ * Various utilities to smooth over differences between SSE versions and
+ * implementations. */
+#ifdef __KERNEL_SSE2__
/* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test
* __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other
* platforms when compiling code outside the kernel. */
-# if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
+# if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
/* Emulation of SSE4 functions with SSE2 */
-# define _MM_FROUND_TO_NEAREST_INT 0x00
-# define _MM_FROUND_TO_NEG_INF 0x01
-# define _MM_FROUND_TO_POS_INF 0x02
-# define _MM_FROUND_TO_ZERO 0x03
-# define _MM_FROUND_CUR_DIRECTION 0x04
+# define _MM_FROUND_TO_NEAREST_INT 0x00
+# define _MM_FROUND_TO_NEG_INF 0x01
+# define _MM_FROUND_TO_POS_INF 0x02
+# define _MM_FROUND_TO_ZERO 0x03
+# define _MM_FROUND_CUR_DIRECTION 0x04
-# undef _mm_blendv_ps
-# define _mm_blendv_ps _mm_blendv_ps_emu
+# undef _mm_blendv_ps
+# define _mm_blendv_ps _mm_blendv_ps_emu
__forceinline __m128 _mm_blendv_ps_emu(__m128 value, __m128 input, __m128 mask)
{
__m128i isignmask = _mm_set1_epi32(0x80000000);
@@ -503,37 +399,37 @@ __forceinline __m128 _mm_blendv_ps_emu(__m128 value, __m128 input, __m128 mask)
return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
}
-# undef _mm_blend_ps
-# define _mm_blend_ps _mm_blend_ps_emu
+# undef _mm_blend_ps
+# define _mm_blend_ps _mm_blend_ps_emu
__forceinline __m128 _mm_blend_ps_emu(__m128 value, __m128 input, const int mask)
{
assert(mask < 0x10);
return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
}
-# undef _mm_blendv_epi8
-# define _mm_blendv_epi8 _mm_blendv_epi8_emu
+# undef _mm_blendv_epi8
+# define _mm_blendv_epi8 _mm_blendv_epi8_emu
__forceinline __m128i _mm_blendv_epi8_emu(__m128i value, __m128i input, __m128i mask)
{
return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
}
-# undef _mm_min_epi32
-# define _mm_min_epi32 _mm_min_epi32_emu
+# undef _mm_min_epi32
+# define _mm_min_epi32 _mm_min_epi32_emu
__forceinline __m128i _mm_min_epi32_emu(__m128i value, __m128i input)
{
return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
}
-# undef _mm_max_epi32
-# define _mm_max_epi32 _mm_max_epi32_emu
+# undef _mm_max_epi32
+# define _mm_max_epi32 _mm_max_epi32_emu
__forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
{
return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
}
-# undef _mm_extract_epi32
-# define _mm_extract_epi32 _mm_extract_epi32_emu
+# undef _mm_extract_epi32
+# define _mm_extract_epi32 _mm_extract_epi32_emu
__forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
{
switch (index) {
@@ -551,8 +447,8 @@ __forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
}
}
-# undef _mm_insert_epi32
-# define _mm_insert_epi32 _mm_insert_epi32_emu
+# undef _mm_insert_epi32
+# define _mm_insert_epi32 _mm_insert_epi32_emu
__forceinline __m128i _mm_insert_epi32_emu(__m128i value, int input, const int index)
{
assert(index >= 0 && index < 4);
@@ -560,8 +456,8 @@ __forceinline __m128i _mm_insert_epi32_emu(__m128i value, int input, const int i
return value;
}
-# undef _mm_insert_ps
-# define _mm_insert_ps _mm_insert_ps_emu
+# undef _mm_insert_ps
+# define _mm_insert_ps _mm_insert_ps_emu
__forceinline __m128 _mm_insert_ps_emu(__m128 value, __m128 input, const int index)
{
assert(index < 0x100);
@@ -569,8 +465,8 @@ __forceinline __m128 _mm_insert_ps_emu(__m128 value, __m128 input, const int ind
return _mm_andnot_ps(_mm_lookupmask_ps[index & 0xf], value);
}
-# undef _mm_round_ps
-# define _mm_round_ps _mm_round_ps_emu
+# undef _mm_round_ps
+# define _mm_round_ps _mm_round_ps_emu
__forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
{
switch (flags) {
@@ -586,51 +482,23 @@ __forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
return value;
}
-# endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
+# endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
/* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves.
* _mm256_castps256_ps128 generates no instructions so this is just as efficient. */
-# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
-# undef _mm256_cvtss_f32
-# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
-# endif
-
-# else /* __KERNEL_SSE2__ */
-
-/* This section is for utility functions which operates on non-register data
- * which might be used from a non-vectorized code.
- */
-
-ccl_device_inline int bitscan(int value)
-{
- assert(value != 0);
- int bit = 0;
- while ((value & (1 << bit)) == 0) {
- ++bit;
- }
- return bit;
-}
-
-ccl_device_inline int __bsr(int value)
-{
- assert(value != 0);
- int bit = 0;
- while (value >>= 1) {
- ++bit;
- }
- return bit;
-}
+# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+# undef _mm256_cvtss_f32
+# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
+# endif
-# endif /* __KERNEL_SSE2__ */
+#endif /* __KERNEL_SSE2__ */
/* quiet unused define warnings */
-# if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
- defined(__KERNEL_SSE41__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+#if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
+ defined(__KERNEL_SSE41__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* do nothing */
-# endif
+#endif
CCL_NAMESPACE_END
-#endif /* __KERNEL_GPU__ */
-
#endif /* __UTIL_SIMD_TYPES_H__ */
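The SSE2 emulation layer above is mostly re-indented rather than changed; the core trick is still selecting lanes by the sign bit of the blend mask. A standalone sketch equivalent in spirit to _mm_blendv_ps_emu, written here independently of the Cycles headers and needing only SSE2:

#include <emmintrin.h>
#include <cstdio>

/* Select 'input' where the mask's sign bit is set, 'value' elsewhere. */
static __m128 blendv_ps_sse2(__m128 value, __m128 input, __m128 mask)
{
  const __m128i isignmask = _mm_set1_epi32(0x80000000);
  const __m128i iandmask = _mm_cmpeq_epi32(isignmask,
                                           _mm_and_si128(_mm_castps_si128(mask), isignmask));
  const __m128 andmask = _mm_castsi128_ps(iandmask);
  return _mm_or_ps(_mm_and_ps(andmask, input), _mm_andnot_ps(andmask, value));
}

int main()
{
  const __m128 v = _mm_set_ps(4, 3, 2, 1);
  const __m128 in = _mm_set_ps(40, 30, 20, 10);
  const __m128 m = _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)); /* select lanes 0 and 2 */
  float out[4];
  _mm_storeu_ps(out, blendv_ps_sse2(v, in, m));
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 10 2 30 4 */
  return 0;
}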
diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h
index 56f8f676ba1..edf13e0c493 100644
--- a/intern/cycles/util/util_sseb.h
+++ b/intern/cycles/util/util_sseb.h
@@ -258,12 +258,12 @@ template<size_t dst> __forceinline const sseb insert(const sseb &a, const bool b
////////////////////////////////////////////////////////////////////////////////
# if defined(__KERNEL_SSE41__)
-__forceinline size_t popcnt(const sseb &a)
+__forceinline uint32_t popcnt(const sseb &a)
{
- return __popcnt(_mm_movemask_ps(a));
+ return _mm_popcnt_u32(_mm_movemask_ps(a));
}
# else
-__forceinline size_t popcnt(const sseb &a)
+__forceinline uint32_t popcnt(const sseb &a)
{
return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]);
}
@@ -290,7 +290,7 @@ __forceinline bool none(const sseb &b)
return _mm_movemask_ps(b) == 0x0;
}
-__forceinline size_t movemask(const sseb &a)
+__forceinline uint32_t movemask(const sseb &a)
{
return _mm_movemask_ps(a);
}
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index e9f0efb4efb..b14640ced40 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -730,27 +730,27 @@ __forceinline float reduce_add(const ssef &v)
return _mm_cvtss_f32(vreduce_add(v));
}
-__forceinline size_t select_min(const ssef &v)
+__forceinline uint32_t select_min(const ssef &v)
{
return __bsf(movemask(v == vreduce_min(v)));
}
-__forceinline size_t select_max(const ssef &v)
+__forceinline uint32_t select_max(const ssef &v)
{
return __bsf(movemask(v == vreduce_max(v)));
}
-__forceinline size_t select_min(const sseb &valid, const ssef &v)
+__forceinline uint32_t select_min(const sseb &valid, const ssef &v)
{
const ssef a = select(valid, v, ssef(pos_inf));
return __bsf(movemask(valid & (a == vreduce_min(a))));
}
-__forceinline size_t select_max(const sseb &valid, const ssef &v)
+__forceinline uint32_t select_max(const sseb &valid, const ssef &v)
{
const ssef a = select(valid, v, ssef(neg_inf));
return __bsf(movemask(valid & (a == vreduce_max(a))));
}
-__forceinline size_t movemask(const ssef &a)
+__forceinline uint32_t movemask(const ssef &a)
{
return _mm_movemask_ps(a);
}
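select_min()/select_max() keep the same compare-and-movemask pattern but now return uint32_t lane indices. A minimal SSE2 sketch of the idea, outside the Cycles wrappers: broadcast the horizontal minimum, compare against it, and take the lowest set bit of the resulting mask.

#include <emmintrin.h>
#include <cstdio>

/* Index of the first lane holding the minimum value. */
static unsigned first_min_lane(__m128 v)
{
  /* Horizontal min broadcast to all lanes (two shuffle+min steps). */
  __m128 m = _mm_min_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)));
  m = _mm_min_ps(m, _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 0, 3, 2)));
  const int mask = _mm_movemask_ps(_mm_cmpeq_ps(v, m));
  unsigned bit = 0;
  while ((mask & (1 << bit)) == 0) { /* at least one lane always matches */
    ++bit;
  }
  return bit;
}

int main()
{
  const __m128 v = _mm_set_ps(8.0f, -1.0f, 5.0f, 3.0f); /* lanes: 3, 5, -1, 8 */
  std::printf("min lane = %u\n", first_min_lane(v));    /* prints 2 */
  return 0;
}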
diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h
index e2bf81310cc..c03ab18a6df 100644
--- a/intern/cycles/util/util_ssei.h
+++ b/intern/cycles/util/util_ssei.h
@@ -516,21 +516,21 @@ __forceinline int reduce_add(const ssei &v)
return extract<0>(vreduce_add(v));
}
-__forceinline size_t select_min(const ssei &v)
+__forceinline uint32_t select_min(const ssei &v)
{
return __bsf(movemask(v == vreduce_min(v)));
}
-__forceinline size_t select_max(const ssei &v)
+__forceinline uint32_t select_max(const ssei &v)
{
return __bsf(movemask(v == vreduce_max(v)));
}
-__forceinline size_t select_min(const sseb &valid, const ssei &v)
+__forceinline uint32_t select_min(const sseb &valid, const ssei &v)
{
const ssei a = select(valid, v, ssei((int)pos_inf));
return __bsf(movemask(valid & (a == vreduce_min(a))));
}
-__forceinline size_t select_max(const sseb &valid, const ssei &v)
+__forceinline uint32_t select_max(const sseb &valid, const ssei &v)
{
const ssei a = select(valid, v, ssei((int)neg_inf));
return __bsf(movemask(valid & (a == vreduce_max(a))));
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index fc80fa9696c..87358877e3c 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -23,7 +23,7 @@
/* Standard Integer Types */
-#if !defined(__KERNEL_GPU__) && !defined(_WIN32)
+#if !defined(__KERNEL_GPU__)
# include <stdint.h>
#endif
@@ -57,25 +57,6 @@ typedef unsigned long uint64_t;
#endif
#ifndef __KERNEL_GPU__
-# ifdef _WIN32
-typedef signed char int8_t;
-typedef unsigned char uint8_t;
-
-typedef signed short int16_t;
-typedef unsigned short uint16_t;
-
-typedef signed int int32_t;
-typedef unsigned int uint32_t;
-
-typedef long long int64_t;
-typedef unsigned long long uint64_t;
-# ifdef __KERNEL_64_BIT__
-typedef int64_t ssize_t;
-# else
-typedef int32_t ssize_t;
-# endif
-# endif /* _WIN32 */
-
/* Generic Memory Pointer */
typedef uint64_t device_ptr;
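With the Windows-only typedef block gone, every host build now takes its fixed-width types from <stdint.h>. A trivial hedged check of the guarantee those removed typedefs were approximating:

#include <cstdint>

static_assert(sizeof(uint32_t) == 4, "uint32_t is 32-bit");
static_assert(sizeof(uint64_t) == 8, "uint64_t is 64-bit");
static_assert(sizeof(int64_t) == 8, "int64_t is 64-bit");

int main()
{
  return 0;
}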