
git.blender.org/blender.git
author     OmarSquircleArt <mail@OmarEmara.dev>  2020-04-01 15:48:01 +0300
committer  OmarSquircleArt <mail@OmarEmara.dev>  2020-04-01 15:48:01 +0300
commit     f047d47e24fc5aab41d0b2349f41f539aa085b8f (patch)
tree       7f27a8dc2f36d25289a03c294beb0bdf7e28e697 /intern/cycles/util
parent     5e176d67e193b80054392b3e9190510fd90001e4 (diff)
Cycles: AVX implementation of Perlin noise.
This patch adds an AVX implementation of Perlin noise in Cycles. An avxi type was also added as a utility, based on the corresponding type in Intel Embree. Only 3D and 4D noise were implemented; there is no benefit to using AVX for 1D and 2D noise. The SSE trilinear interpolation function was reused in the AVX implementation because AVX offers no benefit when interpolating the last three dimensions.

Differential Revision: https://developer.blender.org/D6680
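As a rough illustration of the building blocks the patch uses (not the actual kernel code), the new avxi hash functions and the avxf mix() helper below could be combined like this, assuming the Cycles util headers touched by this change are included and __KERNEL_AVX__ is defined:

/* Hedged sketch: hash eight integer lattice coordinates at once and
 * linearly interpolate two 8-wide float values with the new helpers. */
avxi ix(0, 1, 2, 3, 4, 5, 6, 7);  /* eight x coordinates, one per lane */
avxi iy(3);                       /* same y in every lane */
avxi iz(7);                       /* same z in every lane */

avxi h = hash_avxi3(ix, iy, iz);  /* eight hashes per call (util_hash.h) */

avxf a(0.25f), b(0.75f), t(0.5f);
avxf r = mix(a, b, t);            /* (1 - t) * a + t * b via madd (util_avxf.h) */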
Diffstat (limited to 'intern/cycles/util')
-rw-r--r--  intern/cycles/util/util_avxb.h   28
-rw-r--r--  intern/cycles/util/util_avxf.h   67
-rw-r--r--  intern/cycles/util/util_avxi.h   745
-rw-r--r--  intern/cycles/util/util_hash.h   54
-rw-r--r--  intern/cycles/util/util_simd.h   25
-rw-r--r--  intern/cycles/util/util_types.h  1
6 files changed, 892 insertions, 28 deletions
diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h
index 54dd8068eca..34fafd188de 100644
--- a/intern/cycles/util/util_avxb.h
+++ b/intern/cycles/util/util_avxb.h
@@ -16,7 +16,7 @@
*/
#ifndef __UTIL_AVXB_H__
-# define __UTIL_AVXB_H__
+#define __UTIL_AVXB_H__
CCL_NAMESPACE_BEGIN
@@ -53,6 +53,10 @@ struct avxb {
__forceinline avxb(const __m256 input) : m256(input)
{
}
+ __forceinline avxb(const __m128 &a, const __m128 &b)
+ : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1))
+ {
+ }
__forceinline operator const __m256 &(void)const
{
return m256;
@@ -146,9 +150,9 @@ __forceinline const avxb operator!=(const avxb &a, const avxb &b)
}
__forceinline const avxb operator==(const avxb &a, const avxb &b)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b));
-# else
+#else
__m128i a_lo = _mm_castps_si128(_mm256_extractf128_ps(a, 0));
__m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1));
__m128i b_lo = _mm_castps_si128(_mm256_extractf128_ps(b, 0));
@@ -157,16 +161,16 @@ __forceinline const avxb operator==(const avxb &a, const avxb &b)
__m128i c_hi = _mm_cmpeq_epi32(a_hi, b_hi);
__m256i result = _mm256_insertf128_si256(_mm256_castsi128_si256(c_lo), c_hi, 1);
return _mm256_castsi256_ps(result);
-# endif
+#endif
}
__forceinline const avxb select(const avxb &m, const avxb &t, const avxb &f)
{
-# if defined(__KERNEL_SSE41__)
+#if defined(__KERNEL_SSE41__)
return _mm256_blendv_ps(f, t, m);
-# else
+#else
return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
-# endif
+#endif
}
////////////////////////////////////////////////////////////////////////////////
@@ -186,18 +190,18 @@ __forceinline const avxb unpackhi(const avxb &a, const avxb &b)
/// Reduction Operations
////////////////////////////////////////////////////////////////////////////////
-# if defined(__KERNEL_SSE41__)
+#if defined(__KERNEL_SSE41__)
__forceinline size_t popcnt(const avxb &a)
{
return __popcnt(_mm256_movemask_ps(a));
}
-# else
+#else
__forceinline size_t popcnt(const avxb &a)
{
return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) +
bool(a[7]);
}
-# endif
+#endif
__forceinline bool reduce_and(const avxb &a)
{
@@ -234,8 +238,6 @@ ccl_device_inline void print_avxb(const char *label, const avxb &a)
printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
}
-#endif
-
CCL_NAMESPACE_END
-//#endif
+#endif
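A brief usage sketch (not part of the patch) for the new two-half avxb constructor added above, assuming util_avxb.h is included in an AVX build:

/* Hedged sketch: build an 8-wide mask from two SSE halves, then blend with it. */
__m128 lo = _mm_castsi128_ps(_mm_set1_epi32(-1));  /* low half: all true */
__m128 hi = _mm_setzero_ps();                      /* high half: all false */
avxb mask(lo, hi);                                 /* new avxb(__m128, __m128) constructor */

avxb t(_mm256_castsi256_ps(_mm256_set1_epi32(-1)));
avxb f(_mm256_setzero_ps());
avxb picked = select(mask, t, f);                  /* lanes 0-3 from t, lanes 4-7 from f */
size_t n = popcnt(picked);                         /* 4 */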
diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h
index 156607e65fb..6781290bb83 100644
--- a/intern/cycles/util/util_avxf.h
+++ b/intern/cycles/util/util_avxf.h
@@ -15,7 +15,7 @@
*/
#ifndef __UTIL_AVXF_H__
-# define __UTIL_AVXF_H__
+#define __UTIL_AVXF_H__
CCL_NAMESPACE_BEGIN
@@ -140,6 +140,11 @@ __forceinline void dot3(const avxf &a, const avxf &b, float &den, float &den2)
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
+__forceinline const avxf cast(const __m256i &a)
+{
+ return _mm256_castsi256_ps(a);
+}
+
__forceinline const avxf mm256_sqrt(const avxf &a)
{
return _mm256_sqrt_ps(a.m256);
@@ -259,16 +264,34 @@ template<size_t i0> __forceinline const avxf shuffle(const avxf &a)
return shuffle<i0>(a, a);
}
+template<size_t i> __forceinline float extract(const avxf &a)
+{
+ return _mm256_cvtss_f32(shuffle<i, i, i, i>(a));
+}
+template<> __forceinline float extract<0>(const avxf &a)
+{
+ return _mm256_cvtss_f32(a);
+}
+
+__forceinline ssef low(const avxf &a)
+{
+ return _mm256_extractf128_ps(a, 0);
+}
+__forceinline ssef high(const avxf &a)
+{
+ return _mm256_extractf128_ps(a, 1);
+}
+
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
__forceinline const avxf permute(const avxf &a)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_permutevar8x32_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
-# else
+#else
float temp[8];
_mm256_storeu_ps((float *)&temp, a);
return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
-# endif
+#endif
}
template<int S0, int S1, int S2, int S3, int S4, int S5, int S6, int S7>
@@ -309,39 +332,51 @@ __forceinline avxf mini(const avxf &a, const avxf &b)
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxf madd(const avxf &a, const avxf &b, const avxf &c)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_fmadd_ps(a, b, c);
-# else
+#else
return c + (a * b);
-# endif
+#endif
}
__forceinline const avxf nmadd(const avxf &a, const avxf &b, const avxf &c)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_fnmadd_ps(a, b, c);
-# else
+#else
return c - (a * b);
-# endif
+#endif
}
__forceinline const avxf msub(const avxf &a, const avxf &b, const avxf &c)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_fmsub_ps(a, b, c);
-# else
+#else
return (a * b) - c;
-# endif
+#endif
}
////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators
+/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxb operator<=(const avxf &a, const avxf &b)
{
return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
}
-#endif
+__forceinline const avxf select(const avxb &m, const avxf &t, const avxf &f)
+{
+ return _mm256_blendv_ps(f, t, m);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Common Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline avxf mix(const avxf &a, const avxf &b, const avxf &t)
+{
+ return madd(t, b, (avxf(1.0f) - t) * a);
+}
#ifndef _mm256_set_m128
# define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \
@@ -352,3 +387,5 @@ __forceinline const avxb operator<=(const avxf &a, const avxf &b)
_mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr))
CCL_NAMESPACE_END
+
+#endif
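The avxf additions above (extract, low/high, select and mix) can be sketched as follows; this is an illustrative example, not code from the patch, and assumes util_avxf.h is included with AVX enabled:

/* Hedged sketch of the new avxf helpers. */
avxf v(3.0f);                   /* broadcast constructor, as used by mix() */
float e0 = extract<0>(v);       /* lowest lane, here 3.0f */
ssef lo4 = low(v);              /* lower four lanes as an ssef */
ssef hi4 = high(v);             /* upper four lanes as an ssef */

avxf a(0.0f), b(10.0f), t(0.25f);
avxf blended = mix(a, b, t);    /* (1 - t) * a + t * b = 2.5f in every lane */
avxb m = (a <= b);
avxf picked = select(m, a, b);  /* a where the mask is set, b elsewhere */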
diff --git a/intern/cycles/util/util_avxi.h b/intern/cycles/util/util_avxi.h
new file mode 100644
index 00000000000..e658a4f848f
--- /dev/null
+++ b/intern/cycles/util/util_avxi.h
@@ -0,0 +1,745 @@
+/*
+ * Copyright 2009-2013 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_AVXI_H__
+#define __UTIL_AVXI_H__
+
+CCL_NAMESPACE_BEGIN
+
+struct avxb;
+
+struct avxi {
+ typedef avxb Mask; // mask type for us
+ enum { size = 8 }; // number of SIMD elements
+ union { // data
+ __m256i m256;
+#if !defined(__KERNEL_AVX2__)
+ struct {
+ __m128i l, h;
+ };
+#endif
+ int32_t v[8];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxi()
+ {
+ }
+ __forceinline avxi(const avxi &a)
+ {
+ m256 = a.m256;
+ }
+ __forceinline avxi &operator=(const avxi &a)
+ {
+ m256 = a.m256;
+ return *this;
+ }
+
+ __forceinline avxi(const __m256i a) : m256(a)
+ {
+ }
+ __forceinline operator const __m256i &(void)const
+ {
+ return m256;
+ }
+ __forceinline operator __m256i &(void)
+ {
+ return m256;
+ }
+
+ __forceinline explicit avxi(const ssei &a)
+ : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), a, 1))
+ {
+ }
+ __forceinline avxi(const ssei &a, const ssei &b)
+ : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
+ {
+ }
+#if defined(__KERNEL_AVX2__)
+ __forceinline avxi(const __m128i &a, const __m128i &b)
+ : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
+ {
+ }
+#else
+ __forceinline avxi(const __m128i &a, const __m128i &b) : l(a), h(b)
+ {
+ }
+#endif
+ __forceinline explicit avxi(const int32_t *const a)
+ : m256(_mm256_castps_si256(_mm256_loadu_ps((const float *)a)))
+ {
+ }
+ __forceinline avxi(int32_t a) : m256(_mm256_set1_epi32(a))
+ {
+ }
+ __forceinline avxi(int32_t a, int32_t b) : m256(_mm256_set_epi32(b, a, b, a, b, a, b, a))
+ {
+ }
+ __forceinline avxi(int32_t a, int32_t b, int32_t c, int32_t d)
+ : m256(_mm256_set_epi32(d, c, b, a, d, c, b, a))
+ {
+ }
+ __forceinline avxi(
+ int32_t a, int32_t b, int32_t c, int32_t d, int32_t e, int32_t f, int32_t g, int32_t h)
+ : m256(_mm256_set_epi32(h, g, f, e, d, c, b, a))
+ {
+ }
+
+ __forceinline explicit avxi(const __m256 a) : m256(_mm256_cvtps_epi32(a))
+ {
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxi(ZeroTy) : m256(_mm256_setzero_si256())
+ {
+ }
+#if defined(__KERNEL_AVX2__)
+ __forceinline avxi(OneTy) : m256(_mm256_set1_epi32(1))
+ {
+ }
+ __forceinline avxi(PosInfTy) : m256(_mm256_set1_epi32(pos_inf))
+ {
+ }
+ __forceinline avxi(NegInfTy) : m256(_mm256_set1_epi32(neg_inf))
+ {
+ }
+#else
+ __forceinline avxi(OneTy) : m256(_mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1))
+ {
+ }
+ __forceinline avxi(PosInfTy)
+ : m256(_mm256_set_epi32(
+ pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf))
+ {
+ }
+ __forceinline avxi(NegInfTy)
+ : m256(_mm256_set_epi32(
+ neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf))
+ {
+ }
+#endif
+ __forceinline avxi(StepTy) : m256(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0))
+ {
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const int32_t &operator[](const size_t i) const
+ {
+ assert(i < 8);
+ return v[i];
+ }
+ __forceinline int32_t &operator[](const size_t i)
+ {
+ assert(i < 8);
+ return v[i];
+ }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxi cast(const __m256 &a)
+{
+ return _mm256_castps_si256(a);
+}
+__forceinline const avxi operator+(const avxi &a)
+{
+ return a;
+}
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator-(const avxi &a)
+{
+ return _mm256_sub_epi32(_mm256_setzero_si256(), a.m256);
+}
+__forceinline const avxi abs(const avxi &a)
+{
+ return _mm256_abs_epi32(a.m256);
+}
+#else
+__forceinline const avxi operator-(const avxi &a)
+{
+ return avxi(_mm_sub_epi32(_mm_setzero_si128(), a.l), _mm_sub_epi32(_mm_setzero_si128(), a.h));
+}
+__forceinline const avxi abs(const avxi &a)
+{
+ return avxi(_mm_abs_epi32(a.l), _mm_abs_epi32(a.h));
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator+(const avxi &a, const avxi &b)
+{
+ return _mm256_add_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator+(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_add_epi32(a.l, b.l), _mm_add_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi operator+(const avxi &a, const int32_t b)
+{
+ return a + avxi(b);
+}
+__forceinline const avxi operator+(const int32_t a, const avxi &b)
+{
+ return avxi(a) + b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator-(const avxi &a, const avxi &b)
+{
+ return _mm256_sub_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator-(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_sub_epi32(a.l, b.l), _mm_sub_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi operator-(const avxi &a, const int32_t b)
+{
+ return a - avxi(b);
+}
+__forceinline const avxi operator-(const int32_t a, const avxi &b)
+{
+ return avxi(a) - b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator*(const avxi &a, const avxi &b)
+{
+ return _mm256_mullo_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator*(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_mullo_epi32(a.l, b.l), _mm_mullo_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi operator*(const avxi &a, const int32_t b)
+{
+ return a * avxi(b);
+}
+__forceinline const avxi operator*(const int32_t a, const avxi &b)
+{
+ return avxi(a) * b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator&(const avxi &a, const avxi &b)
+{
+ return _mm256_and_si256(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator&(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+#endif
+__forceinline const avxi operator&(const avxi &a, const int32_t b)
+{
+ return a & avxi(b);
+}
+__forceinline const avxi operator&(const int32_t a, const avxi &b)
+{
+ return avxi(a) & b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator|(const avxi &a, const avxi &b)
+{
+ return _mm256_or_si256(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator|(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+#endif
+__forceinline const avxi operator|(const avxi &a, const int32_t b)
+{
+ return a | avxi(b);
+}
+__forceinline const avxi operator|(const int32_t a, const avxi &b)
+{
+ return avxi(a) | b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator^(const avxi &a, const avxi &b)
+{
+ return _mm256_xor_si256(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator^(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+#endif
+__forceinline const avxi operator^(const avxi &a, const int32_t b)
+{
+ return a ^ avxi(b);
+}
+__forceinline const avxi operator^(const int32_t a, const avxi &b)
+{
+ return avxi(a) ^ b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator<<(const avxi &a, const int32_t n)
+{
+ return _mm256_slli_epi32(a.m256, n);
+}
+__forceinline const avxi operator>>(const avxi &a, const int32_t n)
+{
+ return _mm256_srai_epi32(a.m256, n);
+}
+
+__forceinline const avxi sra(const avxi &a, const int32_t b)
+{
+ return _mm256_srai_epi32(a.m256, b);
+}
+__forceinline const avxi srl(const avxi &a, const int32_t b)
+{
+ return _mm256_srli_epi32(a.m256, b);
+}
+#else
+__forceinline const avxi operator<<(const avxi &a, const int32_t n)
+{
+ return avxi(_mm_slli_epi32(a.l, n), _mm_slli_epi32(a.h, n));
+}
+__forceinline const avxi operator>>(const avxi &a, const int32_t n)
+{
+ return avxi(_mm_srai_epi32(a.l, n), _mm_srai_epi32(a.h, n));
+}
+
+__forceinline const avxi sra(const avxi &a, const int32_t b)
+{
+ return avxi(_mm_srai_epi32(a.l, b), _mm_srai_epi32(a.h, b));
+}
+__forceinline const avxi srl(const avxi &a, const int32_t b)
+{
+ return avxi(_mm_srli_epi32(a.l, b), _mm_srli_epi32(a.h, b));
+}
+#endif
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi min(const avxi &a, const avxi &b)
+{
+ return _mm256_min_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi min(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_min_epi32(a.l, b.l), _mm_min_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi min(const avxi &a, const int32_t b)
+{
+ return min(a, avxi(b));
+}
+__forceinline const avxi min(const int32_t a, const avxi &b)
+{
+ return min(avxi(a), b);
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi max(const avxi &a, const avxi &b)
+{
+ return _mm256_max_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi max(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_max_epi32(a.l, b.l), _mm_max_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi max(const avxi &a, const int32_t b)
+{
+ return max(a, avxi(b));
+}
+__forceinline const avxi max(const int32_t a, const avxi &b)
+{
+ return max(avxi(a), b);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Assignment Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline avxi &operator+=(avxi &a, const avxi &b)
+{
+ return a = a + b;
+}
+__forceinline avxi &operator+=(avxi &a, const int32_t b)
+{
+ return a = a + b;
+}
+
+__forceinline avxi &operator-=(avxi &a, const avxi &b)
+{
+ return a = a - b;
+}
+__forceinline avxi &operator-=(avxi &a, const int32_t b)
+{
+ return a = a - b;
+}
+
+__forceinline avxi &operator*=(avxi &a, const avxi &b)
+{
+ return a = a * b;
+}
+__forceinline avxi &operator*=(avxi &a, const int32_t b)
+{
+ return a = a * b;
+}
+
+__forceinline avxi &operator&=(avxi &a, const avxi &b)
+{
+ return a = a & b;
+}
+__forceinline avxi &operator&=(avxi &a, const int32_t b)
+{
+ return a = a & b;
+}
+
+__forceinline avxi &operator|=(avxi &a, const avxi &b)
+{
+ return a = a | b;
+}
+__forceinline avxi &operator|=(avxi &a, const int32_t b)
+{
+ return a = a | b;
+}
+
+__forceinline avxi &operator^=(avxi &a, const avxi &b)
+{
+ return a = a ^ b;
+}
+__forceinline avxi &operator^=(avxi &a, const int32_t b)
+{
+ return a = a ^ b;
+}
+
+__forceinline avxi &operator<<=(avxi &a, const int32_t b)
+{
+ return a = a << b;
+}
+__forceinline avxi &operator>>=(avxi &a, const int32_t b)
+{
+ return a = a >> b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators + Select
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxb operator==(const avxi &a, const avxi &b)
+{
+ return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a.m256, b.m256));
+}
+#else
+__forceinline const avxb operator==(const avxi &a, const avxi &b)
+{
+ return avxb(_mm_castsi128_ps(_mm_cmpeq_epi32(a.l, b.l)),
+ _mm_castsi128_ps(_mm_cmpeq_epi32(a.h, b.h)));
+}
+#endif
+__forceinline const avxb operator==(const avxi &a, const int32_t b)
+{
+ return a == avxi(b);
+}
+__forceinline const avxb operator==(const int32_t a, const avxi &b)
+{
+ return avxi(a) == b;
+}
+
+__forceinline const avxb operator!=(const avxi &a, const avxi &b)
+{
+ return !(a == b);
+}
+__forceinline const avxb operator!=(const avxi &a, const int32_t b)
+{
+ return a != avxi(b);
+}
+__forceinline const avxb operator!=(const int32_t a, const avxi &b)
+{
+ return avxi(a) != b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxb operator<(const avxi &a, const avxi &b)
+{
+ return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b.m256, a.m256));
+}
+#else
+__forceinline const avxb operator<(const avxi &a, const avxi &b)
+{
+ return avxb(_mm_castsi128_ps(_mm_cmplt_epi32(a.l, b.l)),
+ _mm_castsi128_ps(_mm_cmplt_epi32(a.h, b.h)));
+}
+#endif
+__forceinline const avxb operator<(const avxi &a, const int32_t b)
+{
+ return a < avxi(b);
+}
+__forceinline const avxb operator<(const int32_t a, const avxi &b)
+{
+ return avxi(a) < b;
+}
+
+__forceinline const avxb operator>=(const avxi &a, const avxi &b)
+{
+ return !(a < b);
+}
+__forceinline const avxb operator>=(const avxi &a, const int32_t b)
+{
+ return a >= avxi(b);
+}
+__forceinline const avxb operator>=(const int32_t a, const avxi &b)
+{
+ return avxi(a) >= b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxb operator>(const avxi &a, const avxi &b)
+{
+ return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a.m256, b.m256));
+}
+#else
+__forceinline const avxb operator>(const avxi &a, const avxi &b)
+{
+ return avxb(_mm_castsi128_ps(_mm_cmpgt_epi32(a.l, b.l)),
+ _mm_castsi128_ps(_mm_cmpgt_epi32(a.h, b.h)));
+}
+#endif
+__forceinline const avxb operator>(const avxi &a, const int32_t b)
+{
+ return a > avxi(b);
+}
+__forceinline const avxb operator>(const int32_t a, const avxi &b)
+{
+ return avxi(a) > b;
+}
+
+__forceinline const avxb operator<=(const avxi &a, const avxi &b)
+{
+ return !(a > b);
+}
+__forceinline const avxb operator<=(const avxi &a, const int32_t b)
+{
+ return a <= avxi(b);
+}
+__forceinline const avxb operator<=(const int32_t a, const avxi &b)
+{
+ return avxi(a) <= b;
+}
+
+__forceinline const avxi select(const avxb &m, const avxi &t, const avxi &f)
+{
+ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_AVX2__)
+__forceinline avxi unpacklo(const avxi &a, const avxi &b)
+{
+ return _mm256_unpacklo_epi32(a.m256, b.m256);
+}
+__forceinline avxi unpackhi(const avxi &a, const avxi &b)
+{
+ return _mm256_unpackhi_epi32(a.m256, b.m256);
+}
+#else
+__forceinline avxi unpacklo(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+__forceinline avxi unpackhi(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+#endif
+
+template<size_t i> __forceinline const avxi shuffle(const avxi &a)
+{
+ return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i)));
+}
+
+template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a)
+{
+ return _mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0));
+}
+
+template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a, const avxi &b)
+{
+ return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const avxi shuffle(const avxi &a)
+{
+ return _mm256_castps_si256(
+ _mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0)));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const avxi shuffle(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+}
+
+template<> __forceinline const avxi shuffle<0, 0, 2, 2>(const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b)));
+}
+template<> __forceinline const avxi shuffle<1, 1, 3, 3>(const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b)));
+}
+template<> __forceinline const avxi shuffle<0, 1, 0, 1>(const avxi &b)
+{
+ return _mm256_castps_si256(
+ _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b)))));
+}
+
+__forceinline const avxi broadcast(const int *ptr)
+{
+ return _mm256_castps_si256(_mm256_broadcast_ss((const float *)ptr));
+}
+template<size_t i> __forceinline const avxi insert(const avxi &a, const ssei &b)
+{
+ return _mm256_insertf128_si256(a, b, i);
+}
+template<size_t i> __forceinline const ssei extract(const avxi &a)
+{
+ return _mm256_extractf128_si256(a, i);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reductions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxi vreduce_min2(const avxi &v)
+{
+ return min(v, shuffle<1, 0, 3, 2>(v));
+}
+__forceinline const avxi vreduce_min4(const avxi &v)
+{
+ avxi v1 = vreduce_min2(v);
+ return min(v1, shuffle<2, 3, 0, 1>(v1));
+}
+__forceinline const avxi vreduce_min(const avxi &v)
+{
+ avxi v1 = vreduce_min4(v);
+ return min(v1, shuffle<1, 0>(v1));
+}
+
+__forceinline const avxi vreduce_max2(const avxi &v)
+{
+ return max(v, shuffle<1, 0, 3, 2>(v));
+}
+__forceinline const avxi vreduce_max4(const avxi &v)
+{
+ avxi v1 = vreduce_max2(v);
+ return max(v1, shuffle<2, 3, 0, 1>(v1));
+}
+__forceinline const avxi vreduce_max(const avxi &v)
+{
+ avxi v1 = vreduce_max4(v);
+ return max(v1, shuffle<1, 0>(v1));
+}
+
+__forceinline const avxi vreduce_add2(const avxi &v)
+{
+ return v + shuffle<1, 0, 3, 2>(v);
+}
+__forceinline const avxi vreduce_add4(const avxi &v)
+{
+ avxi v1 = vreduce_add2(v);
+ return v1 + shuffle<2, 3, 0, 1>(v1);
+}
+__forceinline const avxi vreduce_add(const avxi &v)
+{
+ avxi v1 = vreduce_add4(v);
+ return v1 + shuffle<1, 0>(v1);
+}
+
+__forceinline int reduce_min(const avxi &v)
+{
+ return extract<0>(extract<0>(vreduce_min(v)));
+}
+__forceinline int reduce_max(const avxi &v)
+{
+ return extract<0>(extract<0>(vreduce_max(v)));
+}
+__forceinline int reduce_add(const avxi &v)
+{
+ return extract<0>(extract<0>(vreduce_add(v)));
+}
+
+__forceinline size_t select_min(const avxi &v)
+{
+ return __bsf(movemask(v == vreduce_min(v)));
+}
+__forceinline size_t select_max(const avxi &v)
+{
+ return __bsf(movemask(v == vreduce_max(v)));
+}
+
+__forceinline size_t select_min(const avxb &valid, const avxi &v)
+{
+ const avxi a = select(valid, v, avxi(pos_inf));
+ return __bsf(movemask(valid & (a == vreduce_min(a))));
+}
+__forceinline size_t select_max(const avxb &valid, const avxi &v)
+{
+ const avxi a = select(valid, v, avxi(neg_inf));
+ return __bsf(movemask(valid & (a == vreduce_max(a))));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Output Operators
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_avxi(const char *label, const avxi &a)
+{
+ printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+}
+
+CCL_NAMESPACE_END
+
+#endif
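Since avxi mirrors the existing ssei interface, typical usage looks like the following sketch (illustrative only, assuming util_avxi.h is included and __KERNEL_AVX__ is defined):

/* Hedged sketch: arithmetic, comparisons, select and reductions on avxi. */
avxi a(step);                  /* 0 1 2 3 4 5 6 7 */
avxi b(3);                     /* broadcast 3 */

avxi s = a + b;                /* 3 4 5 6 7 8 9 10 */
avxi m = min(a, b);            /* 0 1 2 3 3 3 3 3 */
avxi shifted = a << 2;         /* 0 4 8 12 16 20 24 28 */

int total = reduce_add(a);     /* 28 */
int lo = reduce_min(s);        /* 3 */
int hi = reduce_max(s);        /* 10 */

avxb cmp = (a < b);            /* true in lanes 0..2 */
avxi sel = select(cmp, a, b);  /* same result as min(a, b) */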
diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h
index ca48758efcd..0021eec169b 100644
--- a/intern/cycles/util/util_hash.h
+++ b/intern/cycles/util/util_hash.h
@@ -312,6 +312,60 @@ ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw)
return c;
}
+# if defined(__KERNEL_AVX__)
+ccl_device_inline avxi hash_avxi(avxi kx)
+{
+ avxi a, b, c;
+ a = b = c = avxi(0xdeadbeef + (1 << 2) + 13);
+
+ a += kx;
+ final(a, b, c);
+
+ return c;
+}
+
+ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky)
+{
+ avxi a, b, c;
+ a = b = c = avxi(0xdeadbeef + (2 << 2) + 13);
+
+ b += ky;
+ a += kx;
+ final(a, b, c);
+
+ return c;
+}
+
+ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz)
+{
+ avxi a, b, c;
+ a = b = c = avxi(0xdeadbeef + (3 << 2) + 13);
+
+ c += kz;
+ b += ky;
+ a += kx;
+ final(a, b, c);
+
+ return c;
+}
+
+ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw)
+{
+ avxi a, b, c;
+ a = b = c = avxi(0xdeadbeef + (4 << 2) + 13);
+
+ a += kx;
+ b += ky;
+ c += kz;
+ mix(a, b, c);
+
+ a += kw;
+ final(a, b, c);
+
+ return c;
+}
+# endif
+
# undef rot
# undef final
# undef mix
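The new hash_avxi* functions follow the existing hash_ssei* pattern but process eight lattice points per call. A minimal sketch of how they might be called (assuming __KERNEL_AVX__ is defined):

avxi ix(step);                        /* x = 0..7 */
avxi iy(4), iz(11), iw(23);

avxi h1 = hash_avxi(ix);              /* 1D hash */
avxi h2 = hash_avxi2(ix, iy);         /* 2D hash */
avxi h3 = hash_avxi3(ix, iy, iz);     /* 3D hash, as needed by 3D Perlin noise */
avxi h4 = hash_avxi4(ix, iy, iz, iw); /* 4D hash */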
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index f49cfb4184d..922f5dd274e 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -75,6 +75,28 @@ static struct FalseTy {
}
} False ccl_maybe_unused;
+static struct ZeroTy {
+ __forceinline operator float() const
+ {
+ return 0;
+ }
+ __forceinline operator int() const
+ {
+ return 0;
+ }
+} zero ccl_maybe_unused;
+
+static struct OneTy {
+ __forceinline operator float() const
+ {
+ return 1;
+ }
+ __forceinline operator int() const
+ {
+ return 1;
+ }
+} one ccl_maybe_unused;
+
static struct NegInfTy {
__forceinline operator float() const
{
@@ -97,6 +119,9 @@ static struct PosInfTy {
}
} inf ccl_maybe_unused, pos_inf ccl_maybe_unused;
+static struct StepTy {
+} step ccl_maybe_unused;
+
/* Intrinsics Functions */
# if defined(__BMI__) && defined(__GNUC__)
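The new ZeroTy/OneTy/StepTy tags exist mainly so the avxi constant constructors above have something to dispatch on, mirroring the existing True/False/NegInfTy/PosInfTy tags. A small sketch of their use (illustrative, not from the patch):

avxi zeros(zero);   /* _mm256_setzero_si256() */
avxi ones(one);     /* every lane == 1 */
avxi ramp(step);    /* 0 1 2 3 4 5 6 7 */
avxi big(pos_inf);  /* integer "positive infinity" in every lane */

float fz = zero;    /* the tags also convert to plain float/int */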
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index f6535848480..a721595667d 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -158,6 +158,7 @@ CCL_NAMESPACE_END
# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
# include "util/util_avxb.h"
# include "util/util_avxf.h"
+# include "util/util_avxi.h"
# endif
#endif
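Code that wants to use the new type can rely on the same guard util_types.h uses here; a hedged sketch with a hypothetical helper, for illustration only:

#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* Hypothetical helper, not part of the patch. */
ccl_device_inline int sum_0_to_7()
{
  return reduce_add(avxi(step)); /* 28 */
}
#endif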