6 files changed, 21 insertions, 30 deletions
diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h
index 107b36ce6cd..5ae56290f05 100644
--- a/intern/cycles/util/util_math_fast.h
+++ b/intern/cycles/util/util_math_fast.h
@@ -362,7 +362,7 @@ ccl_device float fast_atan2f(float y, float x)
 ccl_device float fast_log2f(float x)
 {
   /* NOTE: clamp to avoid special cases and make result "safe" from large
-   * negative values/nans. */
+   * negative values/NaNs. */
   x = clamp(x, FLT_MIN, FLT_MAX);
   unsigned bits = __float_as_uint(x);
   int exponent = (int)(bits >> 23) - 127;
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 718ec9266b1..8e8caa98a1b 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -124,7 +124,7 @@ static struct StepTy {
 template<class type, int i0, int i1, int i2, int i3> type shuffle_neon(const type &a)
 {
   if (i0 == i1 && i0 == i2 && i0 == i3) {
-    return vdupq_laneq_s32(a, i0);
+    return type(vdupq_laneq_s32(int32x4_t(a), i0));
   }
   static const uint8_t tbl[16] = {(i0 * 4) + 0,
                                   (i0 * 4) + 1,
@@ -143,7 +143,7 @@ template<class type, int i0, int i1, int i2, int i3> type shuffle_neon(const typ
                                   (i3 * 4) + 2,
                                   (i3 * 4) + 3};
 
-  return vqtbl1q_s8(int8x16_t(a), *(int8x16_t *)tbl);
+  return type(vqtbl1q_s8(int8x16_t(a), *(uint8x16_t *)tbl));
 }
 
 template<class type, int i0, int i1, int i2, int i3>
@@ -167,7 +167,7 @@ type shuffle_neon(const type &a, const type &b)
                                     (i3 * 4) + 2,
                                     (i3 * 4) + 3};
 
-    return vqtbl1q_s8(int8x16_t(b), *(int8x16_t *)tbl);
+    return type(vqtbl1q_s8(int8x16_t(b), *(uint8x16_t *)tbl));
   }
   else {
 
@@ -188,7 +188,7 @@ type shuffle_neon(const type &a, const type &b)
                                     (i3 * 4) + 2 + 16,
                                     (i3 * 4) + 3 + 16};
 
-    return vqtbl2q_s8((int8x16x2_t){a, b}, *(int8x16_t *)tbl);
+    return type(vqtbl2q_s8((int8x16x2_t){int8x16_t(a), int8x16_t(b)}, *(uint8x16_t *)tbl));
   }
 }
 #endif /* __KERNEL_NEON */
diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h
index 1488da46b09..4dbd5b8046e 100644
--- a/intern/cycles/util/util_sseb.h
+++ b/intern/cycles/util/util_sseb.h
@@ -283,7 +283,7 @@ __forceinline uint32_t popcnt(const sseb &a)
 {
 #    if defined(__KERNEL_NEON__)
   const int32x4_t mask = {1, 1, 1, 1};
-  int32x4_t t = vandq_s32(a.m128, mask);
+  int32x4_t t = vandq_s32(vreinterpretq_s32_m128(a.m128), mask);
   return vaddvq_s32(t);
 #    else
   return _mm_popcnt_u32(_mm_movemask_ps(a));
@@ -299,7 +299,7 @@ __forceinline uint32_t popcnt(const sseb &a)
 __forceinline bool reduce_and(const sseb &a)
 {
 #  if defined(__KERNEL_NEON__)
-  return vaddvq_s32(a.m128) == -4;
+  return vaddvq_s32(vreinterpretq_s32_m128(a.m128)) == -4;
 #  else
   return _mm_movemask_ps(a) == 0xf;
 #  endif
@@ -307,7 +307,7 @@ __forceinline bool reduce_and(const sseb &a)
 __forceinline bool reduce_or(const sseb &a)
 {
 #  if defined(__KERNEL_NEON__)
-  return vaddvq_s32(a.m128) != 0x0;
+  return vaddvq_s32(vreinterpretq_s32_m128(a.m128)) != 0x0;
 #  else
   return _mm_movemask_ps(a) != 0x0;
 #  endif
@@ -315,7 +315,7 @@ __forceinline bool reduce_or(const sseb &a)
 __forceinline bool all(const sseb &b)
 {
 #  if defined(__KERNEL_NEON__)
-  return vaddvq_s32(b.m128) == -4;
+  return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) == -4;
 #  else
   return _mm_movemask_ps(b) == 0xf;
 #  endif
@@ -323,7 +323,7 @@ __forceinline bool all(const sseb &b)
 __forceinline bool any(const sseb &b)
 {
 #  if defined(__KERNEL_NEON__)
-  return vaddvq_s32(b.m128) != 0x0;
+  return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) != 0x0;
 #  else
   return _mm_movemask_ps(b) != 0x0;
 #  endif
@@ -331,7 +331,7 @@ __forceinline bool any(const sseb &b)
 __forceinline bool none(const sseb &b)
 {
 #  if defined(__KERNEL_NEON__)
-  return vaddvq_s32(b.m128) == 0x0;
+  return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) == 0x0;
 #  else
   return _mm_movemask_ps(b) == 0x0;
 #  endif
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index d039b50a7d2..0c81ed87553 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -596,7 +596,7 @@ template<size_t i0, size_t i1, size_t i2, size_t i3>
 __forceinline const ssef shuffle(const ssef &b)
 {
 #  ifdef __KERNEL_NEON__
-  return shuffle_neon<ssef, i0, i1, i2, i3>(b.m128);
+  return shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128);
 #  else
   return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
 #  endif
@@ -625,7 +625,7 @@ __forceinline const ssef shuffle(const ssef &a, const ssef &b)
 template<size_t i0> __forceinline const ssef shuffle(const ssef &a, const ssef &b)
 {
 #  ifdef __KERNEL_NEON__
-  return shuffle<float32x4_t, i0, i0, i0, i0>(a, b);
+  return shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b);
 #  else
   return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0));
 #  endif
diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h
index 3ec69ab3700..cd51dbff2f1 100644
--- a/intern/cycles/util/util_ssei.h
+++ b/intern/cycles/util/util_ssei.h
@@ -446,7 +446,8 @@ template<size_t i0, size_t i1, size_t i2, size_t i3>
 __forceinline const ssei shuffle(const ssei &a)
 {
 #  ifdef __KERNEL_NEON__
-  return shuffle_neon<ssei, i0, i1, i2, i3>(a);
+  int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a));
+  return vreinterpretq_m128i_s32(result);
 #  else
   return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
 #  endif
@@ -456,7 +457,9 @@ template<size_t i0, size_t i1, size_t i2, size_t i3>
 __forceinline const ssei shuffle(const ssei &a, const ssei &b)
 {
 #  ifdef __KERNEL_NEON__
-  return shuffle_neon<ssei, i0, i1, i2, i3>(a, b);
+  int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a),
+                                                             vreinterpretq_s32_m128i(b));
+  return vreinterpretq_m128i_s32(result);
 #  else
   return _mm_castps_si128(
       _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
@@ -514,7 +517,7 @@ __forceinline const ssei vreduce_add(const ssei &v)
 __forceinline int reduce_min(const ssei &v)
 {
 #    ifdef __KERNEL_NEON__
-  return vminvq_s32(v);
+  return vminvq_s32(vreinterpretq_s32_m128i(v));
 #    else
   return extract<0>(vreduce_min(v));
 #    endif
@@ -522,7 +525,7 @@ __forceinline int reduce_min(const ssei &v)
 __forceinline int reduce_max(const ssei &v)
 {
 #    ifdef __KERNEL_NEON__
-  return vmaxvq_s32(v);
+  return vmaxvq_s32(vreinterpretq_s32_m128i(v));
 #    else
   return extract<0>(vreduce_max(v));
 #    endif
@@ -530,7 +533,7 @@ __forceinline int reduce_max(const ssei &v)
 __forceinline int reduce_add(const ssei &v)
 {
 #    ifdef __KERNEL_NEON__
-  return vaddvq_s32(v);
+  return vaddvq_s32(vreinterpretq_s32_m128i(v));
 #    else
   return extract<0>(vreduce_add(v));
 #    endif
diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h
index b445ab1488f..71bf9c65911 100644
--- a/intern/cycles/util/util_texture.h
+++ b/intern/cycles/util/util_texture.h
@@ -21,18 +21,12 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Texture limits on devices. */
-#define TEX_NUM_MAX (INT_MAX >> 4)
-
 /* Color to use when textures are not found. */
 #define TEX_IMAGE_MISSING_R 1
 #define TEX_IMAGE_MISSING_G 0
 #define TEX_IMAGE_MISSING_B 1
 #define TEX_IMAGE_MISSING_A 1
 
-/* Texture type. */
-#define kernel_tex_type(tex) (tex & IMAGE_DATA_TYPE_MASK)
-
 /* Interpolation types for textures
  * cuda also use texture space to store other objects */
 typedef enum InterpolationType {
@@ -45,9 +39,6 @@ typedef enum InterpolationType {
   INTERPOLATION_NUM_TYPES,
 } InterpolationType;
 
-/* Texture types
- * Since we store the type in the lower bits of a flat index,
- * the shift and bit mask constant below need to be kept in sync. */
 typedef enum ImageDataType {
   IMAGE_DATA_TYPE_FLOAT4 = 0,
   IMAGE_DATA_TYPE_BYTE4 = 1,
@@ -75,9 +66,6 @@ typedef enum ImageAlphaType {
   IMAGE_ALPHA_NUM_TYPES,
 } ImageAlphaType;
 
-#define IMAGE_DATA_TYPE_SHIFT 4
-#define IMAGE_DATA_TYPE_MASK 0xF
-
 /* Extension types for textures.
  *
  * Defines how the image is extrapolated past its original bounds. */