author | Jacques Lucke <jacques@blender.org> | 2020-04-06 12:10:19 +0300
committer | Jacques Lucke <jacques@blender.org> | 2020-04-06 12:10:19 +0300
commit | 43f895a59247ea4058cb3f019cd4dabd9ad9b0e4 (patch)
tree | 51469fde8affa6b20f67d12a4602c5f109c1a204 /intern/cycles
parent | 52606afaa60462db45e783607255f56c06fd8d73 (diff)
parent | 480ff89bf7cfb1f9ffd5ce66fbd5c65288ef04c0 (diff)
Merge branch 'master' into functions
Diffstat (limited to 'intern/cycles')
-rw-r--r-- | intern/cycles/blender/addon/engine.py | 13
-rw-r--r-- | intern/cycles/blender/blender_sync.cpp | 14
-rw-r--r-- | intern/cycles/kernel/svm/svm_noise.h | 261
-rw-r--r-- | intern/cycles/render/buffers.cpp | 89
-rw-r--r-- | intern/cycles/render/film.cpp | 3
-rw-r--r-- | intern/cycles/render/film.h | 1
-rw-r--r-- | intern/cycles/util/util_avxb.h | 28
-rw-r--r-- | intern/cycles/util/util_avxf.h | 68
-rw-r--r-- | intern/cycles/util/util_avxi.h | 745
-rw-r--r-- | intern/cycles/util/util_hash.h | 54
-rw-r--r-- | intern/cycles/util/util_simd.h | 32
-rw-r--r-- | intern/cycles/util/util_types.h | 1
12 files changed, 1175 insertions(+), 134 deletions(-)
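The first two files in the diff below change how Cycles turns the user-facing `pass_crypto_depth` (the number of Cryptomatte ID/weight pairs) into render passes: each RGBA pass stores two pairs, so the pass count is now computed once up front instead of stepping the loop by two and halving the index. A minimal standalone sketch of that mapping, with `divide_up()` re-implemented here for illustration (this is not the Cycles source):

```cpp
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

/* Same rounding-up division the diff uses via Cycles' divide_up(). */
static int divide_up(int x, int y)
{
  return (x + y - 1) / y;
}

/* Build Cryptomatte pass names for a given user-facing depth (number of
 * ID/weight pairs). Each RGBA pass stores two pairs. */
static std::vector<std::string> cryptomatte_pass_names(const char *prefix, int pass_crypto_depth)
{
  const int crypto_depth = divide_up(std::min(16, pass_crypto_depth), 2);
  std::vector<std::string> names;
  for (int i = 0; i < crypto_depth; i++) {
    char buf[64];
    snprintf(buf, sizeof(buf), "%s%02d", prefix, i);
    names.push_back(buf);
  }
  return names;
}

int main()
{
  /* A depth of 3 pairs yields two passes: CryptoObject00 and CryptoObject01. */
  for (const std::string &name : cryptomatte_pass_names("CryptoObject", 3)) {
    printf("%s\n", name.c_str());
  }
  return 0;
}
```

With `pass_crypto_depth = 3` this produces `CryptoObject00` and `CryptoObject01`, matching the naming the engine.py and blender_sync.cpp hunks generate.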
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 2b872bb5c39..a1b063430f5 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -260,15 +260,16 @@ def list_render_passes(srl): if crl.use_pass_volume_indirect: yield ("VolumeInd", "RGB", 'COLOR') # Cryptomatte passes. + crypto_depth = (crl.pass_crypto_depth + 1) // 2 if crl.use_pass_crypto_object: - for i in range(0, crl.pass_crypto_depth, 2): - yield ("CryptoObject" + '{:02d}'.format(i//2), "RGBA", 'COLOR') + for i in range(0, crypto_depth): + yield ("CryptoObject" + '{:02d}'.format(i), "RGBA", 'COLOR') if crl.use_pass_crypto_material: - for i in range(0, crl.pass_crypto_depth, 2): - yield ("CryptoMaterial" + '{:02d}'.format(i//2), "RGBA", 'COLOR') + for i in range(0, crypto_depth): + yield ("CryptoMaterial" + '{:02d}'.format(i), "RGBA", 'COLOR') if srl.cycles.use_pass_crypto_asset: - for i in range(0, srl.cycles.pass_crypto_depth, 2): - yield ("CryptoAsset" + '{:02d}'.format(i//2), "RGBA", 'COLOR') + for i in range(0, crypto_depth): + yield ("CryptoAsset" + '{:02d}'.format(i), "RGBA", 'COLOR') # Denoising passes. if crl.use_denoising or crl.denoising_store_passes: diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 28a737c3341..9e95cdb3f20 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -633,12 +633,12 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, /* Cryptomatte stores two ID/weight pairs per RGBA layer. * User facing parameter is the number of pairs. */ - int crypto_depth = min(16, get_int(crp, "pass_crypto_depth")); + int crypto_depth = divide_up(min(16, get_int(crp, "pass_crypto_depth")), 2); scene->film->cryptomatte_depth = crypto_depth; scene->film->cryptomatte_passes = CRYPT_NONE; if (get_boolean(crp, "use_pass_crypto_object")) { - for (int i = 0; i < crypto_depth; i += 2) { - string passname = cryptomatte_prefix + string_printf("Object%02d", i / 2); + for (int i = 0; i < crypto_depth; i++) { + string passname = cryptomatte_prefix + string_printf("Object%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); } @@ -646,8 +646,8 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, CRYPT_OBJECT); } if (get_boolean(crp, "use_pass_crypto_material")) { - for (int i = 0; i < crypto_depth; i += 2) { - string passname = cryptomatte_prefix + string_printf("Material%02d", i / 2); + for (int i = 0; i < crypto_depth; i++) { + string passname = cryptomatte_prefix + string_printf("Material%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); } @@ -655,8 +655,8 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, CRYPT_MATERIAL); } if (get_boolean(crp, "use_pass_crypto_asset")) { - for (int i = 0; i < crypto_depth; i += 2) { - string passname = cryptomatte_prefix + string_printf("Asset%02d", i / 2); + for (int i = 0; i < crypto_depth; i++) { + string passname = cryptomatte_prefix + string_printf("Asset%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); } diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h index a16b226d8de..914ef2089a9 100644 --- a/intern/cycles/kernel/svm/svm_noise.h +++ 
b/intern/cycles/kernel/svm/svm_noise.h @@ -65,7 +65,7 @@ ccl_device_noinline_cpu float perlin_1d(float x) * supported, we do a standard implementation, but if it is supported, we * do an implementation using SSE intrinsics. */ -#ifndef __KERNEL_SSE2__ +#if !defined(__KERNEL_SSE2__) /* ** Standard Implementation ** */ @@ -266,7 +266,7 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) return r; } -#else +#else /* SSE is supported. */ /* ** SSE Implementation ** */ @@ -300,6 +300,57 @@ ccl_device_inline ssef bi_mix(ssef p, ssef f) return mix(g, shuffle<1>(g), shuffle<1>(f)); } +ccl_device_inline ssef fade(const ssef &t) +{ + ssef a = madd(t, 6.0f, -15.0f); + ssef b = madd(t, a, 10.0f); + return (t * t) * (t * b); +} + +/* Negate val if the nth bit of h is 1. */ +# define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n)))) + +ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y) +{ + ssei h = hash & 7; + ssef u = select(h < 4, x, y); + ssef v = 2.0f * select(h < 4, y, x); + return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1); +} + +/* We use SSE to compute and interpolate 4 gradients at once: + * + * Point Offset from v0 + * v0 (0, 0) + * v1 (0, 1) + * v2 (1, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(V, V + 1)) + * v3 (1, 1) ^ + * | |__________| (0, 0, 1, 1) = shuffle<0, 0, 0, 0>(V, V + 1) + * | ^ + * |__________________________| + * + */ +ccl_device_noinline float perlin_2d(float x, float y) +{ + ssei XY; + ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY); + ssef uv = fade(fxy); + + ssei XY1 = XY + 1; + ssei X = shuffle<0, 0, 0, 0>(XY, XY1); + ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1)); + + ssei h = hash_ssei2(X, Y); + + ssef fxy1 = fxy - 1.0f; + ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1); + ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1)); + + ssef g = grad(h, fx, fy); + + return extract<0>(bi_mix(g, uv)); +} + /* SSE Trilinear Interpolation: * * The function takes three ssef inputs: @@ -340,34 +391,12 @@ ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f) return mix(g, shuffle<1>(g), shuffle<2>(f)); } -/* SSE Quadrilinear Interpolation: - * - * Quadrilinear interpolation is as simple as a linear interpolation - * between two trilinear interpolations. - * +/* 3D and 4D noise can be accelerated using AVX, so we first check if AVX + * is supported, that is, if __KERNEL_AVX__ is defined. If it is not + * supported, we do an SSE implementation, but if it is supported, + * we do an implementation using AVX intrinsics. */ -ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f) -{ - return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f)); -} - -ccl_device_inline ssef fade(const ssef &t) -{ - ssef a = madd(t, 6.0f, -15.0f); - ssef b = madd(t, a, 10.0f); - return (t * t) * (t * b); -} - -/* Negate val if the nth bit of h is 1. 
*/ -# define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n)))) - -ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y) -{ - ssei h = hash & 7; - ssef u = select(h < 4, x, y); - ssef v = 2.0f * select(h < 4, y, x); - return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1); -} +# if !defined(__KERNEL_AVX__) ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z) { @@ -388,37 +417,15 @@ grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef & return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2); } -/* We use SSE to compute and interpolate 4 gradients at once: +/* SSE Quadrilinear Interpolation: * - * Point Offset from v0 - * v0 (0, 0) - * v1 (0, 1) - * v2 (1, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(V, V + 1)) - * v3 (1, 1) ^ - * | |__________| (0, 0, 1, 1) = shuffle<0, 0, 0, 0>(V, V + 1) - * | ^ - * |__________________________| + * Quadrilinear interpolation is as simple as a linear interpolation + * between two trilinear interpolations. * */ -ccl_device_noinline float perlin_2d(float x, float y) +ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f) { - ssei XY; - ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY); - ssef uv = fade(fxy); - - ssei XY1 = XY + 1; - ssei X = shuffle<0, 0, 0, 0>(XY, XY1); - ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1)); - - ssei h = hash_ssei2(X, Y); - - ssef fxy1 = fxy - 1.0f; - ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1); - ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1)); - - ssef g = grad(h, fx, fy); - - return extract<0>(bi_mix(g, uv)); + return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f)); } /* We use SSE to compute and interpolate 4 gradients at once. Since we have 8 @@ -522,6 +529,148 @@ ccl_device_noinline float perlin_4d(float x, float y, float z, float w) return extract<0>(quad_mix(g1, g2, g3, g4, uvws)); } + +# else /* AVX is supported. */ + +/* AVX Implementation */ + +ccl_device_inline avxf grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z) +{ + avxi h = hash & 15; + avxf u = select(h < 8, x, y); + avxf vt = select((h == 12) | (h == 14), x, z); + avxf v = select(h < 4, y, vt); + return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1); +} + +ccl_device_inline avxf +grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &w) +{ + avxi h = hash & 31; + avxf u = select(h < 24, x, y); + avxf v = select(h < 16, y, z); + avxf s = select(h < 8, z, w); + return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2); +} + +/* SSE Quadrilinear Interpolation: + * + * The interpolation is done in two steps: + * 1. Interpolate p and q along the w axis to get s. + * 2. Trilinearly interpolate (s0, s1, s2, s3) and (s4, s5, s6, s7) to get the final + * value. (s0, s1, s2, s3) and (s4, s5, s6, s7) are generated by extracting the + * low and high ssef from s. + * + */ +ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f) +{ + ssef fv = shuffle<3>(f); + avxf s = mix(p, q, avxf(fv, fv)); + return tri_mix(low(s), high(s), f); +} + +/* We use AVX to compute and interpolate 8 gradients at once. + * + * Point Offset from v0 + * v0 (0, 0, 0) + * v1 (0, 0, 1) The full avx type is computed by inserting the following + * v2 (0, 1, 0) sse types into both the low and high parts of the avx. 
+ * v3 (0, 1, 1) + * v4 (1, 0, 0) + * v5 (1, 0, 1) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(V, V + 1)) + * v6 (1, 1, 0) ^ + * v7 (1, 1, 1) | + * | |__________| (0, 0, 1, 1) = shuffle<1, 1, 1, 1>(V, V + 1) + * | ^ + * |__________________________| + * + */ +ccl_device_noinline float perlin_3d(float x, float y, float z) +{ + ssei XYZ; + ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ); + ssef uvw = fade(fxyz); + + ssei XYZ1 = XYZ + 1; + ssei X = shuffle<0>(XYZ); + ssei X1 = shuffle<0>(XYZ1); + ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1); + ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1)); + + avxi h = hash_avxi3(avxi(X, X1), avxi(Y, Y), avxi(Z, Z)); + + ssef fxyz1 = fxyz - 1.0f; + ssef fx = shuffle<0>(fxyz); + ssef fx1 = shuffle<0>(fxyz1); + ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1); + ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1)); + + avxf g = grad(h, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz)); + + return extract<0>(tri_mix(low(g), high(g), uvw)); +} + +/* We use AVX to compute and interpolate 8 gradients at once. Since we have 16 + * gradients in 4D, we need to compute two sets of gradients at the points: + * + * Point Offset from v0 + * v0 (0, 0, 0, 0) + * v1 (0, 0, 1, 0) The full avx type is computed by inserting the following + * v2 (0, 1, 0, 0) sse types into both the low and high parts of the avx. + * v3 (0, 1, 1, 0) + * v4 (1, 0, 0, 0) + * v5 (1, 0, 1, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(V, V + 1)) + * v6 (1, 1, 0, 0) ^ + * v7 (1, 1, 1, 0) | + * | |________| (0, 0, 1, 1) = shuffle<1, 1, 1, 1>(V, V + 1) + * | ^ + * |_______________________| + * + * Point Offset from v0 + * v8 (0, 0, 0, 1) + * v9 (0, 0, 1, 1) + * v10 (0, 1, 0, 1) + * v11 (0, 1, 1, 1) + * v12 (1, 0, 0, 1) + * v13 (1, 0, 1, 1) + * v14 (1, 1, 0, 1) + * v15 (1, 1, 1, 1) + * + */ +ccl_device_noinline float perlin_4d(float x, float y, float z, float w) +{ + ssei XYZW; + ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW); + ssef uvws = fade(fxyzw); + + ssei XYZW1 = XYZW + 1; + ssei X = shuffle<0>(XYZW); + ssei X1 = shuffle<0>(XYZW1); + ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1); + ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1)); + ssei W = shuffle<3>(XYZW); + ssei W1 = shuffle<3>(XYZW1); + + avxi h1 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W, W)); + avxi h2 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W1, W1)); + + ssef fxyzw1 = fxyzw - 1.0f; + ssef fx = shuffle<0>(fxyzw); + ssef fx1 = shuffle<0>(fxyzw1); + ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1); + ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1)); + ssef fw = shuffle<3>(fxyzw); + ssef fw1 = shuffle<3>(fxyzw1); + + avxf g1 = grad(h1, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw, fw)); + avxf g2 = grad(h2, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw1, fw1)); + + return extract<0>(quad_mix(g1, g2, uvws)); +} +# endif + +# undef negate_if_nth_bit + #endif /* Remap the output of noise to a predictable range [-1, 1]. 
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index 2d89fb9ffba..22db8e875dc 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -165,6 +165,35 @@ bool RenderBuffers::copy_from_device() return true; } +static const float *get_sample_count_pass(const vector<Pass> &passes, device_vector<float> &buffer) +{ + int sample_offset = 0; + + for (const Pass &pass : passes) { + if (pass.type != PASS_SAMPLE_COUNT) { + sample_offset += pass.components; + } + else { + return buffer.data() + sample_offset; + } + } + + return NULL; +} + +static float get_pixel_pass_scale(const float rcp_sample, + const float *sample_count, + const int i, + const int pass_stride) +{ + if (sample_count) { + return 1.0f / fabsf(sample_count[i * pass_stride]); + } + else { + return rcp_sample; + } +} + bool RenderBuffers::get_denoising_pass_rect( int type, float exposure, int sample, int components, float *pixels) { @@ -260,22 +289,7 @@ bool RenderBuffers::get_pass_rect( return false; } - float *sample_count = NULL; - if (name == "Combined") { - int sample_offset = 0; - for (size_t j = 0; j < params.passes.size(); j++) { - Pass &pass = params.passes[j]; - if (pass.type != PASS_SAMPLE_COUNT) { - sample_offset += pass.components; - continue; - } - else { - sample_count = buffer.data() + sample_offset; - break; - } - } - } - + const float *sample_count = get_sample_count_pass(params.passes, buffer); int pass_offset = 0; for (size_t j = 0; j < params.passes.size(); j++) { @@ -293,8 +307,8 @@ bool RenderBuffers::get_pass_rect( float *in = buffer.data() + pass_offset; int pass_stride = params.get_passes_size(); - float scale = (pass.filter) ? 1.0f / (float)sample : 1.0f; - float scale_exposure = (pass.exposure) ? scale * exposure : scale; + const float rcp_sample = 1.0f / (float)sample; + const float pass_exposure = (pass.exposure) ? exposure : 1.0f; int size = params.width * params.height; @@ -312,28 +326,36 @@ bool RenderBuffers::get_pass_rect( if (type == PASS_DEPTH) { for (int i = 0; i < size; i++, in += pass_stride, pixels++) { float f = *in; - pixels[0] = (f == 0.0f) ? 1e10f : f * scale_exposure; + pixels[0] = (f == 0.0f) ? 
1e10f : f; + } + } + else if (type == PASS_OBJECT_ID || type == PASS_MATERIAL_ID) { + for (int i = 0; i < size; i++, in += pass_stride, pixels++) { + pixels[0] = *in; } } else if (type == PASS_MIST) { for (int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = saturate(f * scale_exposure); + const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride); + const float f = *in; + pixels[0] = saturate(f * scale); } } #ifdef WITH_CYCLES_DEBUG else if (type == PASS_BVH_TRAVERSED_NODES || type == PASS_BVH_TRAVERSED_INSTANCES || type == PASS_BVH_INTERSECTIONS || type == PASS_RAY_BOUNCES) { for (int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; + const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride); + const float f = *in; pixels[0] = f * scale; } } #endif else { for (int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = f * scale_exposure; + const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride); + const float f = *in; + pixels[0] = f * scale * pass_exposure; } } } @@ -367,7 +389,7 @@ bool RenderBuffers::get_pass_rect( float3 f = make_float3(in[0], in[1], in[2]); float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]); - f = safe_divide_even_color(f * exposure, f_divide); + f = safe_divide_even_color(f * pass_exposure, f_divide); pixels[0] = f.x; pixels[1] = f.y; @@ -377,7 +399,9 @@ bool RenderBuffers::get_pass_rect( else { /* RGB/vector */ for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) { - float3 f = make_float3(in[0], in[1], in[2]); + const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride); + const float scale_exposure = scale * pass_exposure; + const float3 f = make_float3(in[0], in[1], in[2]); pixels[0] = f.x * scale_exposure; pixels[1] = f.y * scale_exposure; @@ -425,7 +449,9 @@ bool RenderBuffers::get_pass_rect( } else if (type == PASS_CRYPTOMATTE) { for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); + const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride); + const float4 f = make_float4(in[0], in[1], in[2], in[3]); + /* x and z contain integer IDs, don't rescale them. y and w contain matte weights, they get scaled. */ pixels[0] = f.x; @@ -436,12 +462,9 @@ bool RenderBuffers::get_pass_rect( } else { for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) { - if (sample_count && sample_count[i * pass_stride] < 0.0f) { - scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f; - scale_exposure = (pass.exposure) ? 
scale * exposure : scale; - } - - float4 f = make_float4(in[0], in[1], in[2], in[3]); + const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride); + const float scale_exposure = scale * pass_exposure; + const float4 f = make_float4(in[0], in[1], in[2], in[3]); pixels[0] = f.x * scale_exposure; pixels[1] = f.y * scale_exposure; diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index baf02901123..c29810d1494 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -76,7 +76,6 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name) Pass pass; pass.type = type; - pass.filter = true; pass.exposure = false; pass.divide_type = PASS_NONE; if (name) { @@ -93,7 +92,6 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name) break; case PASS_DEPTH: pass.components = 1; - pass.filter = false; break; case PASS_MIST: pass.components = 1; @@ -114,7 +112,6 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name) case PASS_OBJECT_ID: case PASS_MATERIAL_ID: pass.components = 1; - pass.filter = false; break; case PASS_EMISSION: diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h index aae8fb404b0..0fe4fe89d5e 100644 --- a/intern/cycles/render/film.h +++ b/intern/cycles/render/film.h @@ -42,7 +42,6 @@ class Pass { public: PassType type; int components; - bool filter; bool exposure; PassType divide_type; string name; diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h index 54dd8068eca..34fafd188de 100644 --- a/intern/cycles/util/util_avxb.h +++ b/intern/cycles/util/util_avxb.h @@ -16,7 +16,7 @@ */ #ifndef __UTIL_AVXB_H__ -# define __UTIL_AVXB_H__ +#define __UTIL_AVXB_H__ CCL_NAMESPACE_BEGIN @@ -53,6 +53,10 @@ struct avxb { __forceinline avxb(const __m256 input) : m256(input) { } + __forceinline avxb(const __m128 &a, const __m128 &b) + : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1)) + { + } __forceinline operator const __m256 &(void)const { return m256; @@ -146,9 +150,9 @@ __forceinline const avxb operator!=(const avxb &a, const avxb &b) } __forceinline const avxb operator==(const avxb &a, const avxb &b) { -# ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX2__ return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); -# else +#else __m128i a_lo = _mm_castps_si128(_mm256_extractf128_ps(a, 0)); __m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1)); __m128i b_lo = _mm_castps_si128(_mm256_extractf128_ps(b, 0)); @@ -157,16 +161,16 @@ __forceinline const avxb operator==(const avxb &a, const avxb &b) __m128i c_hi = _mm_cmpeq_epi32(a_hi, b_hi); __m256i result = _mm256_insertf128_si256(_mm256_castsi128_si256(c_lo), c_hi, 1); return _mm256_castsi256_ps(result); -# endif +#endif } __forceinline const avxb select(const avxb &m, const avxb &t, const avxb &f) { -# if defined(__KERNEL_SSE41__) +#if defined(__KERNEL_SSE41__) return _mm256_blendv_ps(f, t, m); -# else +#else return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f)); -# endif +#endif } //////////////////////////////////////////////////////////////////////////////// @@ -186,18 +190,18 @@ __forceinline const avxb unpackhi(const avxb &a, const avxb &b) /// Reduction Operations //////////////////////////////////////////////////////////////////////////////// -# if defined(__KERNEL_SSE41__) +#if defined(__KERNEL_SSE41__) __forceinline size_t popcnt(const avxb &a) { return __popcnt(_mm256_movemask_ps(a)); } -# else +#else __forceinline size_t popcnt(const avxb &a) { return bool(a[0]) + 
bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) + bool(a[7]); } -# endif +#endif __forceinline bool reduce_and(const avxb &a) { @@ -234,8 +238,6 @@ ccl_device_inline void print_avxb(const char *label, const avxb &a) printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]); } -#endif - CCL_NAMESPACE_END -//#endif +#endif diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h index 156607e65fb..1fb3ded422f 100644 --- a/intern/cycles/util/util_avxf.h +++ b/intern/cycles/util/util_avxf.h @@ -15,7 +15,7 @@ */ #ifndef __UTIL_AVXF_H__ -# define __UTIL_AVXF_H__ +#define __UTIL_AVXF_H__ CCL_NAMESPACE_BEGIN @@ -140,6 +140,11 @@ __forceinline void dot3(const avxf &a, const avxf &b, float &den, float &den2) /// Unary Operators //////////////////////////////////////////////////////////////////////////////// +__forceinline const avxf cast(const __m256i &a) +{ + return _mm256_castsi256_ps(a); +} + __forceinline const avxf mm256_sqrt(const avxf &a) { return _mm256_sqrt_ps(a.m256); @@ -259,16 +264,35 @@ template<size_t i0> __forceinline const avxf shuffle(const avxf &a) return shuffle<i0>(a, a); } +template<size_t i> __forceinline float extract(const avxf &a) +{ + __m256 b = shuffle<i, i, i, i>(a).m256; + return _mm256_cvtss_f32(b); +} +template<> __forceinline float extract<0>(const avxf &a) +{ + return _mm256_cvtss_f32(a.m256); +} + +__forceinline ssef low(const avxf &a) +{ + return _mm256_extractf128_ps(a.m256, 0); +} +__forceinline ssef high(const avxf &a) +{ + return _mm256_extractf128_ps(a.m256, 1); +} + template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> __forceinline const avxf permute(const avxf &a) { -# ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX2__ return _mm256_permutevar8x32_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0)); -# else +#else float temp[8]; _mm256_storeu_ps((float *)&temp, a); return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]); -# endif +#endif } template<int S0, int S1, int S2, int S3, int S4, int S5, int S6, int S7> @@ -309,39 +333,51 @@ __forceinline avxf mini(const avxf &a, const avxf &b) //////////////////////////////////////////////////////////////////////////////// __forceinline const avxf madd(const avxf &a, const avxf &b, const avxf &c) { -# ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX2__ return _mm256_fmadd_ps(a, b, c); -# else +#else return c + (a * b); -# endif +#endif } __forceinline const avxf nmadd(const avxf &a, const avxf &b, const avxf &c) { -# ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX2__ return _mm256_fnmadd_ps(a, b, c); -# else +#else return c - (a * b); -# endif +#endif } __forceinline const avxf msub(const avxf &a, const avxf &b, const avxf &c) { -# ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX2__ return _mm256_fmsub_ps(a, b, c); -# else +#else return (a * b) - c; -# endif +#endif } //////////////////////////////////////////////////////////////////////////////// -/// Comparison Operators +/// Comparison Operators + Select //////////////////////////////////////////////////////////////////////////////// __forceinline const avxb operator<=(const avxf &a, const avxf &b) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS); } -#endif +__forceinline const avxf select(const avxb &m, const avxf &t, const avxf &f) +{ + return _mm256_blendv_ps(f, t, m); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Common Functions 
+//////////////////////////////////////////////////////////////////////////////// + +__forceinline avxf mix(const avxf &a, const avxf &b, const avxf &t) +{ + return madd(t, b, (avxf(1.0f) - t) * a); +} #ifndef _mm256_set_m128 # define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \ @@ -352,3 +388,5 @@ __forceinline const avxb operator<=(const avxf &a, const avxf &b) _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr)) CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/util/util_avxi.h b/intern/cycles/util/util_avxi.h new file mode 100644 index 00000000000..e658a4f848f --- /dev/null +++ b/intern/cycles/util/util_avxi.h @@ -0,0 +1,745 @@ +/* + * Copyright 2009-2013 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_AVXI_H__ +#define __UTIL_AVXI_H__ + +CCL_NAMESPACE_BEGIN + +struct avxb; + +struct avxi { + typedef avxb Mask; // mask type for us + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i m256; +#if !defined(__KERNEL_AVX2__) + struct { + __m128i l, h; + }; +#endif + int32_t v[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline avxi() + { + } + __forceinline avxi(const avxi &a) + { + m256 = a.m256; + } + __forceinline avxi &operator=(const avxi &a) + { + m256 = a.m256; + return *this; + } + + __forceinline avxi(const __m256i a) : m256(a) + { + } + __forceinline operator const __m256i &(void)const + { + return m256; + } + __forceinline operator __m256i &(void) + { + return m256; + } + + __forceinline explicit avxi(const ssei &a) + : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), a, 1)) + { + } + __forceinline avxi(const ssei &a, const ssei &b) + : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1)) + { + } +#if defined(__KERNEL_AVX2__) + __forceinline avxi(const __m128i &a, const __m128i &b) + : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1)) + { + } +#else + __forceinline avxi(const __m128i &a, const __m128i &b) : l(a), h(b) + { + } +#endif + __forceinline explicit avxi(const int32_t *const a) + : m256(_mm256_castps_si256(_mm256_loadu_ps((const float *)a))) + { + } + __forceinline avxi(int32_t a) : m256(_mm256_set1_epi32(a)) + { + } + __forceinline avxi(int32_t a, int32_t b) : m256(_mm256_set_epi32(b, a, b, a, b, a, b, a)) + { + } + __forceinline avxi(int32_t a, int32_t b, int32_t c, int32_t d) + : m256(_mm256_set_epi32(d, c, b, a, d, c, b, a)) + { + } + __forceinline avxi( + int32_t a, int32_t b, int32_t c, int32_t d, int32_t e, int32_t f, int32_t g, int32_t h) + : m256(_mm256_set_epi32(h, g, f, e, d, c, b, a)) + { + } + + __forceinline explicit avxi(const __m256 a) : m256(_mm256_cvtps_epi32(a)) + { + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline avxi(ZeroTy) : m256(_mm256_setzero_si256()) + { + } +#if defined(__KERNEL_AVX2__) + __forceinline avxi(OneTy) : m256(_mm256_set1_epi32(1)) + { + } + __forceinline avxi(PosInfTy) : m256(_mm256_set1_epi32(pos_inf)) + { + } + __forceinline avxi(NegInfTy) : m256(_mm256_set1_epi32(neg_inf)) + { + } +#else + __forceinline avxi(OneTy) : m256(_mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1)) + { + } + __forceinline avxi(PosInfTy) + : m256(_mm256_set_epi32( + pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf)) + { + } + __forceinline avxi(NegInfTy) + : m256(_mm256_set_epi32( + neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf)) + { + } +#endif + __forceinline avxi(StepTy) : m256(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) + { + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int32_t &operator[](const size_t i) const + { + assert(i < 8); + return v[i]; + } + __forceinline int32_t &operator[](const size_t i) + { + assert(i < 8); + return v[i]; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Unary Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const avxi cast(const __m256 &a) +{ + return _mm256_castps_si256(a); +} +__forceinline const avxi operator+(const avxi &a) +{ + return a; +} +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator-(const avxi &a) +{ + return _mm256_sub_epi32(_mm256_setzero_si256(), a.m256); +} +__forceinline const avxi abs(const avxi &a) +{ + return _mm256_abs_epi32(a.m256); +} +#else +__forceinline const avxi operator-(const avxi &a) +{ + return avxi(_mm_sub_epi32(_mm_setzero_si128(), a.l), _mm_sub_epi32(_mm_setzero_si128(), a.h)); +} +__forceinline const avxi abs(const avxi &a) +{ + return avxi(_mm_abs_epi32(a.l), _mm_abs_epi32(a.h)); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Binary Operators +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator+(const avxi &a, const avxi &b) +{ + return _mm256_add_epi32(a.m256, b.m256); +} +#else +__forceinline const avxi operator+(const avxi &a, const avxi &b) +{ + return avxi(_mm_add_epi32(a.l, b.l), _mm_add_epi32(a.h, b.h)); +} +#endif +__forceinline const avxi operator+(const avxi &a, const int32_t b) +{ + return a + avxi(b); +} +__forceinline const avxi operator+(const int32_t a, const avxi &b) +{ + return avxi(a) + b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator-(const avxi &a, const avxi &b) +{ + return _mm256_sub_epi32(a.m256, b.m256); +} +#else +__forceinline const avxi operator-(const avxi &a, const avxi &b) +{ + return avxi(_mm_sub_epi32(a.l, b.l), _mm_sub_epi32(a.h, b.h)); +} +#endif +__forceinline const avxi operator-(const avxi &a, const int32_t b) +{ + return a - avxi(b); +} +__forceinline const avxi operator-(const int32_t a, const avxi &b) +{ + return avxi(a) - b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator*(const avxi &a, const avxi &b) +{ + return _mm256_mullo_epi32(a.m256, b.m256); +} +#else +__forceinline const avxi operator*(const avxi &a, const avxi &b) +{ + return avxi(_mm_mullo_epi32(a.l, b.l), 
_mm_mullo_epi32(a.h, b.h)); +} +#endif +__forceinline const avxi operator*(const avxi &a, const int32_t b) +{ + return a * avxi(b); +} +__forceinline const avxi operator*(const int32_t a, const avxi &b) +{ + return avxi(a) * b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator&(const avxi &a, const avxi &b) +{ + return _mm256_and_si256(a.m256, b.m256); +} +#else +__forceinline const avxi operator&(const avxi &a, const avxi &b) +{ + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); +} +#endif +__forceinline const avxi operator&(const avxi &a, const int32_t b) +{ + return a & avxi(b); +} +__forceinline const avxi operator&(const int32_t a, const avxi &b) +{ + return avxi(a) & b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator|(const avxi &a, const avxi &b) +{ + return _mm256_or_si256(a.m256, b.m256); +} +#else +__forceinline const avxi operator|(const avxi &a, const avxi &b) +{ + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); +} +#endif +__forceinline const avxi operator|(const avxi &a, const int32_t b) +{ + return a | avxi(b); +} +__forceinline const avxi operator|(const int32_t a, const avxi &b) +{ + return avxi(a) | b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator^(const avxi &a, const avxi &b) +{ + return _mm256_xor_si256(a.m256, b.m256); +} +#else +__forceinline const avxi operator^(const avxi &a, const avxi &b) +{ + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); +} +#endif +__forceinline const avxi operator^(const avxi &a, const int32_t b) +{ + return a ^ avxi(b); +} +__forceinline const avxi operator^(const int32_t a, const avxi &b) +{ + return avxi(a) ^ b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator<<(const avxi &a, const int32_t n) +{ + return _mm256_slli_epi32(a.m256, n); +} +__forceinline const avxi operator>>(const avxi &a, const int32_t n) +{ + return _mm256_srai_epi32(a.m256, n); +} + +__forceinline const avxi sra(const avxi &a, const int32_t b) +{ + return _mm256_srai_epi32(a.m256, b); +} +__forceinline const avxi srl(const avxi &a, const int32_t b) +{ + return _mm256_srli_epi32(a.m256, b); +} +#else +__forceinline const avxi operator<<(const avxi &a, const int32_t n) +{ + return avxi(_mm_slli_epi32(a.l, n), _mm_slli_epi32(a.h, n)); +} +__forceinline const avxi operator>>(const avxi &a, const int32_t n) +{ + return avxi(_mm_srai_epi32(a.l, n), _mm_srai_epi32(a.h, n)); +} + +__forceinline const avxi sra(const avxi &a, const int32_t b) +{ + return avxi(_mm_srai_epi32(a.l, b), _mm_srai_epi32(a.h, b)); +} +__forceinline const avxi srl(const avxi &a, const int32_t b) +{ + return avxi(_mm_srli_epi32(a.l, b), _mm_srli_epi32(a.h, b)); +} +#endif + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi min(const avxi &a, const avxi &b) +{ + return _mm256_min_epi32(a.m256, b.m256); +} +#else +__forceinline const avxi min(const avxi &a, const avxi &b) +{ + return avxi(_mm_min_epi32(a.l, b.l), _mm_min_epi32(a.h, b.h)); +} +#endif +__forceinline const avxi min(const avxi &a, const int32_t b) +{ + return min(a, avxi(b)); +} +__forceinline const avxi min(const int32_t a, const avxi &b) +{ + return min(avxi(a), b); +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi max(const avxi &a, const avxi &b) +{ + return _mm256_max_epi32(a.m256, b.m256); +} +#else +__forceinline const avxi max(const avxi &a, const avxi &b) +{ + return avxi(_mm_max_epi32(a.l, b.l), 
_mm_max_epi32(a.h, b.h)); +} +#endif +__forceinline const avxi max(const avxi &a, const int32_t b) +{ + return max(a, avxi(b)); +} +__forceinline const avxi max(const int32_t a, const avxi &b) +{ + return max(avxi(a), b); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Assignment Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline avxi &operator+=(avxi &a, const avxi &b) +{ + return a = a + b; +} +__forceinline avxi &operator+=(avxi &a, const int32_t b) +{ + return a = a + b; +} + +__forceinline avxi &operator-=(avxi &a, const avxi &b) +{ + return a = a - b; +} +__forceinline avxi &operator-=(avxi &a, const int32_t b) +{ + return a = a - b; +} + +__forceinline avxi &operator*=(avxi &a, const avxi &b) +{ + return a = a * b; +} +__forceinline avxi &operator*=(avxi &a, const int32_t b) +{ + return a = a * b; +} + +__forceinline avxi &operator&=(avxi &a, const avxi &b) +{ + return a = a & b; +} +__forceinline avxi &operator&=(avxi &a, const int32_t b) +{ + return a = a & b; +} + +__forceinline avxi &operator|=(avxi &a, const avxi &b) +{ + return a = a | b; +} +__forceinline avxi &operator|=(avxi &a, const int32_t b) +{ + return a = a | b; +} + +__forceinline avxi &operator^=(avxi &a, const avxi &b) +{ + return a = a ^ b; +} +__forceinline avxi &operator^=(avxi &a, const int32_t b) +{ + return a = a ^ b; +} + +__forceinline avxi &operator<<=(avxi &a, const int32_t b) +{ + return a = a << b; +} +__forceinline avxi &operator>>=(avxi &a, const int32_t b) +{ + return a = a >> b; +} + +//////////////////////////////////////////////////////////////////////////////// +/// Comparison Operators + Select +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__KERNEL_AVX2__) +__forceinline const avxb operator==(const avxi &a, const avxi &b) +{ + return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a.m256, b.m256)); +} +#else +__forceinline const avxb operator==(const avxi &a, const avxi &b) +{ + return avxb(_mm_castsi128_ps(_mm_cmpeq_epi32(a.l, b.l)), + _mm_castsi128_ps(_mm_cmpeq_epi32(a.h, b.h))); +} +#endif +__forceinline const avxb operator==(const avxi &a, const int32_t b) +{ + return a == avxi(b); +} +__forceinline const avxb operator==(const int32_t a, const avxi &b) +{ + return avxi(a) == b; +} + +__forceinline const avxb operator!=(const avxi &a, const avxi &b) +{ + return !(a == b); +} +__forceinline const avxb operator!=(const avxi &a, const int32_t b) +{ + return a != avxi(b); +} +__forceinline const avxb operator!=(const int32_t a, const avxi &b) +{ + return avxi(a) != b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxb operator<(const avxi &a, const avxi &b) +{ + return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b.m256, a.m256)); +} +#else +__forceinline const avxb operator<(const avxi &a, const avxi &b) +{ + return avxb(_mm_castsi128_ps(_mm_cmplt_epi32(a.l, b.l)), + _mm_castsi128_ps(_mm_cmplt_epi32(a.h, b.h))); +} +#endif +__forceinline const avxb operator<(const avxi &a, const int32_t b) +{ + return a < avxi(b); +} +__forceinline const avxb operator<(const int32_t a, const avxi &b) +{ + return avxi(a) < b; +} + +__forceinline const avxb operator>=(const avxi &a, const avxi &b) +{ + return !(a < b); +} +__forceinline const avxb operator>=(const avxi &a, const int32_t b) +{ + return a >= avxi(b); +} +__forceinline const avxb operator>=(const int32_t a, const avxi &b) +{ + return avxi(a) >= b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const 
avxb operator>(const avxi &a, const avxi &b) +{ + return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a.m256, b.m256)); +} +#else +__forceinline const avxb operator>(const avxi &a, const avxi &b) +{ + return avxb(_mm_castsi128_ps(_mm_cmpgt_epi32(a.l, b.l)), + _mm_castsi128_ps(_mm_cmpgt_epi32(a.h, b.h))); +} +#endif +__forceinline const avxb operator>(const avxi &a, const int32_t b) +{ + return a > avxi(b); +} +__forceinline const avxb operator>(const int32_t a, const avxi &b) +{ + return avxi(a) > b; +} + +__forceinline const avxb operator<=(const avxi &a, const avxi &b) +{ + return !(a > b); +} +__forceinline const avxb operator<=(const avxi &a, const int32_t b) +{ + return a <= avxi(b); +} +__forceinline const avxb operator<=(const int32_t a, const avxi &b) +{ + return avxi(a) <= b; +} + +__forceinline const avxi select(const avxb &m, const avxi &t, const avxi &f) +{ + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Movement/Shifting/Shuffling Functions +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__KERNEL_AVX2__) +__forceinline avxi unpacklo(const avxi &a, const avxi &b) +{ + return _mm256_unpacklo_epi32(a.m256, b.m256); +} +__forceinline avxi unpackhi(const avxi &a, const avxi &b) +{ + return _mm256_unpackhi_epi32(a.m256, b.m256); +} +#else +__forceinline avxi unpacklo(const avxi &a, const avxi &b) +{ + return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); +} +__forceinline avxi unpackhi(const avxi &a, const avxi &b) +{ + return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); +} +#endif + +template<size_t i> __forceinline const avxi shuffle(const avxi &a) +{ + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i))); +} + +template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a) +{ + return _mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0)); +} + +template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a, const avxi &b) +{ + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> +__forceinline const avxi shuffle(const avxi &a) +{ + return _mm256_castps_si256( + _mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0))); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> +__forceinline const avxi shuffle(const avxi &a, const avxi &b) +{ + return _mm256_castps_si256(_mm256_shuffle_ps( + _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); +} + +template<> __forceinline const avxi shuffle<0, 0, 2, 2>(const avxi &b) +{ + return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b))); +} +template<> __forceinline const avxi shuffle<1, 1, 3, 3>(const avxi &b) +{ + return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b))); +} +template<> __forceinline const avxi shuffle<0, 1, 0, 1>(const avxi &b) +{ + return _mm256_castps_si256( + _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b))))); +} + +__forceinline const avxi broadcast(const int *ptr) +{ + return _mm256_castps_si256(_mm256_broadcast_ss((const float *)ptr)); +} +template<size_t i> __forceinline const avxi insert(const avxi &a, const ssei &b) +{ + return _mm256_insertf128_si256(a, b, i); +} +template<size_t i> __forceinline const ssei 
extract(const avxi &a) +{ + return _mm256_extractf128_si256(a, i); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Reductions +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const avxi vreduce_min2(const avxi &v) +{ + return min(v, shuffle<1, 0, 3, 2>(v)); +} +__forceinline const avxi vreduce_min4(const avxi &v) +{ + avxi v1 = vreduce_min2(v); + return min(v1, shuffle<2, 3, 0, 1>(v1)); +} +__forceinline const avxi vreduce_min(const avxi &v) +{ + avxi v1 = vreduce_min4(v); + return min(v1, shuffle<1, 0>(v1)); +} + +__forceinline const avxi vreduce_max2(const avxi &v) +{ + return max(v, shuffle<1, 0, 3, 2>(v)); +} +__forceinline const avxi vreduce_max4(const avxi &v) +{ + avxi v1 = vreduce_max2(v); + return max(v1, shuffle<2, 3, 0, 1>(v1)); +} +__forceinline const avxi vreduce_max(const avxi &v) +{ + avxi v1 = vreduce_max4(v); + return max(v1, shuffle<1, 0>(v1)); +} + +__forceinline const avxi vreduce_add2(const avxi &v) +{ + return v + shuffle<1, 0, 3, 2>(v); +} +__forceinline const avxi vreduce_add4(const avxi &v) +{ + avxi v1 = vreduce_add2(v); + return v1 + shuffle<2, 3, 0, 1>(v1); +} +__forceinline const avxi vreduce_add(const avxi &v) +{ + avxi v1 = vreduce_add4(v); + return v1 + shuffle<1, 0>(v1); +} + +__forceinline int reduce_min(const avxi &v) +{ + return extract<0>(extract<0>(vreduce_min(v))); +} +__forceinline int reduce_max(const avxi &v) +{ + return extract<0>(extract<0>(vreduce_max(v))); +} +__forceinline int reduce_add(const avxi &v) +{ + return extract<0>(extract<0>(vreduce_add(v))); +} + +__forceinline size_t select_min(const avxi &v) +{ + return __bsf(movemask(v == vreduce_min(v))); +} +__forceinline size_t select_max(const avxi &v) +{ + return __bsf(movemask(v == vreduce_max(v))); +} + +__forceinline size_t select_min(const avxb &valid, const avxi &v) +{ + const avxi a = select(valid, v, avxi(pos_inf)); + return __bsf(movemask(valid & (a == vreduce_min(a)))); +} +__forceinline size_t select_max(const avxb &valid, const avxi &v) +{ + const avxi a = select(valid, v, avxi(neg_inf)); + return __bsf(movemask(valid & (a == vreduce_max(a)))); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Output Operators +//////////////////////////////////////////////////////////////////////////////// + +ccl_device_inline void print_avxi(const char *label, const avxi &a) +{ + printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]); +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h index ca48758efcd..0021eec169b 100644 --- a/intern/cycles/util/util_hash.h +++ b/intern/cycles/util/util_hash.h @@ -312,6 +312,60 @@ ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw) return c; } +# if defined(__KERNEL_AVX__) +ccl_device_inline avxi hash_avxi(avxi kx) +{ + avxi a, b, c; + a = b = c = avxi(0xdeadbeef + (1 << 2) + 13); + + a += kx; + final(a, b, c); + + return c; +} + +ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky) +{ + avxi a, b, c; + a = b = c = avxi(0xdeadbeef + (2 << 2) + 13); + + b += ky; + a += kx; + final(a, b, c); + + return c; +} + +ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz) +{ + avxi a, b, c; + a = b = c = avxi(0xdeadbeef + (3 << 2) + 13); + + c += kz; + b += ky; + a += kx; + final(a, b, c); + + return c; +} + +ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw) +{ + avxi a, b, c; + a = b = c = 
avxi(0xdeadbeef + (4 << 2) + 13); + + a += kx; + b += ky; + c += kz; + mix(a, b, c); + + a += kw; + final(a, b, c); + + return c; +} +# endif + # undef rot # undef final # undef mix diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index f49cfb4184d..de0e3c39f30 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -75,6 +75,28 @@ static struct FalseTy { } } False ccl_maybe_unused; +static struct ZeroTy { + __forceinline operator float() const + { + return 0; + } + __forceinline operator int() const + { + return 0; + } +} zero ccl_maybe_unused; + +static struct OneTy { + __forceinline operator float() const + { + return 1; + } + __forceinline operator int() const + { + return 1; + } +} one ccl_maybe_unused; + static struct NegInfTy { __forceinline operator float() const { @@ -97,6 +119,9 @@ static struct PosInfTy { } } inf ccl_maybe_unused, pos_inf ccl_maybe_unused; +static struct StepTy { +} step ccl_maybe_unused; + /* Intrinsics Functions */ # if defined(__BMI__) && defined(__GNUC__) @@ -563,6 +588,13 @@ __forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags) # endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */ +/* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves. + * _mm256_castps256_ps128 generates no instructions so this is just as efficient. */ +# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) +# undef _mm256_cvtss_f32 +# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a))) +# endif + # else /* __KERNEL_SSE2__ */ /* This section is for utility functions which operates on non-register data diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index f6535848480..a721595667d 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -158,6 +158,7 @@ CCL_NAMESPACE_END # if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) # include "util/util_avxb.h" # include "util/util_avxf.h" +# include "util/util_avxi.h" # endif #endif |
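The new SSE `perlin_2d()` in svm_noise.h above packs the four lattice-corner gradients into one `ssef` and interpolates them with `bi_mix()`. For reference, here is a plain-scalar sketch of the same structure, assuming a simplified placeholder lattice hash (Cycles actually uses the Jenkins-style `hash_ssei2`); it is illustration only, not the Cycles source:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

/* Quintic fade curve 6t^5 - 15t^4 + 10t^3, the same polynomial as fade(). */
static float fade(float t)
{
  return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
}

static float lerp(float a, float b, float t)
{
  return a + t * (b - a);
}

/* 2D gradient: pick (x, y) or (y, x) from the low bits of the hash and flip
 * signs from bits 0 and 1, mirroring grad() and negate_if_nth_bit. */
static float grad2(uint32_t h, float x, float y)
{
  h &= 7;
  const float u = (h < 4) ? x : y;
  const float v = 2.0f * ((h < 4) ? y : x);
  return ((h & 1) ? -u : u) + ((h & 2) ? -v : v);
}

/* Placeholder lattice hash for illustration only; Cycles uses hash_uint2(). */
static uint32_t hash2(int x, int y)
{
  uint32_t h = (uint32_t)x * 73856093u ^ (uint32_t)y * 19349663u;
  h ^= h >> 13;
  return h * 0x85ebca6bu;
}

/* Scalar equivalent of the SSE perlin_2d(): hash the four lattice corners,
 * evaluate a gradient at each with its local offset, then bilinearly
 * interpolate with the faded fractional coordinates. */
static float perlin_2d_scalar(float x, float y)
{
  const int X = (int)floorf(x), Y = (int)floorf(y);
  const float fx = x - X, fy = y - Y;
  const float u = fade(fx), v = fade(fy);

  const float g00 = grad2(hash2(X, Y), fx, fy);
  const float g10 = grad2(hash2(X + 1, Y), fx - 1.0f, fy);
  const float g01 = grad2(hash2(X, Y + 1), fx, fy - 1.0f);
  const float g11 = grad2(hash2(X + 1, Y + 1), fx - 1.0f, fy - 1.0f);

  return lerp(lerp(g00, g10, u), lerp(g01, g11, u), v);
}

int main()
{
  /* Sample the noise on a tiny grid. */
  for (int j = 0; j < 3; j++) {
    for (int i = 0; i < 3; i++) {
      printf("%8.4f ", perlin_2d_scalar(i * 0.37f, j * 0.53f));
    }
    printf("\n");
  }
  return 0;
}
```

The fade polynomial and the bit tricks in `grad2()` mirror `fade()` and `negate_if_nth_bit` from the diff; only the lattice hash differs, so the values will not match Cycles output, but the interpolation structure is the same one the SSE and AVX paths vectorize.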