
git.blender.org/blender.git
author     Jacques Lucke <jacques@blender.org>  2020-04-06 12:10:19 +0300
committer  Jacques Lucke <jacques@blender.org>  2020-04-06 12:10:19 +0300
commit     43f895a59247ea4058cb3f019cd4dabd9ad9b0e4 (patch)
tree       51469fde8affa6b20f67d12a4602c5f109c1a204 /intern/cycles
parent     52606afaa60462db45e783607255f56c06fd8d73 (diff)
parent     480ff89bf7cfb1f9ffd5ce66fbd5c65288ef04c0 (diff)
Merge branch 'master' into functions
Diffstat (limited to 'intern/cycles')
-rw-r--r--  intern/cycles/blender/addon/engine.py    |  13
-rw-r--r--  intern/cycles/blender/blender_sync.cpp   |  14
-rw-r--r--  intern/cycles/kernel/svm/svm_noise.h     | 261
-rw-r--r--  intern/cycles/render/buffers.cpp         |  89
-rw-r--r--  intern/cycles/render/film.cpp            |   3
-rw-r--r--  intern/cycles/render/film.h              |   1
-rw-r--r--  intern/cycles/util/util_avxb.h           |  28
-rw-r--r--  intern/cycles/util/util_avxf.h           |  68
-rw-r--r--  intern/cycles/util/util_avxi.h           | 745
-rw-r--r--  intern/cycles/util/util_hash.h           |  54
-rw-r--r--  intern/cycles/util/util_simd.h           |  32
-rw-r--r--  intern/cycles/util/util_types.h          |   1
12 files changed, 1175 insertions(+), 134 deletions(-)
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 2b872bb5c39..a1b063430f5 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -260,15 +260,16 @@ def list_render_passes(srl):
if crl.use_pass_volume_indirect: yield ("VolumeInd", "RGB", 'COLOR')
# Cryptomatte passes.
+ crypto_depth = (crl.pass_crypto_depth + 1) // 2
if crl.use_pass_crypto_object:
- for i in range(0, crl.pass_crypto_depth, 2):
- yield ("CryptoObject" + '{:02d}'.format(i//2), "RGBA", 'COLOR')
+ for i in range(0, crypto_depth):
+ yield ("CryptoObject" + '{:02d}'.format(i), "RGBA", 'COLOR')
if crl.use_pass_crypto_material:
- for i in range(0, crl.pass_crypto_depth, 2):
- yield ("CryptoMaterial" + '{:02d}'.format(i//2), "RGBA", 'COLOR')
+ for i in range(0, crypto_depth):
+ yield ("CryptoMaterial" + '{:02d}'.format(i), "RGBA", 'COLOR')
if srl.cycles.use_pass_crypto_asset:
- for i in range(0, srl.cycles.pass_crypto_depth, 2):
- yield ("CryptoAsset" + '{:02d}'.format(i//2), "RGBA", 'COLOR')
+ for i in range(0, crypto_depth):
+ yield ("CryptoAsset" + '{:02d}'.format(i), "RGBA", 'COLOR')
# Denoising passes.
if crl.use_denoising or crl.denoising_store_passes:
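
The hunk above changes what the loop bound means: pass_crypto_depth is the user-facing number of ID/weight pairs, and each RGBA Cryptomatte pass stores two pairs, so the pass count becomes (depth + 1) // 2. A minimal Python sketch of the new naming, with a hypothetical stand-in object instead of Blender's real crl RNA struct:

from collections import namedtuple

# Hypothetical stand-in for the view layer's Cycles settings (crl).
CryptoSettings = namedtuple("CryptoSettings", "use_pass_crypto_object pass_crypto_depth")

def crypto_object_passes(crl):
    # Each RGBA pass holds two ID/weight pairs, so round the pair count up.
    crypto_depth = (crl.pass_crypto_depth + 1) // 2
    if crl.use_pass_crypto_object:
        for i in range(crypto_depth):
            yield ("CryptoObject" + '{:02d}'.format(i), "RGBA", 'COLOR')

# With a depth of 6 pairs this yields CryptoObject00..02, the same names the
# old range(0, depth, 2) loop produced with its i // 2 suffix.
print(list(crypto_object_passes(CryptoSettings(True, 6))))
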
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 28a737c3341..9e95cdb3f20 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -633,12 +633,12 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
/* Cryptomatte stores two ID/weight pairs per RGBA layer.
* User facing parameter is the number of pairs. */
- int crypto_depth = min(16, get_int(crp, "pass_crypto_depth"));
+ int crypto_depth = divide_up(min(16, get_int(crp, "pass_crypto_depth")), 2);
scene->film->cryptomatte_depth = crypto_depth;
scene->film->cryptomatte_passes = CRYPT_NONE;
if (get_boolean(crp, "use_pass_crypto_object")) {
- for (int i = 0; i < crypto_depth; i += 2) {
- string passname = cryptomatte_prefix + string_printf("Object%02d", i / 2);
+ for (int i = 0; i < crypto_depth; i++) {
+ string passname = cryptomatte_prefix + string_printf("Object%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
}
@@ -646,8 +646,8 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
CRYPT_OBJECT);
}
if (get_boolean(crp, "use_pass_crypto_material")) {
- for (int i = 0; i < crypto_depth; i += 2) {
- string passname = cryptomatte_prefix + string_printf("Material%02d", i / 2);
+ for (int i = 0; i < crypto_depth; i++) {
+ string passname = cryptomatte_prefix + string_printf("Material%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
}
@@ -655,8 +655,8 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
CRYPT_MATERIAL);
}
if (get_boolean(crp, "use_pass_crypto_asset")) {
- for (int i = 0; i < crypto_depth; i += 2) {
- string passname = cryptomatte_prefix + string_printf("Asset%02d", i / 2);
+ for (int i = 0; i < crypto_depth; i++) {
+ string passname = cryptomatte_prefix + string_printf("Asset%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
}
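
On the C++ side the same conversion is written with divide_up(min(16, ...), 2), so an odd pair count still gets a full RGBA pass for its last, unpaired entry. A short sketch, assuming divide_up(x, y) has the usual integer-ceiling meaning (x + y - 1) // y:

def divide_up(x, y):
    # Assumed semantics of Cycles' divide_up(): ceiling integer division.
    return (x + y - 1) // y

for pairs in (1, 2, 5, 6, 16):
    print(pairs, "pairs ->", divide_up(min(16, pairs), 2), "RGBA passes")
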
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index a16b226d8de..914ef2089a9 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -65,7 +65,7 @@ ccl_device_noinline_cpu float perlin_1d(float x)
* supported, we do a standard implementation, but if it is supported, we
* do an implementation using SSE intrinsics.
*/
-#ifndef __KERNEL_SSE2__
+#if !defined(__KERNEL_SSE2__)
/* ** Standard Implementation ** */
@@ -266,7 +266,7 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
return r;
}
-#else
+#else /* SSE is supported. */
/* ** SSE Implementation ** */
@@ -300,6 +300,57 @@ ccl_device_inline ssef bi_mix(ssef p, ssef f)
return mix(g, shuffle<1>(g), shuffle<1>(f));
}
+ccl_device_inline ssef fade(const ssef &t)
+{
+ ssef a = madd(t, 6.0f, -15.0f);
+ ssef b = madd(t, a, 10.0f);
+ return (t * t) * (t * b);
+}
+
+/* Negate val if the nth bit of h is 1. */
+# define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n))))
+
+ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
+{
+ ssei h = hash & 7;
+ ssef u = select(h < 4, x, y);
+ ssef v = 2.0f * select(h < 4, y, x);
+ return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
+}
+
+/* We use SSE to compute and interpolate 4 gradients at once:
+ *
+ * Point Offset from v0
+ * v0 (0, 0)
+ * v1 (0, 1)
+ * v2 (1, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(V, V + 1))
+ * v3 (1, 1) ^
+ * | |__________| (0, 0, 1, 1) = shuffle<0, 0, 0, 0>(V, V + 1)
+ * | ^
+ * |__________________________|
+ *
+ */
+ccl_device_noinline float perlin_2d(float x, float y)
+{
+ ssei XY;
+ ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY);
+ ssef uv = fade(fxy);
+
+ ssei XY1 = XY + 1;
+ ssei X = shuffle<0, 0, 0, 0>(XY, XY1);
+ ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
+
+ ssei h = hash_ssei2(X, Y);
+
+ ssef fxy1 = fxy - 1.0f;
+ ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
+ ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
+
+ ssef g = grad(h, fx, fy);
+
+ return extract<0>(bi_mix(g, uv));
+}
+
/* SSE Trilinear Interpolation:
*
* The function takes three ssef inputs:
@@ -340,34 +391,12 @@ ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
return mix(g, shuffle<1>(g), shuffle<2>(f));
}
-/* SSE Quadrilinear Interpolation:
- *
- * Quadrilinear interpolation is as simple as a linear interpolation
- * between two trilinear interpolations.
- *
+/* 3D and 4D noise can be accelerated using AVX, so we first check if AVX
+ * is supported, that is, if __KERNEL_AVX__ is defined. If it is not
+ * supported, we do an SSE implementation, but if it is supported,
+ * we do an implementation using AVX intrinsics.
*/
-ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
-{
- return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f));
-}
-
-ccl_device_inline ssef fade(const ssef &t)
-{
- ssef a = madd(t, 6.0f, -15.0f);
- ssef b = madd(t, a, 10.0f);
- return (t * t) * (t * b);
-}
-
-/* Negate val if the nth bit of h is 1. */
-# define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n))))
-
-ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
-{
- ssei h = hash & 7;
- ssef u = select(h < 4, x, y);
- ssef v = 2.0f * select(h < 4, y, x);
- return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
-}
+# if !defined(__KERNEL_AVX__)
ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z)
{
@@ -388,37 +417,15 @@ grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
}
-/* We use SSE to compute and interpolate 4 gradients at once:
+/* SSE Quadrilinear Interpolation:
*
- * Point Offset from v0
- * v0 (0, 0)
- * v1 (0, 1)
- * v2 (1, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(V, V + 1))
- * v3 (1, 1) ^
- * | |__________| (0, 0, 1, 1) = shuffle<0, 0, 0, 0>(V, V + 1)
- * | ^
- * |__________________________|
+ * Quadrilinear interpolation is as simple as a linear interpolation
+ * between two trilinear interpolations.
*
*/
-ccl_device_noinline float perlin_2d(float x, float y)
+ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
{
- ssei XY;
- ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY);
- ssef uv = fade(fxy);
-
- ssei XY1 = XY + 1;
- ssei X = shuffle<0, 0, 0, 0>(XY, XY1);
- ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
-
- ssei h = hash_ssei2(X, Y);
-
- ssef fxy1 = fxy - 1.0f;
- ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
- ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
-
- ssef g = grad(h, fx, fy);
-
- return extract<0>(bi_mix(g, uv));
+ return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f));
}
/* We use SSE to compute and interpolate 4 gradients at once. Since we have 8
@@ -522,6 +529,148 @@ ccl_device_noinline float perlin_4d(float x, float y, float z, float w)
return extract<0>(quad_mix(g1, g2, g3, g4, uvws));
}
+
+# else /* AVX is supported. */
+
+/* AVX Implementation */
+
+ccl_device_inline avxf grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z)
+{
+ avxi h = hash & 15;
+ avxf u = select(h < 8, x, y);
+ avxf vt = select((h == 12) | (h == 14), x, z);
+ avxf v = select(h < 4, y, vt);
+ return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
+}
+
+ccl_device_inline avxf
+grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &w)
+{
+ avxi h = hash & 31;
+ avxf u = select(h < 24, x, y);
+ avxf v = select(h < 16, y, z);
+ avxf s = select(h < 8, z, w);
+ return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
+}
+
+/* SSE Quadrilinear Interpolation:
+ *
+ * The interpolation is done in two steps:
+ * 1. Interpolate p and q along the w axis to get s.
+ * 2. Trilinearly interpolate (s0, s1, s2, s3) and (s4, s5, s6, s7) to get the final
+ * value. (s0, s1, s2, s3) and (s4, s5, s6, s7) are generated by extracting the
+ * low and high ssef from s.
+ *
+ */
+ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
+{
+ ssef fv = shuffle<3>(f);
+ avxf s = mix(p, q, avxf(fv, fv));
+ return tri_mix(low(s), high(s), f);
+}
+
+/* We use AVX to compute and interpolate 8 gradients at once.
+ *
+ * Point Offset from v0
+ * v0 (0, 0, 0)
+ * v1 (0, 0, 1) The full avx type is computed by inserting the following
+ * v2 (0, 1, 0) sse types into both the low and high parts of the avx.
+ * v3 (0, 1, 1)
+ * v4 (1, 0, 0)
+ * v5 (1, 0, 1) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(V, V + 1))
+ * v6 (1, 1, 0) ^
+ * v7 (1, 1, 1) |
+ * | |__________| (0, 0, 1, 1) = shuffle<1, 1, 1, 1>(V, V + 1)
+ * | ^
+ * |__________________________|
+ *
+ */
+ccl_device_noinline float perlin_3d(float x, float y, float z)
+{
+ ssei XYZ;
+ ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
+ ssef uvw = fade(fxyz);
+
+ ssei XYZ1 = XYZ + 1;
+ ssei X = shuffle<0>(XYZ);
+ ssei X1 = shuffle<0>(XYZ1);
+ ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
+ ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
+
+ avxi h = hash_avxi3(avxi(X, X1), avxi(Y, Y), avxi(Z, Z));
+
+ ssef fxyz1 = fxyz - 1.0f;
+ ssef fx = shuffle<0>(fxyz);
+ ssef fx1 = shuffle<0>(fxyz1);
+ ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
+ ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
+
+ avxf g = grad(h, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz));
+
+ return extract<0>(tri_mix(low(g), high(g), uvw));
+}
+
+/* We use AVX to compute and interpolate 8 gradients at once. Since we have 16
+ * gradients in 4D, we need to compute two sets of gradients at the points:
+ *
+ * Point Offset from v0
+ * v0 (0, 0, 0, 0)
+ * v1 (0, 0, 1, 0) The full avx type is computed by inserting the following
+ * v2 (0, 1, 0, 0) sse types into both the low and high parts of the avx.
+ * v3 (0, 1, 1, 0)
+ * v4 (1, 0, 0, 0)
+ * v5 (1, 0, 1, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(V, V + 1))
+ * v6 (1, 1, 0, 0) ^
+ * v7 (1, 1, 1, 0) |
+ * | |________| (0, 0, 1, 1) = shuffle<1, 1, 1, 1>(V, V + 1)
+ * | ^
+ * |_______________________|
+ *
+ * Point Offset from v0
+ * v8 (0, 0, 0, 1)
+ * v9 (0, 0, 1, 1)
+ * v10 (0, 1, 0, 1)
+ * v11 (0, 1, 1, 1)
+ * v12 (1, 0, 0, 1)
+ * v13 (1, 0, 1, 1)
+ * v14 (1, 1, 0, 1)
+ * v15 (1, 1, 1, 1)
+ *
+ */
+ccl_device_noinline float perlin_4d(float x, float y, float z, float w)
+{
+ ssei XYZW;
+ ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
+ ssef uvws = fade(fxyzw);
+
+ ssei XYZW1 = XYZW + 1;
+ ssei X = shuffle<0>(XYZW);
+ ssei X1 = shuffle<0>(XYZW1);
+ ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
+ ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
+ ssei W = shuffle<3>(XYZW);
+ ssei W1 = shuffle<3>(XYZW1);
+
+ avxi h1 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W, W));
+ avxi h2 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W1, W1));
+
+ ssef fxyzw1 = fxyzw - 1.0f;
+ ssef fx = shuffle<0>(fxyzw);
+ ssef fx1 = shuffle<0>(fxyzw1);
+ ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
+ ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
+ ssef fw = shuffle<3>(fxyzw);
+ ssef fw1 = shuffle<3>(fxyzw1);
+
+ avxf g1 = grad(h1, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw, fw));
+ avxf g2 = grad(h2, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw1, fw1));
+
+ return extract<0>(quad_mix(g1, g2, uvws));
+}
+# endif
+
+# undef negate_if_nth_bit
+
#endif
/* Remap the output of noise to a predictable range [-1, 1].
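
The SSE and AVX paths above all vectorize the same scalar Perlin recipe: a quintic fade curve, a gradient chosen from the hash with its sign bits applied through negate_if_nth_bit, and bi/tri/quadrilinear mixing of the corner gradients. A scalar Python sketch of the 2D case, with a hypothetical stand-in corner hash where the kernel uses hash_ssei2:

import math

def fade(t):
    # Quintic fade 6t^5 - 15t^4 + 10t^3, factored the way the kernel writes it.
    return (t * t) * (t * (t * (t * 6.0 - 15.0) + 10.0))

def grad2(h, x, y):
    # negate_if_nth_bit(u, h, n) flips u's sign when bit n of h is set.
    h &= 7
    u = x if h < 4 else y
    v = 2.0 * (y if h < 4 else x)
    return (-u if h & 1 else u) + (-v if h & 2 else v)

def mix(a, b, t):
    return (1.0 - t) * a + t * b

def stand_in_hash(xi, yi):
    # Hypothetical integer hash, only for this sketch.
    return ((xi * 73856093) ^ (yi * 19349663)) & 0xffffffff

def perlin_2d(x, y):
    X, Y = math.floor(x), math.floor(y)
    fx, fy = x - X, y - Y
    u, v = fade(fx), fade(fy)
    # Bilinear blend of the four corner gradients (what bi_mix does with shuffles).
    g0 = mix(grad2(stand_in_hash(X, Y), fx, fy),
             grad2(stand_in_hash(X, Y + 1), fx, fy - 1.0), v)
    g1 = mix(grad2(stand_in_hash(X + 1, Y), fx - 1.0, fy),
             grad2(stand_in_hash(X + 1, Y + 1), fx - 1.0, fy - 1.0), v)
    return mix(g0, g1, u)

print(perlin_2d(0.3, 0.7))
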
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index 2d89fb9ffba..22db8e875dc 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -165,6 +165,35 @@ bool RenderBuffers::copy_from_device()
return true;
}
+static const float *get_sample_count_pass(const vector<Pass> &passes, device_vector<float> &buffer)
+{
+ int sample_offset = 0;
+
+ for (const Pass &pass : passes) {
+ if (pass.type != PASS_SAMPLE_COUNT) {
+ sample_offset += pass.components;
+ }
+ else {
+ return buffer.data() + sample_offset;
+ }
+ }
+
+ return NULL;
+}
+
+static float get_pixel_pass_scale(const float rcp_sample,
+ const float *sample_count,
+ const int i,
+ const int pass_stride)
+{
+ if (sample_count) {
+ return 1.0f / fabsf(sample_count[i * pass_stride]);
+ }
+ else {
+ return rcp_sample;
+ }
+}
+
bool RenderBuffers::get_denoising_pass_rect(
int type, float exposure, int sample, int components, float *pixels)
{
@@ -260,22 +289,7 @@ bool RenderBuffers::get_pass_rect(
return false;
}
- float *sample_count = NULL;
- if (name == "Combined") {
- int sample_offset = 0;
- for (size_t j = 0; j < params.passes.size(); j++) {
- Pass &pass = params.passes[j];
- if (pass.type != PASS_SAMPLE_COUNT) {
- sample_offset += pass.components;
- continue;
- }
- else {
- sample_count = buffer.data() + sample_offset;
- break;
- }
- }
- }
-
+ const float *sample_count = get_sample_count_pass(params.passes, buffer);
int pass_offset = 0;
for (size_t j = 0; j < params.passes.size(); j++) {
@@ -293,8 +307,8 @@ bool RenderBuffers::get_pass_rect(
float *in = buffer.data() + pass_offset;
int pass_stride = params.get_passes_size();
- float scale = (pass.filter) ? 1.0f / (float)sample : 1.0f;
- float scale_exposure = (pass.exposure) ? scale * exposure : scale;
+ const float rcp_sample = 1.0f / (float)sample;
+ const float pass_exposure = (pass.exposure) ? exposure : 1.0f;
int size = params.width * params.height;
@@ -312,28 +326,36 @@ bool RenderBuffers::get_pass_rect(
if (type == PASS_DEPTH) {
for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
float f = *in;
- pixels[0] = (f == 0.0f) ? 1e10f : f * scale_exposure;
+ pixels[0] = (f == 0.0f) ? 1e10f : f;
+ }
+ }
+ else if (type == PASS_OBJECT_ID || type == PASS_MATERIAL_ID) {
+ for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
+ pixels[0] = *in;
}
}
else if (type == PASS_MIST) {
for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- float f = *in;
- pixels[0] = saturate(f * scale_exposure);
+ const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride);
+ const float f = *in;
+ pixels[0] = saturate(f * scale);
}
}
#ifdef WITH_CYCLES_DEBUG
else if (type == PASS_BVH_TRAVERSED_NODES || type == PASS_BVH_TRAVERSED_INSTANCES ||
type == PASS_BVH_INTERSECTIONS || type == PASS_RAY_BOUNCES) {
for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- float f = *in;
+ const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride);
+ const float f = *in;
pixels[0] = f * scale;
}
}
#endif
else {
for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- float f = *in;
- pixels[0] = f * scale_exposure;
+ const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride);
+ const float f = *in;
+ pixels[0] = f * scale * pass_exposure;
}
}
}
@@ -367,7 +389,7 @@ bool RenderBuffers::get_pass_rect(
float3 f = make_float3(in[0], in[1], in[2]);
float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]);
- f = safe_divide_even_color(f * exposure, f_divide);
+ f = safe_divide_even_color(f * pass_exposure, f_divide);
pixels[0] = f.x;
pixels[1] = f.y;
@@ -377,7 +399,9 @@ bool RenderBuffers::get_pass_rect(
else {
/* RGB/vector */
for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
- float3 f = make_float3(in[0], in[1], in[2]);
+ const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride);
+ const float scale_exposure = scale * pass_exposure;
+ const float3 f = make_float3(in[0], in[1], in[2]);
pixels[0] = f.x * scale_exposure;
pixels[1] = f.y * scale_exposure;
@@ -425,7 +449,9 @@ bool RenderBuffers::get_pass_rect(
}
else if (type == PASS_CRYPTOMATTE) {
for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
+ const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride);
+ const float4 f = make_float4(in[0], in[1], in[2], in[3]);
+
/* x and z contain integer IDs, don't rescale them.
y and w contain matte weights, they get scaled. */
pixels[0] = f.x;
@@ -436,12 +462,9 @@ bool RenderBuffers::get_pass_rect(
}
else {
for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
- if (sample_count && sample_count[i * pass_stride] < 0.0f) {
- scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f;
- scale_exposure = (pass.exposure) ? scale * exposure : scale;
- }
-
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
+ const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride);
+ const float scale_exposure = scale * pass_exposure;
+ const float4 f = make_float4(in[0], in[1], in[2], in[3]);
pixels[0] = f.x * scale_exposure;
pixels[1] = f.y * scale_exposure;
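
The buffers.cpp refactor pulls the per-pixel scale into get_pixel_pass_scale(): pixels are normally divided by the global sample count, but when a PASS_SAMPLE_COUNT pass exists (adaptive sampling) the per-pixel count is used instead, with exposure applied as a separate factor and Depth/ID passes not scaled at all. A small Python sketch of that rule, under those assumptions:

def pixel_pass_scale(rcp_sample, sample_count, i, pass_stride):
    # sample_count is the per-pixel count channel when adaptive sampling wrote
    # one (the count may be stored negated, hence the abs), else None.
    if sample_count is not None:
        return 1.0 / abs(sample_count[i * pass_stride])
    return rcp_sample

def scale_float_pass(values, sample, exposure, sample_count=None, pass_stride=1):
    rcp_sample = 1.0 / sample
    return [v * pixel_pass_scale(rcp_sample, sample_count, i, pass_stride) * exposure
            for i, v in enumerate(values)]

# 4 pixels accumulated over 16 samples, neutral exposure.
print(scale_float_pass([16.0, 8.0, 4.0, 0.0], sample=16, exposure=1.0))
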
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index baf02901123..c29810d1494 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -76,7 +76,6 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
Pass pass;
pass.type = type;
- pass.filter = true;
pass.exposure = false;
pass.divide_type = PASS_NONE;
if (name) {
@@ -93,7 +92,6 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
break;
case PASS_DEPTH:
pass.components = 1;
- pass.filter = false;
break;
case PASS_MIST:
pass.components = 1;
@@ -114,7 +112,6 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
case PASS_OBJECT_ID:
case PASS_MATERIAL_ID:
pass.components = 1;
- pass.filter = false;
break;
case PASS_EMISSION:
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index aae8fb404b0..0fe4fe89d5e 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -42,7 +42,6 @@ class Pass {
public:
PassType type;
int components;
- bool filter;
bool exposure;
PassType divide_type;
string name;
diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h
index 54dd8068eca..34fafd188de 100644
--- a/intern/cycles/util/util_avxb.h
+++ b/intern/cycles/util/util_avxb.h
@@ -16,7 +16,7 @@
*/
#ifndef __UTIL_AVXB_H__
-# define __UTIL_AVXB_H__
+#define __UTIL_AVXB_H__
CCL_NAMESPACE_BEGIN
@@ -53,6 +53,10 @@ struct avxb {
__forceinline avxb(const __m256 input) : m256(input)
{
}
+ __forceinline avxb(const __m128 &a, const __m128 &b)
+ : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1))
+ {
+ }
__forceinline operator const __m256 &(void)const
{
return m256;
@@ -146,9 +150,9 @@ __forceinline const avxb operator!=(const avxb &a, const avxb &b)
}
__forceinline const avxb operator==(const avxb &a, const avxb &b)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b));
-# else
+#else
__m128i a_lo = _mm_castps_si128(_mm256_extractf128_ps(a, 0));
__m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1));
__m128i b_lo = _mm_castps_si128(_mm256_extractf128_ps(b, 0));
@@ -157,16 +161,16 @@ __forceinline const avxb operator==(const avxb &a, const avxb &b)
__m128i c_hi = _mm_cmpeq_epi32(a_hi, b_hi);
__m256i result = _mm256_insertf128_si256(_mm256_castsi128_si256(c_lo), c_hi, 1);
return _mm256_castsi256_ps(result);
-# endif
+#endif
}
__forceinline const avxb select(const avxb &m, const avxb &t, const avxb &f)
{
-# if defined(__KERNEL_SSE41__)
+#if defined(__KERNEL_SSE41__)
return _mm256_blendv_ps(f, t, m);
-# else
+#else
return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
-# endif
+#endif
}
////////////////////////////////////////////////////////////////////////////////
@@ -186,18 +190,18 @@ __forceinline const avxb unpackhi(const avxb &a, const avxb &b)
/// Reduction Operations
////////////////////////////////////////////////////////////////////////////////
-# if defined(__KERNEL_SSE41__)
+#if defined(__KERNEL_SSE41__)
__forceinline size_t popcnt(const avxb &a)
{
return __popcnt(_mm256_movemask_ps(a));
}
-# else
+#else
__forceinline size_t popcnt(const avxb &a)
{
return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) +
bool(a[7]);
}
-# endif
+#endif
__forceinline bool reduce_and(const avxb &a)
{
@@ -234,8 +238,6 @@ ccl_device_inline void print_avxb(const char *label, const avxb &a)
printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
}
-#endif
-
CCL_NAMESPACE_END
-//#endif
+#endif
diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h
index 156607e65fb..1fb3ded422f 100644
--- a/intern/cycles/util/util_avxf.h
+++ b/intern/cycles/util/util_avxf.h
@@ -15,7 +15,7 @@
*/
#ifndef __UTIL_AVXF_H__
-# define __UTIL_AVXF_H__
+#define __UTIL_AVXF_H__
CCL_NAMESPACE_BEGIN
@@ -140,6 +140,11 @@ __forceinline void dot3(const avxf &a, const avxf &b, float &den, float &den2)
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
+__forceinline const avxf cast(const __m256i &a)
+{
+ return _mm256_castsi256_ps(a);
+}
+
__forceinline const avxf mm256_sqrt(const avxf &a)
{
return _mm256_sqrt_ps(a.m256);
@@ -259,16 +264,35 @@ template<size_t i0> __forceinline const avxf shuffle(const avxf &a)
return shuffle<i0>(a, a);
}
+template<size_t i> __forceinline float extract(const avxf &a)
+{
+ __m256 b = shuffle<i, i, i, i>(a).m256;
+ return _mm256_cvtss_f32(b);
+}
+template<> __forceinline float extract<0>(const avxf &a)
+{
+ return _mm256_cvtss_f32(a.m256);
+}
+
+__forceinline ssef low(const avxf &a)
+{
+ return _mm256_extractf128_ps(a.m256, 0);
+}
+__forceinline ssef high(const avxf &a)
+{
+ return _mm256_extractf128_ps(a.m256, 1);
+}
+
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
__forceinline const avxf permute(const avxf &a)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_permutevar8x32_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
-# else
+#else
float temp[8];
_mm256_storeu_ps((float *)&temp, a);
return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
-# endif
+#endif
}
template<int S0, int S1, int S2, int S3, int S4, int S5, int S6, int S7>
@@ -309,39 +333,51 @@ __forceinline avxf mini(const avxf &a, const avxf &b)
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxf madd(const avxf &a, const avxf &b, const avxf &c)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_fmadd_ps(a, b, c);
-# else
+#else
return c + (a * b);
-# endif
+#endif
}
__forceinline const avxf nmadd(const avxf &a, const avxf &b, const avxf &c)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_fnmadd_ps(a, b, c);
-# else
+#else
return c - (a * b);
-# endif
+#endif
}
__forceinline const avxf msub(const avxf &a, const avxf &b, const avxf &c)
{
-# ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
return _mm256_fmsub_ps(a, b, c);
-# else
+#else
return (a * b) - c;
-# endif
+#endif
}
////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators
+/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////
__forceinline const avxb operator<=(const avxf &a, const avxf &b)
{
return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
}
-#endif
+__forceinline const avxf select(const avxb &m, const avxf &t, const avxf &f)
+{
+ return _mm256_blendv_ps(f, t, m);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Common Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline avxf mix(const avxf &a, const avxf &b, const avxf &t)
+{
+ return madd(t, b, (avxf(1.0f) - t) * a);
+}
#ifndef _mm256_set_m128
# define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \
@@ -352,3 +388,5 @@ __forceinline const avxb operator<=(const avxf &a, const avxf &b)
_mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr))
CCL_NAMESPACE_END
+
+#endif
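
The new avxf helpers are what the AVX noise path needs: mix() is a lerp built from madd(), extract<0>() reads the first lane, and low()/high() split the 8 lanes into two 4-wide halves so the existing ssef tri_mix can finish the interpolation. A lane-wise Python sketch of those semantics (no intrinsics, just lists of 8 floats):

def mix(a, b, t):
    # madd(t, b, (1 - t) * a): per-lane linear interpolation.
    return [ti * bi + (1.0 - ti) * ai for ai, bi, ti in zip(a, b, t)]

def low(a):       # lanes 0..3 of the 8-wide vector
    return a[:4]

def high(a):      # lanes 4..7
    return a[4:]

def extract0(a):  # extract<0>: the first lane
    return a[0]

g = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
print(mix(g, g[::-1], [0.25] * 8), low(g), high(g), extract0(g))
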
diff --git a/intern/cycles/util/util_avxi.h b/intern/cycles/util/util_avxi.h
new file mode 100644
index 00000000000..e658a4f848f
--- /dev/null
+++ b/intern/cycles/util/util_avxi.h
@@ -0,0 +1,745 @@
+/*
+ * Copyright 2009-2013 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_AVXI_H__
+#define __UTIL_AVXI_H__
+
+CCL_NAMESPACE_BEGIN
+
+struct avxb;
+
+struct avxi {
+ typedef avxb Mask; // mask type for us
+ enum { size = 8 }; // number of SIMD elements
+ union { // data
+ __m256i m256;
+#if !defined(__KERNEL_AVX2__)
+ struct {
+ __m128i l, h;
+ };
+#endif
+ int32_t v[8];
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxi()
+ {
+ }
+ __forceinline avxi(const avxi &a)
+ {
+ m256 = a.m256;
+ }
+ __forceinline avxi &operator=(const avxi &a)
+ {
+ m256 = a.m256;
+ return *this;
+ }
+
+ __forceinline avxi(const __m256i a) : m256(a)
+ {
+ }
+ __forceinline operator const __m256i &(void)const
+ {
+ return m256;
+ }
+ __forceinline operator __m256i &(void)
+ {
+ return m256;
+ }
+
+ __forceinline explicit avxi(const ssei &a)
+ : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), a, 1))
+ {
+ }
+ __forceinline avxi(const ssei &a, const ssei &b)
+ : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
+ {
+ }
+#if defined(__KERNEL_AVX2__)
+ __forceinline avxi(const __m128i &a, const __m128i &b)
+ : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
+ {
+ }
+#else
+ __forceinline avxi(const __m128i &a, const __m128i &b) : l(a), h(b)
+ {
+ }
+#endif
+ __forceinline explicit avxi(const int32_t *const a)
+ : m256(_mm256_castps_si256(_mm256_loadu_ps((const float *)a)))
+ {
+ }
+ __forceinline avxi(int32_t a) : m256(_mm256_set1_epi32(a))
+ {
+ }
+ __forceinline avxi(int32_t a, int32_t b) : m256(_mm256_set_epi32(b, a, b, a, b, a, b, a))
+ {
+ }
+ __forceinline avxi(int32_t a, int32_t b, int32_t c, int32_t d)
+ : m256(_mm256_set_epi32(d, c, b, a, d, c, b, a))
+ {
+ }
+ __forceinline avxi(
+ int32_t a, int32_t b, int32_t c, int32_t d, int32_t e, int32_t f, int32_t g, int32_t h)
+ : m256(_mm256_set_epi32(h, g, f, e, d, c, b, a))
+ {
+ }
+
+ __forceinline explicit avxi(const __m256 a) : m256(_mm256_cvtps_epi32(a))
+ {
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline avxi(ZeroTy) : m256(_mm256_setzero_si256())
+ {
+ }
+#if defined(__KERNEL_AVX2__)
+ __forceinline avxi(OneTy) : m256(_mm256_set1_epi32(1))
+ {
+ }
+ __forceinline avxi(PosInfTy) : m256(_mm256_set1_epi32(pos_inf))
+ {
+ }
+ __forceinline avxi(NegInfTy) : m256(_mm256_set1_epi32(neg_inf))
+ {
+ }
+#else
+ __forceinline avxi(OneTy) : m256(_mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1))
+ {
+ }
+ __forceinline avxi(PosInfTy)
+ : m256(_mm256_set_epi32(
+ pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf))
+ {
+ }
+ __forceinline avxi(NegInfTy)
+ : m256(_mm256_set_epi32(
+ neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf))
+ {
+ }
+#endif
+ __forceinline avxi(StepTy) : m256(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0))
+ {
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const int32_t &operator[](const size_t i) const
+ {
+ assert(i < 8);
+ return v[i];
+ }
+ __forceinline int32_t &operator[](const size_t i)
+ {
+ assert(i < 8);
+ return v[i];
+ }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxi cast(const __m256 &a)
+{
+ return _mm256_castps_si256(a);
+}
+__forceinline const avxi operator+(const avxi &a)
+{
+ return a;
+}
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator-(const avxi &a)
+{
+ return _mm256_sub_epi32(_mm256_setzero_si256(), a.m256);
+}
+__forceinline const avxi abs(const avxi &a)
+{
+ return _mm256_abs_epi32(a.m256);
+}
+#else
+__forceinline const avxi operator-(const avxi &a)
+{
+ return avxi(_mm_sub_epi32(_mm_setzero_si128(), a.l), _mm_sub_epi32(_mm_setzero_si128(), a.h));
+}
+__forceinline const avxi abs(const avxi &a)
+{
+ return avxi(_mm_abs_epi32(a.l), _mm_abs_epi32(a.h));
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator+(const avxi &a, const avxi &b)
+{
+ return _mm256_add_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator+(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_add_epi32(a.l, b.l), _mm_add_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi operator+(const avxi &a, const int32_t b)
+{
+ return a + avxi(b);
+}
+__forceinline const avxi operator+(const int32_t a, const avxi &b)
+{
+ return avxi(a) + b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator-(const avxi &a, const avxi &b)
+{
+ return _mm256_sub_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator-(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_sub_epi32(a.l, b.l), _mm_sub_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi operator-(const avxi &a, const int32_t b)
+{
+ return a - avxi(b);
+}
+__forceinline const avxi operator-(const int32_t a, const avxi &b)
+{
+ return avxi(a) - b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator*(const avxi &a, const avxi &b)
+{
+ return _mm256_mullo_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator*(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_mullo_epi32(a.l, b.l), _mm_mullo_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi operator*(const avxi &a, const int32_t b)
+{
+ return a * avxi(b);
+}
+__forceinline const avxi operator*(const int32_t a, const avxi &b)
+{
+ return avxi(a) * b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator&(const avxi &a, const avxi &b)
+{
+ return _mm256_and_si256(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator&(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+#endif
+__forceinline const avxi operator&(const avxi &a, const int32_t b)
+{
+ return a & avxi(b);
+}
+__forceinline const avxi operator&(const int32_t a, const avxi &b)
+{
+ return avxi(a) & b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator|(const avxi &a, const avxi &b)
+{
+ return _mm256_or_si256(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator|(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+#endif
+__forceinline const avxi operator|(const avxi &a, const int32_t b)
+{
+ return a | avxi(b);
+}
+__forceinline const avxi operator|(const int32_t a, const avxi &b)
+{
+ return avxi(a) | b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator^(const avxi &a, const avxi &b)
+{
+ return _mm256_xor_si256(a.m256, b.m256);
+}
+#else
+__forceinline const avxi operator^(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+#endif
+__forceinline const avxi operator^(const avxi &a, const int32_t b)
+{
+ return a ^ avxi(b);
+}
+__forceinline const avxi operator^(const int32_t a, const avxi &b)
+{
+ return avxi(a) ^ b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi operator<<(const avxi &a, const int32_t n)
+{
+ return _mm256_slli_epi32(a.m256, n);
+}
+__forceinline const avxi operator>>(const avxi &a, const int32_t n)
+{
+ return _mm256_srai_epi32(a.m256, n);
+}
+
+__forceinline const avxi sra(const avxi &a, const int32_t b)
+{
+ return _mm256_srai_epi32(a.m256, b);
+}
+__forceinline const avxi srl(const avxi &a, const int32_t b)
+{
+ return _mm256_srli_epi32(a.m256, b);
+}
+#else
+__forceinline const avxi operator<<(const avxi &a, const int32_t n)
+{
+ return avxi(_mm_slli_epi32(a.l, n), _mm_slli_epi32(a.h, n));
+}
+__forceinline const avxi operator>>(const avxi &a, const int32_t n)
+{
+ return avxi(_mm_srai_epi32(a.l, n), _mm_srai_epi32(a.h, n));
+}
+
+__forceinline const avxi sra(const avxi &a, const int32_t b)
+{
+ return avxi(_mm_srai_epi32(a.l, b), _mm_srai_epi32(a.h, b));
+}
+__forceinline const avxi srl(const avxi &a, const int32_t b)
+{
+ return avxi(_mm_srli_epi32(a.l, b), _mm_srli_epi32(a.h, b));
+}
+#endif
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi min(const avxi &a, const avxi &b)
+{
+ return _mm256_min_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi min(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_min_epi32(a.l, b.l), _mm_min_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi min(const avxi &a, const int32_t b)
+{
+ return min(a, avxi(b));
+}
+__forceinline const avxi min(const int32_t a, const avxi &b)
+{
+ return min(avxi(a), b);
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxi max(const avxi &a, const avxi &b)
+{
+ return _mm256_max_epi32(a.m256, b.m256);
+}
+#else
+__forceinline const avxi max(const avxi &a, const avxi &b)
+{
+ return avxi(_mm_max_epi32(a.l, b.l), _mm_max_epi32(a.h, b.h));
+}
+#endif
+__forceinline const avxi max(const avxi &a, const int32_t b)
+{
+ return max(a, avxi(b));
+}
+__forceinline const avxi max(const int32_t a, const avxi &b)
+{
+ return max(avxi(a), b);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Assignment Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline avxi &operator+=(avxi &a, const avxi &b)
+{
+ return a = a + b;
+}
+__forceinline avxi &operator+=(avxi &a, const int32_t b)
+{
+ return a = a + b;
+}
+
+__forceinline avxi &operator-=(avxi &a, const avxi &b)
+{
+ return a = a - b;
+}
+__forceinline avxi &operator-=(avxi &a, const int32_t b)
+{
+ return a = a - b;
+}
+
+__forceinline avxi &operator*=(avxi &a, const avxi &b)
+{
+ return a = a * b;
+}
+__forceinline avxi &operator*=(avxi &a, const int32_t b)
+{
+ return a = a * b;
+}
+
+__forceinline avxi &operator&=(avxi &a, const avxi &b)
+{
+ return a = a & b;
+}
+__forceinline avxi &operator&=(avxi &a, const int32_t b)
+{
+ return a = a & b;
+}
+
+__forceinline avxi &operator|=(avxi &a, const avxi &b)
+{
+ return a = a | b;
+}
+__forceinline avxi &operator|=(avxi &a, const int32_t b)
+{
+ return a = a | b;
+}
+
+__forceinline avxi &operator^=(avxi &a, const avxi &b)
+{
+ return a = a ^ b;
+}
+__forceinline avxi &operator^=(avxi &a, const int32_t b)
+{
+ return a = a ^ b;
+}
+
+__forceinline avxi &operator<<=(avxi &a, const int32_t b)
+{
+ return a = a << b;
+}
+__forceinline avxi &operator>>=(avxi &a, const int32_t b)
+{
+ return a = a >> b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators + Select
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxb operator==(const avxi &a, const avxi &b)
+{
+ return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a.m256, b.m256));
+}
+#else
+__forceinline const avxb operator==(const avxi &a, const avxi &b)
+{
+ return avxb(_mm_castsi128_ps(_mm_cmpeq_epi32(a.l, b.l)),
+ _mm_castsi128_ps(_mm_cmpeq_epi32(a.h, b.h)));
+}
+#endif
+__forceinline const avxb operator==(const avxi &a, const int32_t b)
+{
+ return a == avxi(b);
+}
+__forceinline const avxb operator==(const int32_t a, const avxi &b)
+{
+ return avxi(a) == b;
+}
+
+__forceinline const avxb operator!=(const avxi &a, const avxi &b)
+{
+ return !(a == b);
+}
+__forceinline const avxb operator!=(const avxi &a, const int32_t b)
+{
+ return a != avxi(b);
+}
+__forceinline const avxb operator!=(const int32_t a, const avxi &b)
+{
+ return avxi(a) != b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxb operator<(const avxi &a, const avxi &b)
+{
+ return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b.m256, a.m256));
+}
+#else
+__forceinline const avxb operator<(const avxi &a, const avxi &b)
+{
+ return avxb(_mm_castsi128_ps(_mm_cmplt_epi32(a.l, b.l)),
+ _mm_castsi128_ps(_mm_cmplt_epi32(a.h, b.h)));
+}
+#endif
+__forceinline const avxb operator<(const avxi &a, const int32_t b)
+{
+ return a < avxi(b);
+}
+__forceinline const avxb operator<(const int32_t a, const avxi &b)
+{
+ return avxi(a) < b;
+}
+
+__forceinline const avxb operator>=(const avxi &a, const avxi &b)
+{
+ return !(a < b);
+}
+__forceinline const avxb operator>=(const avxi &a, const int32_t b)
+{
+ return a >= avxi(b);
+}
+__forceinline const avxb operator>=(const int32_t a, const avxi &b)
+{
+ return avxi(a) >= b;
+}
+
+#if defined(__KERNEL_AVX2__)
+__forceinline const avxb operator>(const avxi &a, const avxi &b)
+{
+ return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a.m256, b.m256));
+}
+#else
+__forceinline const avxb operator>(const avxi &a, const avxi &b)
+{
+ return avxb(_mm_castsi128_ps(_mm_cmpgt_epi32(a.l, b.l)),
+ _mm_castsi128_ps(_mm_cmpgt_epi32(a.h, b.h)));
+}
+#endif
+__forceinline const avxb operator>(const avxi &a, const int32_t b)
+{
+ return a > avxi(b);
+}
+__forceinline const avxb operator>(const int32_t a, const avxi &b)
+{
+ return avxi(a) > b;
+}
+
+__forceinline const avxb operator<=(const avxi &a, const avxi &b)
+{
+ return !(a > b);
+}
+__forceinline const avxb operator<=(const avxi &a, const int32_t b)
+{
+ return a <= avxi(b);
+}
+__forceinline const avxb operator<=(const int32_t a, const avxi &b)
+{
+ return avxi(a) <= b;
+}
+
+__forceinline const avxi select(const avxb &m, const avxi &t, const avxi &f)
+{
+ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_AVX2__)
+__forceinline avxi unpacklo(const avxi &a, const avxi &b)
+{
+ return _mm256_unpacklo_epi32(a.m256, b.m256);
+}
+__forceinline avxi unpackhi(const avxi &a, const avxi &b)
+{
+ return _mm256_unpackhi_epi32(a.m256, b.m256);
+}
+#else
+__forceinline avxi unpacklo(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+__forceinline avxi unpackhi(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+#endif
+
+template<size_t i> __forceinline const avxi shuffle(const avxi &a)
+{
+ return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i)));
+}
+
+template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a)
+{
+ return _mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0));
+}
+
+template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a, const avxi &b)
+{
+ return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const avxi shuffle(const avxi &a)
+{
+ return _mm256_castps_si256(
+ _mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0)));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const avxi shuffle(const avxi &a, const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+}
+
+template<> __forceinline const avxi shuffle<0, 0, 2, 2>(const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b)));
+}
+template<> __forceinline const avxi shuffle<1, 1, 3, 3>(const avxi &b)
+{
+ return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b)));
+}
+template<> __forceinline const avxi shuffle<0, 1, 0, 1>(const avxi &b)
+{
+ return _mm256_castps_si256(
+ _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b)))));
+}
+
+__forceinline const avxi broadcast(const int *ptr)
+{
+ return _mm256_castps_si256(_mm256_broadcast_ss((const float *)ptr));
+}
+template<size_t i> __forceinline const avxi insert(const avxi &a, const ssei &b)
+{
+ return _mm256_insertf128_si256(a, b, i);
+}
+template<size_t i> __forceinline const ssei extract(const avxi &a)
+{
+ return _mm256_extractf128_si256(a, i);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reductions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxi vreduce_min2(const avxi &v)
+{
+ return min(v, shuffle<1, 0, 3, 2>(v));
+}
+__forceinline const avxi vreduce_min4(const avxi &v)
+{
+ avxi v1 = vreduce_min2(v);
+ return min(v1, shuffle<2, 3, 0, 1>(v1));
+}
+__forceinline const avxi vreduce_min(const avxi &v)
+{
+ avxi v1 = vreduce_min4(v);
+ return min(v1, shuffle<1, 0>(v1));
+}
+
+__forceinline const avxi vreduce_max2(const avxi &v)
+{
+ return max(v, shuffle<1, 0, 3, 2>(v));
+}
+__forceinline const avxi vreduce_max4(const avxi &v)
+{
+ avxi v1 = vreduce_max2(v);
+ return max(v1, shuffle<2, 3, 0, 1>(v1));
+}
+__forceinline const avxi vreduce_max(const avxi &v)
+{
+ avxi v1 = vreduce_max4(v);
+ return max(v1, shuffle<1, 0>(v1));
+}
+
+__forceinline const avxi vreduce_add2(const avxi &v)
+{
+ return v + shuffle<1, 0, 3, 2>(v);
+}
+__forceinline const avxi vreduce_add4(const avxi &v)
+{
+ avxi v1 = vreduce_add2(v);
+ return v1 + shuffle<2, 3, 0, 1>(v1);
+}
+__forceinline const avxi vreduce_add(const avxi &v)
+{
+ avxi v1 = vreduce_add4(v);
+ return v1 + shuffle<1, 0>(v1);
+}
+
+__forceinline int reduce_min(const avxi &v)
+{
+ return extract<0>(extract<0>(vreduce_min(v)));
+}
+__forceinline int reduce_max(const avxi &v)
+{
+ return extract<0>(extract<0>(vreduce_max(v)));
+}
+__forceinline int reduce_add(const avxi &v)
+{
+ return extract<0>(extract<0>(vreduce_add(v)));
+}
+
+__forceinline size_t select_min(const avxi &v)
+{
+ return __bsf(movemask(v == vreduce_min(v)));
+}
+__forceinline size_t select_max(const avxi &v)
+{
+ return __bsf(movemask(v == vreduce_max(v)));
+}
+
+__forceinline size_t select_min(const avxb &valid, const avxi &v)
+{
+ const avxi a = select(valid, v, avxi(pos_inf));
+ return __bsf(movemask(valid & (a == vreduce_min(a))));
+}
+__forceinline size_t select_max(const avxb &valid, const avxi &v)
+{
+ const avxi a = select(valid, v, avxi(neg_inf));
+ return __bsf(movemask(valid & (a == vreduce_max(a))));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Output Operators
+////////////////////////////////////////////////////////////////////////////////
+
+ccl_device_inline void print_avxi(const char *label, const avxi &a)
+{
+ printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+}
+
+CCL_NAMESPACE_END
+
+#endif
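
avxi mirrors the existing ssei type at 8 lanes: with __KERNEL_AVX2__ each operator is a single 256-bit intrinsic, otherwise it falls back to the two __m128i halves stored in the union, and the reductions repeatedly combine a vector with shuffled copies of itself until every lane holds the result. A Python sketch of that reduction pattern, with a list standing in for the 8 lanes:

def shuffle4(v, i0, i1, i2, i3):
    # The 4-index shuffle permutes within each 128-bit half independently.
    return [v[i0], v[i1], v[i2], v[i3], v[4 + i0], v[4 + i1], v[4 + i2], v[4 + i3]]

def swap_halves(v):
    # shuffle<1, 0>(v): exchange the low and high 128-bit halves.
    return v[4:] + v[:4]

def vreduce_min(v):
    v1 = [min(a, b) for a, b in zip(v, shuffle4(v, 1, 0, 3, 2))]    # pair-wise
    v2 = [min(a, b) for a, b in zip(v1, shuffle4(v1, 2, 3, 0, 1))]  # within halves
    return [min(a, b) for a, b in zip(v2, swap_halves(v2))]         # across halves

v = [7, 3, 9, 1, 8, 2, 6, 5]
assert vreduce_min(v)[0] == min(v)
print(vreduce_min(v))
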
diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h
index ca48758efcd..0021eec169b 100644
--- a/intern/cycles/util/util_hash.h
+++ b/intern/cycles/util/util_hash.h
@@ -312,6 +312,60 @@ ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw)
return c;
}
+# if defined(__KERNEL_AVX__)
+ccl_device_inline avxi hash_avxi(avxi kx)
+{
+ avxi a, b, c;
+ a = b = c = avxi(0xdeadbeef + (1 << 2) + 13);
+
+ a += kx;
+ final(a, b, c);
+
+ return c;
+}
+
+ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky)
+{
+ avxi a, b, c;
+ a = b = c = avxi(0xdeadbeef + (2 << 2) + 13);
+
+ b += ky;
+ a += kx;
+ final(a, b, c);
+
+ return c;
+}
+
+ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz)
+{
+ avxi a, b, c;
+ a = b = c = avxi(0xdeadbeef + (3 << 2) + 13);
+
+ c += kz;
+ b += ky;
+ a += kx;
+ final(a, b, c);
+
+ return c;
+}
+
+ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw)
+{
+ avxi a, b, c;
+ a = b = c = avxi(0xdeadbeef + (4 << 2) + 13);
+
+ a += kx;
+ b += ky;
+ c += kz;
+ mix(a, b, c);
+
+ a += kw;
+ final(a, b, c);
+
+ return c;
+}
+# endif
+
# undef rot
# undef final
# undef mix
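
The hash_avxi* functions are 8-lane versions of the existing hashes: seed a, b and c with 0xdeadbeef + (key_count << 2) + 13, add the keys, and run the mix/final rounds on all lanes at once. A scalar Python sketch of the 4-key flow, assuming the rot/mix/final macros are the usual Jenkins lookup3 rounds (an assumption; only the key schedule mirrors the diff above):

M = 0xffffffff

def rot(x, k):
    return ((x << k) | (x >> (32 - k))) & M

def mix(a, b, c):
    # Assumed lookup3-style mixing round.
    a = (a - c) & M; a ^= rot(c, 4);  c = (c + b) & M
    b = (b - a) & M; b ^= rot(a, 6);  a = (a + c) & M
    c = (c - b) & M; c ^= rot(b, 8);  b = (b + a) & M
    a = (a - c) & M; a ^= rot(c, 16); c = (c + b) & M
    b = (b - a) & M; b ^= rot(a, 19); a = (a + c) & M
    c = (c - b) & M; c ^= rot(b, 4);  b = (b + a) & M
    return a, b, c

def final(a, b, c):
    # Assumed lookup3-style finalization round.
    c ^= b; c = (c - rot(b, 14)) & M
    a ^= c; a = (a - rot(c, 11)) & M
    b ^= a; b = (b - rot(a, 25)) & M
    c ^= b; c = (c - rot(b, 16)) & M
    a ^= c; a = (a - rot(c, 4))  & M
    b ^= a; b = (b - rot(a, 14)) & M
    c ^= b; c = (c - rot(b, 24)) & M
    return a, b, c

def hash_uint4(kx, ky, kz, kw):
    a = b = c = (0xdeadbeef + (4 << 2) + 13) & M
    a = (a + kx) & M; b = (b + ky) & M; c = (c + kz) & M
    a, b, c = mix(a, b, c)   # mix after the first three keys...
    a = (a + kw) & M         # ...then fold in the fourth
    a, b, c = final(a, b, c)
    return c

print(hex(hash_uint4(1, 2, 3, 4)))
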
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index f49cfb4184d..de0e3c39f30 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -75,6 +75,28 @@ static struct FalseTy {
}
} False ccl_maybe_unused;
+static struct ZeroTy {
+ __forceinline operator float() const
+ {
+ return 0;
+ }
+ __forceinline operator int() const
+ {
+ return 0;
+ }
+} zero ccl_maybe_unused;
+
+static struct OneTy {
+ __forceinline operator float() const
+ {
+ return 1;
+ }
+ __forceinline operator int() const
+ {
+ return 1;
+ }
+} one ccl_maybe_unused;
+
static struct NegInfTy {
__forceinline operator float() const
{
@@ -97,6 +119,9 @@ static struct PosInfTy {
}
} inf ccl_maybe_unused, pos_inf ccl_maybe_unused;
+static struct StepTy {
+} step ccl_maybe_unused;
+
/* Intrinsics Functions */
# if defined(__BMI__) && defined(__GNUC__)
@@ -563,6 +588,13 @@ __forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
# endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
+/* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves.
+ * _mm256_castps256_ps128 generates no instructions so this is just as efficient. */
+# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+# undef _mm256_cvtss_f32
+# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
+# endif
+
# else /* __KERNEL_SSE2__ */
/* This section is for utility functions which operates on non-register data
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index f6535848480..a721595667d 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -158,6 +158,7 @@ CCL_NAMESPACE_END
# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
# include "util/util_avxb.h"
# include "util/util_avxf.h"
+# include "util/util_avxi.h"
# endif
#endif