author | Jacques Lucke <jacques@blender.org> | 2020-04-06 12:10:19 +0300
committer | Jacques Lucke <jacques@blender.org> | 2020-04-06 12:10:19 +0300
commit | 43f895a59247ea4058cb3f019cd4dabd9ad9b0e4 (patch)
tree | 51469fde8affa6b20f67d12a4602c5f109c1a204 /intern/cycles
parent | 52606afaa60462db45e783607255f56c06fd8d73 (diff)
parent | 480ff89bf7cfb1f9ffd5ce66fbd5c65288ef04c0 (diff)
Merge branch 'master' into functions
Diffstat (limited to 'intern/cycles')
-rw-r--r-- | intern/cycles/blender/addon/engine.py | 13
-rw-r--r-- | intern/cycles/blender/blender_sync.cpp | 14
-rw-r--r-- | intern/cycles/kernel/svm/svm_noise.h | 261
-rw-r--r-- | intern/cycles/render/buffers.cpp | 89
-rw-r--r-- | intern/cycles/render/film.cpp | 3
-rw-r--r-- | intern/cycles/render/film.h | 1
-rw-r--r-- | intern/cycles/util/util_avxb.h | 28
-rw-r--r-- | intern/cycles/util/util_avxf.h | 68
-rw-r--r-- | intern/cycles/util/util_avxi.h | 745
-rw-r--r-- | intern/cycles/util/util_hash.h | 54
-rw-r--r-- | intern/cycles/util/util_simd.h | 32
-rw-r--r-- | intern/cycles/util/util_types.h | 1
12 files changed, 1175 insertions(+), 134 deletions(-)
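The first two files in the diff below change how Cycles turns the user-facing `pass_crypto_depth` (the number of Cryptomatte ID/weight pairs) into render passes: each RGBA pass stores two pairs, so the pass count is now computed once up front instead of stepping the loop by two and halving the index. A minimal standalone sketch of that mapping, with `divide_up()` re-implemented here for illustration (this is not the Cycles source):

```cpp
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

/* Same rounding-up division the diff uses via Cycles' divide_up(). */
static int divide_up(int x, int y)
{
  return (x + y - 1) / y;
}

/* Build Cryptomatte pass names for a given user-facing depth (number of
 * ID/weight pairs). Each RGBA pass stores two pairs. */
static std::vector<std::string> cryptomatte_pass_names(const char *prefix, int pass_crypto_depth)
{
  const int crypto_depth = divide_up(std::min(16, pass_crypto_depth), 2);
  std::vector<std::string> names;
  for (int i = 0; i < crypto_depth; i++) {
    char buf[64];
    snprintf(buf, sizeof(buf), "%s%02d", prefix, i);
    names.push_back(buf);
  }
  return names;
}

int main()
{
  /* A depth of 3 pairs yields two passes: CryptoObject00 and CryptoObject01. */
  for (const std::string &name : cryptomatte_pass_names("CryptoObject", 3)) {
    printf("%s\n", name.c_str());
  }
  return 0;
}
```

With `pass_crypto_depth = 3` this produces `CryptoObject00` and `CryptoObject01`, matching the naming the engine.py and blender_sync.cpp hunks generate.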
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 2b872bb5c39..a1b063430f5 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -260,15 +260,16 @@ def list_render_passes(srl): if crl.use_pass_volume_indirect: yield ("VolumeInd", "RGB", 'COLOR') # Cryptomatte passes. + crypto_depth = (crl.pass_crypto_depth + 1) // 2 if crl.use_pass_crypto_object: - for i in range(0, crl.pass_crypto_depth, 2): - yield ("CryptoObject" + '{:02d}'.format(i//2), "RGBA", 'COLOR') + for i in range(0, crypto_depth): + yield ("CryptoObject" + '{:02d}'.format(i), "RGBA", 'COLOR') if crl.use_pass_crypto_material: - for i in range(0, crl.pass_crypto_depth, 2): - yield ("CryptoMaterial" + '{:02d}'.format(i//2), "RGBA", 'COLOR') + for i in range(0, crypto_depth): + yield ("CryptoMaterial" + '{:02d}'.format(i), "RGBA", 'COLOR') if srl.cycles.use_pass_crypto_asset: - for i in range(0, srl.cycles.pass_crypto_depth, 2): - yield ("CryptoAsset" + '{:02d}'.format(i//2), "RGBA", 'COLOR') + for i in range(0, crypto_depth): + yield ("CryptoAsset" + '{:02d}'.format(i), "RGBA", 'COLOR') # Denoising passes. if crl.use_denoising or crl.denoising_store_passes: diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 28a737c3341..9e95cdb3f20 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -633,12 +633,12 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, /* Cryptomatte stores two ID/weight pairs per RGBA layer. * User facing parameter is the number of pairs. */ - int crypto_depth = min(16, get_int(crp, "pass_crypto_depth")); + int crypto_depth = divide_up(min(16, get_int(crp, "pass_crypto_depth")), 2); scene->film->cryptomatte_depth = crypto_depth; scene->film->cryptomatte_passes = CRYPT_NONE; if (get_boolean(crp, "use_pass_crypto_object")) { - for (int i = 0; i < crypto_depth; i += 2) { - string passname = cryptomatte_prefix + string_printf("Object%02d", i / 2); + for (int i = 0; i < crypto_depth; i++) { + string passname = cryptomatte_prefix + string_printf("Object%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); } @@ -646,8 +646,8 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, CRYPT_OBJECT); } if (get_boolean(crp, "use_pass_crypto_material")) { - for (int i = 0; i < crypto_depth; i += 2) { - string passname = cryptomatte_prefix + string_printf("Material%02d", i / 2); + for (int i = 0; i < crypto_depth; i++) { + string passname = cryptomatte_prefix + string_printf("Material%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); } @@ -655,8 +655,8 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, CRYPT_MATERIAL); } if (get_boolean(crp, "use_pass_crypto_asset")) { - for (int i = 0; i < crypto_depth; i += 2) { - string passname = cryptomatte_prefix + string_printf("Asset%02d", i / 2); + for (int i = 0; i < crypto_depth; i++) { + string passname = cryptomatte_prefix + string_printf("Asset%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); } diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h index a16b226d8de..914ef2089a9 100644 --- a/intern/cycles/kernel/svm/svm_noise.h +++ 
b/intern/cycles/kernel/svm/svm_noise.h @@ -65,7 +65,7 @@ ccl_device_noinline_cpu float perlin_1d(float x) * supported, we do a standard implementation, but if it is supported, we * do an implementation using SSE intrinsics. */ -#ifndef __KERNEL_SSE2__ +#if !defined(__KERNEL_SSE2__) /* ** Standard Implementation ** */ @@ -266,7 +266,7 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) return r; } -#else +#else /* SSE is supported. */ /* ** SSE Implementation ** */ @@ -300,6 +300,57 @@ ccl_device_inline ssef bi_mix(ssef p, ssef f) return mix(g, shuffle<1>(g), shuffle<1>(f)); } +ccl_device_inline ssef fade(const ssef &t) +{ + ssef a = madd(t, 6.0f, -15.0f); + ssef b = madd(t, a, 10.0f); + return (t * t) * (t * b); +} + +/* Negate val if the nth bit of h is 1. */ +# define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n)))) + +ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y) +{ + ssei h = hash & 7; + ssef u = select(h < 4, x, y); + ssef v = 2.0f * select(h < 4, y, x); + return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1); +} + +/* We use SSE to compute and interpolate 4 gradients at once: + * + * Point Offset from v0 + * v0 (0, 0) + * v1 (0, 1) + * v2 (1, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(V, V + 1)) + * v3 (1, 1) ^ + * | |__________| (0, 0, 1, 1) = shuffle<0, 0, 0, 0>(V, V + 1) + * | ^ + * |__________________________| + * + */ +ccl_device_noinline float perlin_2d(float x, float y) +{ + ssei XY; + ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY); + ssef uv = fade(fxy); + + ssei XY1 = XY + 1; + ssei X = shuffle<0, 0, 0, 0>(XY, XY1); + ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1)); + + ssei h = hash_ssei2(X, Y); + + ssef fxy1 = fxy - 1.0f; + ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1); + ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1)); + + ssef g = grad(h, fx, fy); + + return extract<0>(bi_mix(g, uv)); +} + /* SSE Trilinear Interpolation: * * The function takes three ssef inputs: @@ -340,34 +391,12 @@ ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f) return mix(g, shuffle<1>(g), shuffle<2>(f)); } -/* SSE Quadrilinear Interpolation: - * - * Quadrilinear interpolation is as simple as a linear interpolation - * between two trilinear interpolations. - * +/* 3D and 4D noise can be accelerated using AVX, so we first check if AVX + * is supported, that is, if __KERNEL_AVX__ is defined. If it is not + * supported, we do an SSE implementation, but if it is supported, + * we do an implementation using AVX intrinsics. */ -ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f) -{ - return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f)); -} - -ccl_device_inline ssef fade(const ssef &t) -{ - ssef a = madd(t, 6.0f, -15.0f); - ssef b = madd(t, a, 10.0f); - return (t * t) * (t * b); -} - -/* Negate val if the nth bit of h is 1. 
*/ -# define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n)))) - -ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y) -{ - ssei h = hash & 7; - ssef u = select(h < 4, x, y); - ssef v = 2.0f * select(h < 4, y, x); - return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1); -} +# if !defined(__KERNEL_AVX__) ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z) { @@ -388,37 +417,15 @@ grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef & return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2); } -/* We use SSE to compute and interpolate 4 gradients at once: +/* SSE Quadrilinear Interpolation: * - * Point Offset from v0 - * v0 (0, 0) - * v1 (0, 1) - * v2 (1, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(V, V + 1)) - * v3 (1, 1) ^ - * | |__________| (0, 0, 1, 1) = shuffle<0, 0, 0, 0>(V, V + 1) - * | ^ - * |__________________________| + * Quadrilinear interpolation is as simple as a linear interpolation + * between two trilinear interpolations. * */ -ccl_device_noinline float perlin_2d(float x, float y) +ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f) { - ssei XY; - ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY); - ssef uv = fade(fxy); - - ssei XY1 = XY + 1; - ssei X = shuffle<0, 0, 0, 0>(XY, XY1); - ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1)); - - ssei h = hash_ssei2(X, Y); - - ssef fxy1 = fxy - 1.0f; - ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1); - ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1)); - - ssef g = grad(h, fx, fy); - - return extract<0>(bi_mix(g, uv)); + return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f)); } /* We use SSE to compute and interpolate 4 gradients at once. Since we have 8 @@ -522,6 +529,148 @@ ccl_device_noinline float perlin_4d(float x, float y, float z, float w) return extract<0>(quad_mix(g1, g2, g3, g4, uvws)); } + +# else /* AVX is supported. */ + +/* AVX Implementation */ + +ccl_device_inline avxf grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z) +{ + avxi h = hash & 15; + avxf u = select(h < 8, x, y); + avxf vt = select((h == 12) | (h == 14), x, z); + avxf v = select(h < 4, y, vt); + return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1); +} + +ccl_device_inline avxf +grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &w) +{ + avxi h = hash & 31; + avxf u = select(h < 24, x, y); + avxf v = select(h < 16, y, z); + avxf s = select(h < 8, z, w); + return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2); +} + +/* SSE Quadrilinear Interpolation: + * + * The interpolation is done in two steps: + * 1. Interpolate p and q along the w axis to get s. + * 2. Trilinearly interpolate (s0, s1, s2, s3) and (s4, s5, s6, s7) to get the final + * value. (s0, s1, s2, s3) and (s4, s5, s6, s7) are generated by extracting the + * low and high ssef from s. + * + */ +ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f) +{ + ssef fv = shuffle<3>(f); + avxf s = mix(p, q, avxf(fv, fv)); + return tri_mix(low(s), high(s), f); +} + +/* We use AVX to compute and interpolate 8 gradients at once. + * + * Point Offset from v0 + * v0 (0, 0, 0) + * v1 (0, 0, 1) The full avx type is computed by inserting the following + * v2 (0, 1, 0) sse types into both the low and high parts of the avx. 
+ * v3 (0, 1, 1) + * v4 (1, 0, 0) + * v5 (1, 0, 1) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(V, V + 1)) + * v6 (1, 1, 0) ^ + * v7 (1, 1, 1) | + * | |__________| (0, 0, 1, 1) = shuffle<1, 1, 1, 1>(V, V + 1) + * | ^ + * |__________________________| + * + */ +ccl_device_noinline float perlin_3d(float x, float y, float z) +{ + ssei XYZ; + ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ); + ssef uvw = fade(fxyz); + + ssei XYZ1 = XYZ + 1; + ssei X = shuffle<0>(XYZ); + ssei X1 = shuffle<0>(XYZ1); + ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1); + ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1)); + + avxi h = hash_avxi3(avxi(X, X1), avxi(Y, Y), avxi(Z, Z)); + + ssef fxyz1 = fxyz - 1.0f; + ssef fx = shuffle<0>(fxyz); + ssef fx1 = shuffle<0>(fxyz1); + ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1); + ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1)); + + avxf g = grad(h, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz)); + + return extract<0>(tri_mix(low(g), high(g), uvw)); +} + +/* We use AVX to compute and interpolate 8 gradients at once. Since we have 16 + * gradients in 4D, we need to compute two sets of gradients at the points: + * + * Point Offset from v0 + * v0 (0, 0, 0, 0) + * v1 (0, 0, 1, 0) The full avx type is computed by inserting the following + * v2 (0, 1, 0, 0) sse types into both the low and high parts of the avx. + * v3 (0, 1, 1, 0) + * v4 (1, 0, 0, 0) + * v5 (1, 0, 1, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(V, V + 1)) + * v6 (1, 1, 0, 0) ^ + * v7 (1, 1, 1, 0) | + * | |________| (0, 0, 1, 1) = shuffle<1, 1, 1, 1>(V, V + 1) + * | ^ + * |_______________________| + * + * Point Offset from v0 + * v8 (0, 0, 0, 1) + * v9 (0, 0, 1, 1) + * v10 (0, 1, 0, 1) + * v11 (0, 1, 1, 1) + * v12 (1, 0, 0, 1) + * v13 (1, 0, 1, 1) + * v14 (1, 1, 0, 1) + * v15 (1, 1, 1, 1) + * + */ +ccl_device_noinline float perlin_4d(float x, float y, float z, float w) +{ + ssei XYZW; + ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW); + ssef uvws = fade(fxyzw); + + ssei XYZW1 = XYZW + 1; + ssei X = shuffle<0>(XYZW); + ssei X1 = shuffle<0>(XYZW1); + ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1); + ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1)); + ssei W = shuffle<3>(XYZW); + ssei W1 = shuffle<3>(XYZW1); + + avxi h1 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W, W)); + avxi h2 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W1, W1)); + + ssef fxyzw1 = fxyzw - 1.0f; + ssef fx = shuffle<0>(fxyzw); + ssef fx1 = shuffle<0>(fxyzw1); + ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1); + ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1)); + ssef fw = shuffle<3>(fxyzw); + ssef fw1 = shuffle<3>(fxyzw1); + + avxf g1 = grad(h1, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw, fw)); + avxf g2 = grad(h2, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw1, fw1)); + + return extract<0>(quad_mix(g1, g2, uvws)); +} +# endif + +# undef negate_if_nth_bit + #endif /* Remap the output of noise to a predictable range [-1, 1]. 
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index 2d89fb9ffba..22db8e875dc 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -165,6 +165,35 @@ bool RenderBuffers::copy_from_device() return true; } +static const float *get_sample_count_pass(const vector<Pass> &passes, device_vector<float> &buffer) +{ + int sample_offset = 0; + + for (const Pass &pass : passes) { + if (pass.type != PASS_SAMPLE_COUNT) { + sample_offset += pass.components; + } + else { + return buffer.data() + sample_offset; + } + } + + return NULL; +} + +static float get_pixel_pass_scale(const float rcp_sample, + const float *sample_count, + const int i, + const int pass_stride) +{ + if (sample_count) { + return 1.0f / fabsf(sample_count[i * pass_stride]); + } + else { + return rcp_sample; + } +} + bool RenderBuffers::get_denoising_pass_rect( int type, float exposure, int sample, int components, float *pixels) { @@ -260,22 +289,7 @@ bool RenderBuffers::get_pass_rect( return false; } - float *sample_count = NULL; - if (name == "Combined") { - int sample_offset = 0; - for (size_t j = 0; j < params.passes.size(); j++) { - Pass &pass = params.passes[j]; - if (pass.type != PASS_SAMPLE_COUNT) { - sample_offset += pass.components; - continue; - } - else { - sample_count = buffer.data() + sample_offset; - break; - } - } - } - + const float *sample_count = get_sample_count_pass(params.passes, buffer); int pass_offset = 0; for (size_t j = 0; j < params.passes.size(); j++) { @@ -293,8 +307,8 @@ bool RenderBuffers::get_pass_rect( float *in = buffer.data() + pass_offset; int pass_stride = params.get_passes_size(); - float scale = (pass.filter) ? 1.0f / (float)sample : 1.0f; - float scale_exposure = (pass.exposure) ? scale * exposure : scale; + const float rcp_sample = 1.0f / (float)sample; + const float pass_exposure = (pass.exposure) ? exposure : 1.0f; int size = params.width * params.height; @@ -312,28 +326,36 @@ bool RenderBuffers::get_pass_rect( if (type == PASS_DEPTH) { for (int i = 0; i < size; i++, in += pass_stride, pixels++) { float f = *in; - pixels[0] = (f == 0.0f) ? 1e10f : f * scale_exposure; + pixels[0] = (f == 0.0f) ? 
1e10f : f; + } + } + else if (type == PASS_OBJECT_ID || type == PASS_MATERIAL_ID) { + for (int i = 0; i < size; i++, in += pass_stride, pixels++) { + pixels[0] = *in; } } else if (type == PASS_MIST) { for (int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = saturate(f * scale_exposure); + const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride); + const float f = *in; + pixels[0] = saturate(f * scale); } } #ifdef WITH_CYCLES_DEBUG else if (type == PASS_BVH_TRAVERSED_NODES || type == PASS_BVH_TRAVERSED_INSTANCES || type == PASS_BVH_INTERSECTIONS || type == PASS_RAY_BOUNCES) { for (int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; + const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride); + const float f = *in; pixels[0] = f * scale; } } #endif else { for (int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = f * scale_exposure; + const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride); + const float f = *in; + pixels[0] = f * scale * pass_exposure; } } } @@ -367,7 +389,7 @@ bool RenderBuffers::get_pass_rect( float3 f = make_float3(in[0], in[1], in[2]); float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]); - f = safe_divide_even_color(f * exposure, f_divide); + f = safe_divide_even_color(f * pass_exposure, f_divide); pixels[0] = f.x; pixels[1] = f.y; @@ -377,7 +399,9 @@ bool RenderBuffers::get_pass_rect( else { /* RGB/vector */ for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) { - float3 f = make_float3(in[0], in[1], in[2]); + const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride); + const float scale_exposure = scale * pass_exposure; + const float3 f = make_float3(in[0], in[1], in[2]); pixels[0] = f.x * scale_exposure; pixels[1] = f.y * scale_exposure; @@ -425,7 +449,9 @@ bool RenderBuffers::get_pass_rect( } else if (type == PASS_CRYPTOMATTE) { for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); + const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride); + const float4 f = make_float4(in[0], in[1], in[2], in[3]); + /* x and z contain integer IDs, don't rescale them. y and w contain matte weights, they get scaled. */ pixels[0] = f.x; @@ -436,12 +462,9 @@ bool RenderBuffers::get_pass_rect( } else { for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) { - if (sample_count && sample_count[i * pass_stride] < 0.0f) { - scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f; - scale_exposure = (pass.exposure) ? 
scale * exposure : scale; - } - - float4 f = make_float4(in[0], in[1], in[2], in[3]); + const float scale = get_pixel_pass_scale(rcp_sample, sample_count, i, pass_stride); + const float scale_exposure = scale * pass_exposure; + const float4 f = make_float4(in[0], in[1], in[2], in[3]); pixels[0] = f.x * scale_exposure; pixels[1] = f.y * scale_exposure; diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index baf02901123..c29810d1494 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -76,7 +76,6 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name) Pass pass; pass.type = type; - pass.filter = true; pass.exposure = false; pass.divide_type = PASS_NONE; if (name) { @@ -93,7 +92,6 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name) break; case PASS_DEPTH: pass.components = 1; - pass.filter = false; break; case PASS_MIST: pass.components = 1; @@ -114,7 +112,6 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name) case PASS_OBJECT_ID: case PASS_MATERIAL_ID: pass.components = 1; - pass.filter = false; break; case PASS_EMISSION: diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h index aae8fb404b0..0fe4fe89d5e 100644 --- a/intern/cycles/render/film.h +++ b/intern/cycles/render/film.h @@ -42,7 +42,6 @@ class Pass { public: PassType type; int components; - bool filter; bool exposure; PassType divide_type; string name; diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h index 54dd8068eca..34fafd188de 100644 --- a/intern/cycles/util/util_avxb.h +++ b/intern/cycles/util/util_avxb.h @@ -16,7 +16,7 @@ */ #ifndef __UTIL_AVXB_H__ -# define __UTIL_AVXB_H__ +#define __UTIL_AVXB_H__ CCL_NAMESPACE_BEGIN @@ -53,6 +53,10 @@ struct avxb { __forceinline avxb(const __m256 input) : m256(input) { } + __forceinline avxb(const __m128 &a, const __m128 &b) + : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1)) + { + } __forceinline operator const __m256 &(void)const { return m256; @@ -146,9 +150,9 @@ __forceinline const avxb operator!=(const avxb &a, const avxb &b) } __forceinline const avxb operator==(const avxb &a, const avxb &b) { -# ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX2__ return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); -# else +#else __m128i a_lo = _mm_castps_si128(_mm256_extractf128_ps(a, 0)); __m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1)); __m128i b_lo = _mm_castps_si128(_mm256_extractf128_ps(b, 0)); @@ -157,16 +161,16 @@ __forceinline const avxb operator==(const avxb &a, const avxb &b) __m128i c_hi = _mm_cmpeq_epi32(a_hi, b_hi); __m256i result = _mm256_insertf128_si256(_mm256_castsi128_si256(c_lo), c_hi, 1); return _mm256_castsi256_ps(result); -# endif +#endif } __forceinline const avxb select(const avxb &m, const avxb &t, const avxb &f) { -# if defined(__KERNEL_SSE41__) +#if defined(__KERNEL_SSE41__) return _mm256_blendv_ps(f, t, m); -# else +#else return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f)); -# endif +#endif } //////////////////////////////////////////////////////////////////////////////// @@ -186,18 +190,18 @@ __forceinline const avxb unpackhi(const avxb &a, const avxb &b) /// Reduction Operations //////////////////////////////////////////////////////////////////////////////// -# if defined(__KERNEL_SSE41__) +#if defined(__KERNEL_SSE41__) __forceinline size_t popcnt(const avxb &a) { return __popcnt(_mm256_movemask_ps(a)); } -# else +#else __forceinline size_t popcnt(const avxb &a) { return bool(a[0]) + 
bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) + bool(a[7]); } -# endif +#endif __forceinline bool reduce_and(const avxb &a) { @@ -234,8 +238,6 @@ ccl_device_inline void print_avxb(const char *label, const avxb &a) printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]); } -#endif - CCL_NAMESPACE_END -//#endif +#endif diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h index 156607e65fb..1fb3ded422f 100644 --- a/intern/cycles/util/util_avxf.h +++ b/intern/cycles/util/util_avxf.h @@ -15,7 +15,7 @@ */ #ifndef __UTIL_AVXF_H__ -# define __UTIL_AVXF_H__ +#define __UTIL_AVXF_H__ CCL_NAMESPACE_BEGIN @@ -140,6 +140,11 @@ __forceinline void dot3(const avxf &a, const avxf &b, float &den, float &den2) /// Unary Operators //////////////////////////////////////////////////////////////////////////////// +__forceinline const avxf cast(const __m256i &a) +{ + return _mm256_castsi256_ps(a); +} + __forceinline const avxf mm256_sqrt(const avxf &a) { return _mm256_sqrt_ps(a.m256); @@ -259,16 +264,35 @@ template<size_t i0> __forceinline const avxf shuffle(const avxf &a) return shuffle<i0>(a, a); } +template<size_t i> __forceinline float extract(const avxf &a) +{ + __m256 b = shuffle<i, i, i, i>(a).m256; + return _mm256_cvtss_f32(b); +} +template<> __forceinline float extract<0>(const avxf &a) +{ + return _mm256_cvtss_f32(a.m256); +} + +__forceinline ssef low(const avxf &a) +{ + return _mm256_extractf128_ps(a.m256, 0); +} +__forceinline ssef high(const avxf &a) +{ + return _mm256_extractf128_ps(a.m256, 1); +} + template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> __forceinline const avxf permute(const avxf &a) { -# ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX2__ return _mm256_permutevar8x32_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0)); -# else +#else float temp[8]; _mm256_storeu_ps((float *)&temp, a); return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]); -# endif +#endif } template<int S0, int S1, int S2, int S3, int S4, int S5, int S6, int S7> @@ -309,39 +333,51 @@ __forceinline avxf mini(const avxf &a, const avxf &b) //////////////////////////////////////////////////////////////////////////////// __forceinline const avxf madd(const avxf &a, const avxf &b, const avxf &c) { -# ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX2__ return _mm256_fmadd_ps(a, b, c); -# else +#else return c + (a * b); -# endif +#endif } __forceinline const avxf nmadd(const avxf &a, const avxf &b, const avxf &c) { -# ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX2__ return _mm256_fnmadd_ps(a, b, c); -# else +#else return c - (a * b); -# endif +#endif } __forceinline const avxf msub(const avxf &a, const avxf &b, const avxf &c) { -# ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX2__ return _mm256_fmsub_ps(a, b, c); -# else +#else return (a * b) - c; -# endif +#endif } //////////////////////////////////////////////////////////////////////////////// -/// Comparison Operators +/// Comparison Operators + Select //////////////////////////////////////////////////////////////////////////////// __forceinline const avxb operator<=(const avxf &a, const avxf &b) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS); } -#endif +__forceinline const avxf select(const avxb &m, const avxf &t, const avxf &f) +{ + return _mm256_blendv_ps(f, t, m); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Common Functions 
+//////////////////////////////////////////////////////////////////////////////// + +__forceinline avxf mix(const avxf &a, const avxf &b, const avxf &t) +{ + return madd(t, b, (avxf(1.0f) - t) * a); +} #ifndef _mm256_set_m128 # define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \ @@ -352,3 +388,5 @@ __forceinline const avxb operator<=(const avxf &a, const avxf &b) _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr)) CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/util/util_avxi.h b/intern/cycles/util/util_avxi.h new file mode 100644 index 00000000000..e658a4f848f --- /dev/null +++ b/intern/cycles/util/util_avxi.h @@ -0,0 +1,745 @@ +/* + * Copyright 2009-2013 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_AVXI_H__ +#define __UTIL_AVXI_H__ + +CCL_NAMESPACE_BEGIN + +struct avxb; + +struct avxi { + typedef avxb Mask; // mask type for us + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i m256; +#if !defined(__KERNEL_AVX2__) + struct { + __m128i l, h; + }; +#endif + int32_t v[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline avxi() + { + } + __forceinline avxi(const avxi &a) + { + m256 = a.m256; + } + __forceinline avxi &operator=(const avxi &a) + { + m256 = a.m256; + return *this; + } + + __forceinline avxi(const __m256i a) : m256(a) + { + } + __forceinline operator const __m256i &(void)const + { + return m256; + } + __forceinline operator __m256i &(void) + { + return m256; + } + + __forceinline explicit avxi(const ssei &a) + : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), a, 1)) + { + } + __forceinline avxi(const ssei &a, const ssei &b) + : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1)) + { + } +#if defined(__KERNEL_AVX2__) + __forceinline avxi(const __m128i &a, const __m128i &b) + : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1)) + { + } +#else + __forceinline avxi(const __m128i &a, const __m128i &b) : l(a), h(b) + { + } +#endif + __forceinline explicit avxi(const int32_t *const a) + : m256(_mm256_castps_si256(_mm256_loadu_ps((const float *)a))) + { + } + __forceinline avxi(int32_t a) : m256(_mm256_set1_epi32(a)) + { + } + __forceinline avxi(int32_t a, int32_t b) : m256(_mm256_set_epi32(b, a, b, a, b, a, b, a)) + { + } + __forceinline avxi(int32_t a, int32_t b, int32_t c, int32_t d) + : m256(_mm256_set_epi32(d, c, b, a, d, c, b, a)) + { + } + __forceinline avxi( + int32_t a, int32_t b, int32_t c, int32_t d, int32_t e, int32_t f, int32_t g, int32_t h) + : m256(_mm256_set_epi32(h, g, f, e, d, c, b, a)) + { + } + + __forceinline explicit avxi(const __m256 a) : m256(_mm256_cvtps_epi32(a)) + { + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline avxi(ZeroTy) : m256(_mm256_setzero_si256()) + { + } +#if defined(__KERNEL_AVX2__) + __forceinline avxi(OneTy) : m256(_mm256_set1_epi32(1)) + { + } + __forceinline avxi(PosInfTy) : m256(_mm256_set1_epi32(pos_inf)) + { + } + __forceinline avxi(NegInfTy) : m256(_mm256_set1_epi32(neg_inf)) + { + } +#else + __forceinline avxi(OneTy) : m256(_mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1)) + { + } + __forceinline avxi(PosInfTy) + : m256(_mm256_set_epi32( + pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf)) + { + } + __forceinline avxi(NegInfTy) + : m256(_mm256_set_epi32( + neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf)) + { + } +#endif + __forceinline avxi(StepTy) : m256(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) + { + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int32_t &operator[](const size_t i) const + { + assert(i < 8); + return v[i]; + } + __forceinline int32_t &operator[](const size_t i) + { + assert(i < 8); + return v[i]; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Unary Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const avxi cast(const __m256 &a) +{ + return _mm256_castps_si256(a); +} +__forceinline const avxi operator+(const avxi &a) +{ + return a; +} +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator-(const avxi &a) +{ + return _mm256_sub_epi32(_mm256_setzero_si256(), a.m256); +} +__forceinline const avxi abs(const avxi &a) +{ + return _mm256_abs_epi32(a.m256); +} +#else +__forceinline const avxi operator-(const avxi &a) +{ + return avxi(_mm_sub_epi32(_mm_setzero_si128(), a.l), _mm_sub_epi32(_mm_setzero_si128(), a.h)); +} +__forceinline const avxi abs(const avxi &a) +{ + return avxi(_mm_abs_epi32(a.l), _mm_abs_epi32(a.h)); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Binary Operators +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator+(const avxi &a, const avxi &b) +{ + return _mm256_add_epi32(a.m256, b.m256); +} +#else +__forceinline const avxi operator+(const avxi &a, const avxi &b) +{ + return avxi(_mm_add_epi32(a.l, b.l), _mm_add_epi32(a.h, b.h)); +} +#endif +__forceinline const avxi operator+(const avxi &a, const int32_t b) +{ + return a + avxi(b); +} +__forceinline const avxi operator+(const int32_t a, const avxi &b) +{ + return avxi(a) + b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator-(const avxi &a, const avxi &b) +{ + return _mm256_sub_epi32(a.m256, b.m256); +} +#else +__forceinline const avxi operator-(const avxi &a, const avxi &b) +{ + return avxi(_mm_sub_epi32(a.l, b.l), _mm_sub_epi32(a.h, b.h)); +} +#endif +__forceinline const avxi operator-(const avxi &a, const int32_t b) +{ + return a - avxi(b); +} +__forceinline const avxi operator-(const int32_t a, const avxi &b) +{ + return avxi(a) - b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator*(const avxi &a, const avxi &b) +{ + return _mm256_mullo_epi32(a.m256, b.m256); +} +#else +__forceinline const avxi operator*(const avxi &a, const avxi &b) +{ + return avxi(_mm_mullo_epi32(a.l, b.l), 
_mm_mullo_epi32(a.h, b.h)); +} +#endif +__forceinline const avxi operator*(const avxi &a, const int32_t b) +{ + return a * avxi(b); +} +__forceinline const avxi operator*(const int32_t a, const avxi &b) +{ + return avxi(a) * b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator&(const avxi &a, const avxi &b) +{ + return _mm256_and_si256(a.m256, b.m256); +} +#else +__forceinline const avxi operator&(const avxi &a, const avxi &b) +{ + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); +} +#endif +__forceinline const avxi operator&(const avxi &a, const int32_t b) +{ + return a & avxi(b); +} +__forceinline const avxi operator&(const int32_t a, const avxi &b) +{ + return avxi(a) & b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator|(const avxi &a, const avxi &b) +{ + return _mm256_or_si256(a.m256, b.m256); +} +#else +__forceinline const avxi operator|(const avxi &a, const avxi &b) +{ + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); +} +#endif +__forceinline const avxi operator|(const avxi &a, const int32_t b) +{ + return a | avxi(b); +} +__forceinline const avxi operator|(const int32_t a, const avxi &b) +{ + return avxi(a) | b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator^(const avxi &a, const avxi &b) +{ + return _mm256_xor_si256(a.m256, b.m256); +} +#else +__forceinline const avxi operator^(const avxi &a, const avxi &b) +{ + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); +} +#endif +__forceinline const avxi operator^(const avxi &a, const int32_t b) +{ + return a ^ avxi(b); +} +__forceinline const avxi operator^(const int32_t a, const avxi &b) +{ + return avxi(a) ^ b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi operator<<(const avxi &a, const int32_t n) +{ + return _mm256_slli_epi32(a.m256, n); +} +__forceinline const avxi operator>>(const avxi &a, const int32_t n) +{ + return _mm256_srai_epi32(a.m256, n); +} + +__forceinline const avxi sra(const avxi &a, const int32_t b) +{ + return _mm256_srai_epi32(a.m256, b); +} +__forceinline const avxi srl(const avxi &a, const int32_t b) +{ + return _mm256_srli_epi32(a.m256, b); +} +#else +__forceinline const avxi operator<<(const avxi &a, const int32_t n) +{ + return avxi(_mm_slli_epi32(a.l, n), _mm_slli_epi32(a.h, n)); +} +__forceinline const avxi operator>>(const avxi &a, const int32_t n) +{ + return avxi(_mm_srai_epi32(a.l, n), _mm_srai_epi32(a.h, n)); +} + +__forceinline const avxi sra(const avxi &a, const int32_t b) +{ + return avxi(_mm_srai_epi32(a.l, b), _mm_srai_epi32(a.h, b)); +} +__forceinline const avxi srl(const avxi &a, const int32_t b) +{ + return avxi(_mm_srli_epi32(a.l, b), _mm_srli_epi32(a.h, b)); +} +#endif + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi min(const avxi &a, const avxi &b) +{ + return _mm256_min_epi32(a.m256, b.m256); +} +#else +__forceinline const avxi min(const avxi &a, const avxi &b) +{ + return avxi(_mm_min_epi32(a.l, b.l), _mm_min_epi32(a.h, b.h)); +} +#endif +__forceinline const avxi min(const avxi &a, const int32_t b) +{ + return min(a, avxi(b)); +} +__forceinline const avxi min(const int32_t a, const avxi &b) +{ + return min(avxi(a), b); +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxi max(const avxi &a, const avxi &b) +{ + return _mm256_max_epi32(a.m256, b.m256); +} +#else +__forceinline const avxi max(const avxi &a, const avxi &b) +{ + return avxi(_mm_max_epi32(a.l, b.l), 
_mm_max_epi32(a.h, b.h)); +} +#endif +__forceinline const avxi max(const avxi &a, const int32_t b) +{ + return max(a, avxi(b)); +} +__forceinline const avxi max(const int32_t a, const avxi &b) +{ + return max(avxi(a), b); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Assignment Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline avxi &operator+=(avxi &a, const avxi &b) +{ + return a = a + b; +} +__forceinline avxi &operator+=(avxi &a, const int32_t b) +{ + return a = a + b; +} + +__forceinline avxi &operator-=(avxi &a, const avxi &b) +{ + return a = a - b; +} +__forceinline avxi &operator-=(avxi &a, const int32_t b) +{ + return a = a - b; +} + +__forceinline avxi &operator*=(avxi &a, const avxi &b) +{ + return a = a * b; +} +__forceinline avxi &operator*=(avxi &a, const int32_t b) +{ + return a = a * b; +} + +__forceinline avxi &operator&=(avxi &a, const avxi &b) +{ + return a = a & b; +} +__forceinline avxi &operator&=(avxi &a, const int32_t b) +{ + return a = a & b; +} + +__forceinline avxi &operator|=(avxi &a, const avxi &b) +{ + return a = a | b; +} +__forceinline avxi &operator|=(avxi &a, const int32_t b) +{ + return a = a | b; +} + +__forceinline avxi &operator^=(avxi &a, const avxi &b) +{ + return a = a ^ b; +} +__forceinline avxi &operator^=(avxi &a, const int32_t b) +{ + return a = a ^ b; +} + +__forceinline avxi &operator<<=(avxi &a, const int32_t b) +{ + return a = a << b; +} +__forceinline avxi &operator>>=(avxi &a, const int32_t b) +{ + return a = a >> b; +} + +//////////////////////////////////////////////////////////////////////////////// +/// Comparison Operators + Select +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__KERNEL_AVX2__) +__forceinline const avxb operator==(const avxi &a, const avxi &b) +{ + return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a.m256, b.m256)); +} +#else +__forceinline const avxb operator==(const avxi &a, const avxi &b) +{ + return avxb(_mm_castsi128_ps(_mm_cmpeq_epi32(a.l, b.l)), + _mm_castsi128_ps(_mm_cmpeq_epi32(a.h, b.h))); +} +#endif +__forceinline const avxb operator==(const avxi &a, const int32_t b) +{ + return a == avxi(b); +} +__forceinline const avxb operator==(const int32_t a, const avxi &b) +{ + return avxi(a) == b; +} + +__forceinline const avxb operator!=(const avxi &a, const avxi &b) +{ + return !(a == b); +} +__forceinline const avxb operator!=(const avxi &a, const int32_t b) +{ + return a != avxi(b); +} +__forceinline const avxb operator!=(const int32_t a, const avxi &b) +{ + return avxi(a) != b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const avxb operator<(const avxi &a, const avxi &b) +{ + return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b.m256, a.m256)); +} +#else +__forceinline const avxb operator<(const avxi &a, const avxi &b) +{ + return avxb(_mm_castsi128_ps(_mm_cmplt_epi32(a.l, b.l)), + _mm_castsi128_ps(_mm_cmplt_epi32(a.h, b.h))); +} +#endif +__forceinline const avxb operator<(const avxi &a, const int32_t b) +{ + return a < avxi(b); +} +__forceinline const avxb operator<(const int32_t a, const avxi &b) +{ + return avxi(a) < b; +} + +__forceinline const avxb operator>=(const avxi &a, const avxi &b) +{ + return !(a < b); +} +__forceinline const avxb operator>=(const avxi &a, const int32_t b) +{ + return a >= avxi(b); +} +__forceinline const avxb operator>=(const int32_t a, const avxi &b) +{ + return avxi(a) >= b; +} + +#if defined(__KERNEL_AVX2__) +__forceinline const 
avxb operator>(const avxi &a, const avxi &b) +{ + return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a.m256, b.m256)); +} +#else +__forceinline const avxb operator>(const avxi &a, const avxi &b) +{ + return avxb(_mm_castsi128_ps(_mm_cmpgt_epi32(a.l, b.l)), + _mm_castsi128_ps(_mm_cmpgt_epi32(a.h, b.h))); +} +#endif +__forceinline const avxb operator>(const avxi &a, const int32_t b) +{ + return a > avxi(b); +} +__forceinline const avxb operator>(const int32_t a, const avxi &b) +{ + return avxi(a) > b; +} + +__forceinline const avxb operator<=(const avxi &a, const avxi &b) +{ + return !(a > b); +} +__forceinline const avxb operator<=(const avxi &a, const int32_t b) +{ + return a <= avxi(b); +} +__forceinline const avxb operator<=(const int32_t a, const avxi &b) +{ + return avxi(a) <= b; +} + +__forceinline const avxi select(const avxb &m, const avxi &t, const avxi &f) +{ + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Movement/Shifting/Shuffling Functions +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__KERNEL_AVX2__) +__forceinline avxi unpacklo(const avxi &a, const avxi &b) +{ + return _mm256_unpacklo_epi32(a.m256, b.m256); +} +__forceinline avxi unpackhi(const avxi &a, const avxi &b) +{ + return _mm256_unpackhi_epi32(a.m256, b.m256); +} +#else +__forceinline avxi unpacklo(const avxi &a, const avxi &b) +{ + return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); +} +__forceinline avxi unpackhi(const avxi &a, const avxi &b) +{ + return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); +} +#endif + +template<size_t i> __forceinline const avxi shuffle(const avxi &a) +{ + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i))); +} + +template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a) +{ + return _mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0)); +} + +template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a, const avxi &b) +{ + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> +__forceinline const avxi shuffle(const avxi &a) +{ + return _mm256_castps_si256( + _mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0))); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> +__forceinline const avxi shuffle(const avxi &a, const avxi &b) +{ + return _mm256_castps_si256(_mm256_shuffle_ps( + _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); +} + +template<> __forceinline const avxi shuffle<0, 0, 2, 2>(const avxi &b) +{ + return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b))); +} +template<> __forceinline const avxi shuffle<1, 1, 3, 3>(const avxi &b) +{ + return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b))); +} +template<> __forceinline const avxi shuffle<0, 1, 0, 1>(const avxi &b) +{ + return _mm256_castps_si256( + _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b))))); +} + +__forceinline const avxi broadcast(const int *ptr) +{ + return _mm256_castps_si256(_mm256_broadcast_ss((const float *)ptr)); +} +template<size_t i> __forceinline const avxi insert(const avxi &a, const ssei &b) +{ + return _mm256_insertf128_si256(a, b, i); +} +template<size_t i> __forceinline const ssei 
extract(const avxi &a) +{ + return _mm256_extractf128_si256(a, i); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Reductions +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const avxi vreduce_min2(const avxi &v) +{ + return min(v, shuffle<1, 0, 3, 2>(v)); +} +__forceinline const avxi vreduce_min4(const avxi &v) +{ + avxi v1 = vreduce_min2(v); + return min(v1, shuffle<2, 3, 0, 1>(v1)); +} +__forceinline const avxi vreduce_min(const avxi &v) +{ + avxi v1 = vreduce_min4(v); + return min(v1, shuffle<1, 0>(v1)); +} + +__forceinline const avxi vreduce_max2(const avxi &v) +{ + return max(v, shuffle<1, 0, 3, 2>(v)); +} +__forceinline const avxi vreduce_max4(const avxi &v) +{ + avxi v1 = vreduce_max2(v); + return max(v1, shuffle<2, 3, 0, 1>(v1)); +} +__forceinline const avxi vreduce_max(const avxi &v) +{ + avxi v1 = vreduce_max4(v); + return max(v1, shuffle<1, 0>(v1)); +} + +__forceinline const avxi vreduce_add2(const avxi &v) +{ + return v + shuffle<1, 0, 3, 2>(v); +} +__forceinline const avxi vreduce_add4(const avxi &v) +{ + avxi v1 = vreduce_add2(v); + return v1 + shuffle<2, 3, 0, 1>(v1); +} +__forceinline const avxi vreduce_add(const avxi &v) +{ + avxi v1 = vreduce_add4(v); + return v1 + shuffle<1, 0>(v1); +} + +__forceinline int reduce_min(const avxi &v) +{ + return extract<0>(extract<0>(vreduce_min(v))); +} +__forceinline int reduce_max(const avxi &v) +{ + return extract<0>(extract<0>(vreduce_max(v))); +} +__forceinline int reduce_add(const avxi &v) +{ + return extract<0>(extract<0>(vreduce_add(v))); +} + +__forceinline size_t select_min(const avxi &v) +{ + return __bsf(movemask(v == vreduce_min(v))); +} +__forceinline size_t select_max(const avxi &v) +{ + return __bsf(movemask(v == vreduce_max(v))); +} + +__forceinline size_t select_min(const avxb &valid, const avxi &v) +{ + const avxi a = select(valid, v, avxi(pos_inf)); + return __bsf(movemask(valid & (a == vreduce_min(a)))); +} +__forceinline size_t select_max(const avxb &valid, const avxi &v) +{ + const avxi a = select(valid, v, avxi(neg_inf)); + return __bsf(movemask(valid & (a == vreduce_max(a)))); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Output Operators +//////////////////////////////////////////////////////////////////////////////// + +ccl_device_inline void print_avxi(const char *label, const avxi &a) +{ + printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]); +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h index ca48758efcd..0021eec169b 100644 --- a/intern/cycles/util/util_hash.h +++ b/intern/cycles/util/util_hash.h @@ -312,6 +312,60 @@ ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw) return c; } +# if defined(__KERNEL_AVX__) +ccl_device_inline avxi hash_avxi(avxi kx) +{ + avxi a, b, c; + a = b = c = avxi(0xdeadbeef + (1 << 2) + 13); + + a += kx; + final(a, b, c); + + return c; +} + +ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky) +{ + avxi a, b, c; + a = b = c = avxi(0xdeadbeef + (2 << 2) + 13); + + b += ky; + a += kx; + final(a, b, c); + + return c; +} + +ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz) +{ + avxi a, b, c; + a = b = c = avxi(0xdeadbeef + (3 << 2) + 13); + + c += kz; + b += ky; + a += kx; + final(a, b, c); + + return c; +} + +ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw) +{ + avxi a, b, c; + a = b = c = 
avxi(0xdeadbeef + (4 << 2) + 13); + + a += kx; + b += ky; + c += kz; + mix(a, b, c); + + a += kw; + final(a, b, c); + + return c; +} +# endif + # undef rot # undef final # undef mix diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index f49cfb4184d..de0e3c39f30 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -75,6 +75,28 @@ static struct FalseTy { } } False ccl_maybe_unused; +static struct ZeroTy { + __forceinline operator float() const + { + return 0; + } + __forceinline operator int() const + { + return 0; + } +} zero ccl_maybe_unused; + +static struct OneTy { + __forceinline operator float() const + { + return 1; + } + __forceinline operator int() const + { + return 1; + } +} one ccl_maybe_unused; + static struct NegInfTy { __forceinline operator float() const { @@ -97,6 +119,9 @@ static struct PosInfTy { } } inf ccl_maybe_unused, pos_inf ccl_maybe_unused; +static struct StepTy { +} step ccl_maybe_unused; + /* Intrinsics Functions */ # if defined(__BMI__) && defined(__GNUC__) @@ -563,6 +588,13 @@ __forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags) # endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */ +/* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves. + * _mm256_castps256_ps128 generates no instructions so this is just as efficient. */ +# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) +# undef _mm256_cvtss_f32 +# define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a))) +# endif + # else /* __KERNEL_SSE2__ */ /* This section is for utility functions which operates on non-register data diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index f6535848480..a721595667d 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -158,6 +158,7 @@ CCL_NAMESPACE_END # if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) # include "util/util_avxb.h" # include "util/util_avxf.h" +# include "util/util_avxi.h" # endif #endif |
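The new SSE `perlin_2d()` in svm_noise.h above packs the four lattice-corner gradients into one `ssef` and interpolates them with `bi_mix()`. For reference, here is a plain-scalar sketch of the same structure, assuming a simplified placeholder lattice hash (Cycles actually uses the Jenkins-style `hash_ssei2`); it is illustration only, not the Cycles source:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

/* Quintic fade curve 6t^5 - 15t^4 + 10t^3, the same polynomial as fade(). */
static float fade(float t)
{
  return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
}

static float lerp(float a, float b, float t)
{
  return a + t * (b - a);
}

/* 2D gradient: pick (x, y) or (y, x) from the low bits of the hash and flip
 * signs from bits 0 and 1, mirroring grad() and negate_if_nth_bit. */
static float grad2(uint32_t h, float x, float y)
{
  h &= 7;
  const float u = (h < 4) ? x : y;
  const float v = 2.0f * ((h < 4) ? y : x);
  return ((h & 1) ? -u : u) + ((h & 2) ? -v : v);
}

/* Placeholder lattice hash for illustration only; Cycles uses hash_uint2(). */
static uint32_t hash2(int x, int y)
{
  uint32_t h = (uint32_t)x * 73856093u ^ (uint32_t)y * 19349663u;
  h ^= h >> 13;
  return h * 0x85ebca6bu;
}

/* Scalar equivalent of the SSE perlin_2d(): hash the four lattice corners,
 * evaluate a gradient at each with its local offset, then bilinearly
 * interpolate with the faded fractional coordinates. */
static float perlin_2d_scalar(float x, float y)
{
  const int X = (int)floorf(x), Y = (int)floorf(y);
  const float fx = x - X, fy = y - Y;
  const float u = fade(fx), v = fade(fy);

  const float g00 = grad2(hash2(X, Y), fx, fy);
  const float g10 = grad2(hash2(X + 1, Y), fx - 1.0f, fy);
  const float g01 = grad2(hash2(X, Y + 1), fx, fy - 1.0f);
  const float g11 = grad2(hash2(X + 1, Y + 1), fx - 1.0f, fy - 1.0f);

  return lerp(lerp(g00, g10, u), lerp(g01, g11, u), v);
}

int main()
{
  /* Sample the noise on a tiny grid. */
  for (int j = 0; j < 3; j++) {
    for (int i = 0; i < 3; i++) {
      printf("%8.4f ", perlin_2d_scalar(i * 0.37f, j * 0.53f));
    }
    printf("\n");
  }
  return 0;
}
```

The fade polynomial and the bit tricks in `grad2()` mirror `fade()` and `negate_if_nth_bit` from the diff; only the lattice hash differs, so the values will not match Cycles output, but the interpolation structure is the same one the SSE and AVX paths vectorize.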