Diffstat (limited to 'intern/cycles/kernel/device')
31 files changed, 4331 insertions, 0 deletions
diff --git a/intern/cycles/kernel/device/cpu/compat.h b/intern/cycles/kernel/device/cpu/compat.h
new file mode 100644
index 00000000000..888c0d5d872
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/compat.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_CPU__
+
+/* The release kernel produces too many false-positive maybe-uninitialized
+ * warnings, which makes it easy to miss actual warnings.
+ */
+#if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG)
+# pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+# pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
+#include "util/util_half.h"
+#include "util/util_math.h"
+#include "util/util_simd.h"
+#include "util/util_texture.h"
+#include "util/util_types.h"
+
+/* On x86_64, versions of glibc before 2.16 have an issue where expf is
+ * much slower than the double version.
+ */
+#if !defined(__KERNEL_GPU__) && defined(__x86_64__) && \
+    defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
+    (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16)
+# define expf(x) ((float)exp((double)(x)))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Assertions inside the kernel only work for the CPU device, so we wrap them
+ * in a macro which is empty for other devices. */
+
+#define kernel_assert(cond) assert(cond)
+
+/* Texture types, for compatibility with CUDA textures. These are really just
+ * simple arrays; after inlining, the fetch hopefully reverts to a simple
+ * pointer lookup. */
+template<typename T> struct texture {
+  ccl_always_inline const T &fetch(int index) const
+  {
+    kernel_assert(index >= 0 && index < width);
+    return data[index];
+  }
+
+  T *data;
+  int width;
+};
+
+/* SIMD vector types and debug printing helpers. */
+
+#ifdef __KERNEL_SSE2__
+typedef vector3<sseb> sse3b;
+typedef vector3<ssef> sse3f;
+typedef vector3<ssei> sse3i;
+
+ccl_device_inline void print_sse3b(const char *label, sse3b &a)
+{
+  print_sseb(label, a.x);
+  print_sseb(label, a.y);
+  print_sseb(label, a.z);
+}
+
+ccl_device_inline void print_sse3f(const char *label, sse3f &a)
+{
+  print_ssef(label, a.x);
+  print_ssef(label, a.y);
+  print_ssef(label, a.z);
+}
+
+ccl_device_inline void print_sse3i(const char *label, sse3i &a)
+{
+  print_ssei(label, a.x);
+  print_ssei(label, a.y);
+  print_ssei(label, a.z);
+}
+
+# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+typedef vector3<avxf> avx3f;
+# endif
+
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
new file mode 100644
index 00000000000..fb9aae38cfc
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
+ * the kernel, to access constant data. These are all stored as "textures", but
+ * they are really just standard arrays. We can't actually use globals because
+ * multiple renders may be running inside the same process. */
+
+#ifdef __OSL__
+struct OSLGlobals;
+struct OSLThreadData;
+struct OSLShadingSystem;
+#endif
+
+typedef struct KernelGlobalsCPU {
+#define KERNEL_TEX(type, name) texture<type> name;
+#include "kernel/kernel_textures.h"
+
+  KernelData __data;
+
+#ifdef __OSL__
+  /* On the CPU, we also have the OSL globals here. Most data structures are shared
+   * with SVM; the difference is in the shaders and object/mesh attributes. */
+  OSLGlobals *osl;
+  OSLShadingSystem *osl_ss;
+  OSLThreadData *osl_tdata;
+#endif
+
+  /* **** Run-time data **** */
+
+  ProfilingState profiler;
+} KernelGlobalsCPU;
+
+typedef const KernelGlobalsCPU *ccl_restrict KernelGlobals;
+
+/* Abstraction macros */
+#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_array(tex) (kg->tex.data)
+#define kernel_data (kg->__data)
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h
new file mode 100644
index 00000000000..93f956e354d
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/image.h
@@ -0,0 +1,657 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_NANOVDB
+# define NANOVDB_USE_INTRINSICS
+# include <nanovdb/NanoVDB.h>
+# include <nanovdb/util/SampleFromVoxels.h>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Make template functions private so symbols don't conflict between kernels with different
+ * instruction sets. */
+namespace {
+
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+  { \
+    u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
+    u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \
+    u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \
+    u[3] = (1.0f / 6.0f) * t * t * t; \
+  } \
+  (void)0
+
+ccl_device_inline float frac(float x, int *ix)
+{
+  int i = float_to_int(x) - ((x < 0.0f) ?
1 : 0); + *ix = i; + return x - (float)i; +} + +template<typename T> struct TextureInterpolator { + + static ccl_always_inline float4 read(float4 r) + { + return r; + } + + static ccl_always_inline float4 read(uchar4 r) + { + float f = 1.0f / 255.0f; + return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); + } + + static ccl_always_inline float4 read(uchar r) + { + float f = r * (1.0f / 255.0f); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline float4 read(float r) + { + /* TODO(dingto): Optimize this, so interpolation + * happens on float instead of float4 */ + return make_float4(r, r, r, 1.0f); + } + + static ccl_always_inline float4 read(half4 r) + { + return half4_to_float4_image(r); + } + + static ccl_always_inline float4 read(half r) + { + float f = half_to_float_image(r); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline float4 read(uint16_t r) + { + float f = r * (1.0f / 65535.0f); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline float4 read(ushort4 r) + { + float f = 1.0f / 65535.0f; + return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); + } + + static ccl_always_inline float4 read(const T *data, int x, int y, int width, int height) + { + if (x < 0 || y < 0 || x >= width || y >= height) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return read(data[y * width + x]); + } + + static ccl_always_inline int wrap_periodic(int x, int width) + { + x %= width; + if (x < 0) + x += width; + return x; + } + + static ccl_always_inline int wrap_clamp(int x, int width) + { + return clamp(x, 0, width - 1); + } + + /* ******** 2D interpolation ******** */ + + static ccl_always_inline float4 interp_closest(const TextureInfo &info, float x, float y) + { + const T *data = (const T *)info.data; + const int width = info.width; + const int height = info.height; + int ix, iy; + frac(x * (float)width, &ix); + frac(y * (float)height, &iy); + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + break; + case EXTENSION_CLIP: + if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return read(data[ix + iy * width]); + } + + static ccl_always_inline float4 interp_linear(const TextureInfo &info, float x, float y) + { + const T *data = (const T *)info.data; + const int width = info.width; + const int height = info.height; + int ix, iy, nix, niy; + const float tx = frac(x * (float)width - 0.5f, &ix); + const float ty = frac(y * (float)height - 0.5f, &iy); + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + nix = wrap_periodic(ix + 1, width); + niy = wrap_periodic(iy + 1, height); + break; + case EXTENSION_CLIP: + nix = ix + 1; + niy = iy + 1; + break; + case EXTENSION_EXTEND: + nix = wrap_clamp(ix + 1, width); + niy = wrap_clamp(iy + 1, height); + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return (1.0f - ty) * (1.0f - tx) * read(data, ix, iy, width, height) + + (1.0f - ty) * tx * read(data, nix, iy, width, height) + + ty * (1.0f - tx) * read(data, ix, niy, width, height) + + ty * tx * read(data, nix, niy, width, height); + } + + static ccl_always_inline float4 
interp_cubic(const TextureInfo &info, float x, float y) + { + const T *data = (const T *)info.data; + const int width = info.width; + const int height = info.height; + int ix, iy, nix, niy; + const float tx = frac(x * (float)width - 0.5f, &ix); + const float ty = frac(y * (float)height - 0.5f, &iy); + int pix, piy, nnix, nniy; + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + pix = wrap_periodic(ix - 1, width); + piy = wrap_periodic(iy - 1, height); + nix = wrap_periodic(ix + 1, width); + niy = wrap_periodic(iy + 1, height); + nnix = wrap_periodic(ix + 2, width); + nniy = wrap_periodic(iy + 2, height); + break; + case EXTENSION_CLIP: + pix = ix - 1; + piy = iy - 1; + nix = ix + 1; + niy = iy + 1; + nnix = ix + 2; + nniy = iy + 2; + break; + case EXTENSION_EXTEND: + pix = wrap_clamp(ix - 1, width); + piy = wrap_clamp(iy - 1, height); + nix = wrap_clamp(ix + 1, width); + niy = wrap_clamp(iy + 1, height); + nnix = wrap_clamp(ix + 2, width); + nniy = wrap_clamp(iy + 2, height); + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {piy, iy, niy, nniy}; + float u[4], v[4]; + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +#define DATA(x, y) (read(data, xc[x], yc[y], width, height)) +#define TERM(col) \ + (v[col] * \ + (u[0] * DATA(0, col) + u[1] * DATA(1, col) + u[2] * DATA(2, col) + u[3] * DATA(3, col))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + + /* Actual interpolation. */ + return TERM(0) + TERM(1) + TERM(2) + TERM(3); +#undef TERM +#undef DATA + } + + static ccl_always_inline float4 interp(const TextureInfo &info, float x, float y) + { + if (UNLIKELY(!info.data)) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + switch (info.interpolation) { + case INTERPOLATION_CLOSEST: + return interp_closest(info, x, y); + case INTERPOLATION_LINEAR: + return interp_linear(info, x, y); + default: + return interp_cubic(info, x, y); + } + } + + /* ******** 3D interpolation ******** */ + + static ccl_always_inline float4 interp_3d_closest(const TextureInfo &info, + float x, + float y, + float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + + frac(x * (float)width, &ix); + frac(y * (float)height, &iy); + frac(z * (float)depth, &iz); + + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + break; + case EXTENSION_CLIP: + if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const T *data = (const T *)info.data; + return read(data[ix + iy * width + iz * width * height]); + } + + static ccl_always_inline float4 interp_3d_linear(const TextureInfo &info, + float x, + float y, + float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + int nix, niy, niz; + + float tx = frac(x * (float)width - 0.5f, &ix); + float ty = frac(y * (float)height - 0.5f, &iy); + float tz = frac(z * 
(float)depth - 0.5f, &iz); + + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + nix = wrap_periodic(ix + 1, width); + niy = wrap_periodic(iy + 1, height); + niz = wrap_periodic(iz + 1, depth); + break; + case EXTENSION_CLIP: + if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + nix = wrap_clamp(ix + 1, width); + niy = wrap_clamp(iy + 1, height); + niz = wrap_clamp(iz + 1, depth); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const T *data = (const T *)info.data; + float4 r; + + r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) * + read(data[ix + iy * width + iz * width * height]); + r += (1.0f - tz) * (1.0f - ty) * tx * read(data[nix + iy * width + iz * width * height]); + r += (1.0f - tz) * ty * (1.0f - tx) * read(data[ix + niy * width + iz * width * height]); + r += (1.0f - tz) * ty * tx * read(data[nix + niy * width + iz * width * height]); + + r += tz * (1.0f - ty) * (1.0f - tx) * read(data[ix + iy * width + niz * width * height]); + r += tz * (1.0f - ty) * tx * read(data[nix + iy * width + niz * width * height]); + r += tz * ty * (1.0f - tx) * read(data[ix + niy * width + niz * width * height]); + r += tz * ty * tx * read(data[nix + niy * width + niz * width * height]); + + return r; + } + + /* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 are + * causing stack overflow issue in this function unless it is inlined. + * + * Only happens for AVX2 kernel and global __KERNEL_SSE__ vectorization + * enabled. + */ +#if defined(__GNUC__) || defined(__clang__) + static ccl_always_inline +#else + static ccl_never_inline +#endif + float4 + interp_3d_cubic(const TextureInfo &info, float x, float y, float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + int nix, niy, niz; + /* Tricubic b-spline interpolation. 
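
Before the tricubic variant that the comment above introduces, it may help to see the sampling convention the closest/linear paths share, in isolation. The sketch below is an editorial 1D illustration, not Cycles code: normalized coordinates are scaled to texel space, shifted by half a texel so samples land on texel centers, then split into an integer base index and a fractional blend weight. `std::floor` stands in for the kernel's `float_to_int`-based `frac()` helper, and clamping stands in for `EXTENSION_EXTEND`.

```cpp
// Standalone 1D sketch of the half-texel convention used by interp_linear
// and interp_3d_linear. Illustrative only; names are not from Cycles.
#include <algorithm>
#include <cmath>
#include <cstdio>

static float frac_floor(float x, int *ix)
{
  const int i = (int)std::floor(x);
  *ix = i;
  return x - (float)i;
}

/* Linear sample with EXTENSION_EXTEND-style clamping at the borders. */
static float sample_linear_clamp(const float *data, int width, float x)
{
  int ix;
  const float tx = frac_floor(x * (float)width - 0.5f, &ix);
  const int nix = std::min(std::max(ix + 1, 0), width - 1);
  ix = std::min(std::max(ix, 0), width - 1);
  return (1.0f - tx) * data[ix] + tx * data[nix];
}

int main()
{
  const float row[4] = {0.0f, 1.0f, 0.0f, 1.0f};
  /* x = 0.5 lies exactly halfway between the centers of texels 1 and 2. */
  std::printf("%f\n", sample_linear_clamp(row, 4, 0.5f)); /* prints 0.5 */
  return 0;
}
```
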
*/ + const float tx = frac(x * (float)width - 0.5f, &ix); + const float ty = frac(y * (float)height - 0.5f, &iy); + const float tz = frac(z * (float)depth - 0.5f, &iz); + int pix, piy, piz, nnix, nniy, nniz; + + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + pix = wrap_periodic(ix - 1, width); + piy = wrap_periodic(iy - 1, height); + piz = wrap_periodic(iz - 1, depth); + + nix = wrap_periodic(ix + 1, width); + niy = wrap_periodic(iy + 1, height); + niz = wrap_periodic(iz + 1, depth); + + nnix = wrap_periodic(ix + 2, width); + nniy = wrap_periodic(iy + 2, height); + nniz = wrap_periodic(iz + 2, depth); + break; + case EXTENSION_CLIP: + if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + pix = wrap_clamp(ix - 1, width); + piy = wrap_clamp(iy - 1, height); + piz = wrap_clamp(iz - 1, depth); + + nix = wrap_clamp(ix + 1, width); + niy = wrap_clamp(iy + 1, height); + niz = wrap_clamp(iz + 1, depth); + + nnix = wrap_clamp(ix + 2, width); + nniy = wrap_clamp(iy + 2, height); + nniz = wrap_clamp(iz + 2, depth); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {width * piy, width * iy, width * niy, width * nniy}; + const int zc[4] = { + width * height * piz, width * height * iz, width * height * niz, width * height * nniz}; + float u[4], v[4], w[4]; + + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]])) +#define COL_TERM(col, row) \ + (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \ + u[3] * DATA(3, col, row))) +#define ROW_TERM(row) \ + (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + /* Actual interpolation. */ + const T *data = (const T *)info.data; + return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + +#undef COL_TERM +#undef ROW_TERM +#undef DATA + } + + static ccl_always_inline float4 + interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp) + { + if (UNLIKELY(!info.data)) + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + switch ((interp == INTERPOLATION_NONE) ? 
info.interpolation : interp) { + case INTERPOLATION_CLOSEST: + return interp_3d_closest(info, x, y, z); + case INTERPOLATION_LINEAR: + return interp_3d_linear(info, x, y, z); + default: + return interp_3d_cubic(info, x, y, z); + } + } +}; + +#ifdef WITH_NANOVDB +template<typename T> struct NanoVDBInterpolator { + + typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType; + + static ccl_always_inline float4 read(float r) + { + return make_float4(r, r, r, 1.0f); + } + + static ccl_always_inline float4 read(nanovdb::Vec3f r) + { + return make_float4(r[0], r[1], r[2], 1.0f); + } + + static ccl_always_inline float4 interp_3d_closest(const AccessorType &acc, + float x, + float y, + float z) + { + const nanovdb::Vec3f xyz(x, y, z); + return read(nanovdb::SampleFromVoxels<AccessorType, 0, false>(acc)(xyz)); + } + + static ccl_always_inline float4 interp_3d_linear(const AccessorType &acc, + float x, + float y, + float z) + { + const nanovdb::Vec3f xyz(x - 0.5f, y - 0.5f, z - 0.5f); + return read(nanovdb::SampleFromVoxels<AccessorType, 1, false>(acc)(xyz)); + } + +# if defined(__GNUC__) || defined(__clang__) + static ccl_always_inline +# else + static ccl_never_inline +# endif + float4 + interp_3d_cubic(const AccessorType &acc, float x, float y, float z) + { + int ix, iy, iz; + int nix, niy, niz; + int pix, piy, piz; + int nnix, nniy, nniz; + /* Tricubic b-spline interpolation. */ + const float tx = frac(x - 0.5f, &ix); + const float ty = frac(y - 0.5f, &iy); + const float tz = frac(z - 0.5f, &iz); + pix = ix - 1; + piy = iy - 1; + piz = iz - 1; + nix = ix + 1; + niy = iy + 1; + niz = iz + 1; + nnix = ix + 2; + nniy = iy + 2; + nniz = iz + 2; + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {piy, iy, niy, nniy}; + const int zc[4] = {piz, iz, niz, nniz}; + float u[4], v[4], w[4]; + + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +# define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z])))) +# define COL_TERM(col, row) \ + (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \ + u[3] * DATA(3, col, row))) +# define ROW_TERM(row) \ + (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + /* Actual interpolation. */ + return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + +# undef COL_TERM +# undef ROW_TERM +# undef DATA + } + + static ccl_always_inline float4 + interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp) + { + using namespace nanovdb; + + NanoGrid<T> *const grid = (NanoGrid<T> *)info.data; + AccessorType acc = grid->getAccessor(); + + switch ((interp == INTERPOLATION_NONE) ? 
info.interpolation : interp) { + case INTERPOLATION_CLOSEST: + return interp_3d_closest(acc, x, y, z); + case INTERPOLATION_LINEAR: + return interp_3d_linear(acc, x, y, z); + default: + return interp_3d_cubic(acc, x, y, z); + } + } +}; +#endif + +#undef SET_CUBIC_SPLINE_WEIGHTS + +ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y) +{ + const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + + switch (info.data_type) { + case IMAGE_DATA_TYPE_HALF: + return TextureInterpolator<half>::interp(info, x, y); + case IMAGE_DATA_TYPE_BYTE: + return TextureInterpolator<uchar>::interp(info, x, y); + case IMAGE_DATA_TYPE_USHORT: + return TextureInterpolator<uint16_t>::interp(info, x, y); + case IMAGE_DATA_TYPE_FLOAT: + return TextureInterpolator<float>::interp(info, x, y); + case IMAGE_DATA_TYPE_HALF4: + return TextureInterpolator<half4>::interp(info, x, y); + case IMAGE_DATA_TYPE_BYTE4: + return TextureInterpolator<uchar4>::interp(info, x, y); + case IMAGE_DATA_TYPE_USHORT4: + return TextureInterpolator<ushort4>::interp(info, x, y); + case IMAGE_DATA_TYPE_FLOAT4: + return TextureInterpolator<float4>::interp(info, x, y); + default: + assert(0); + return make_float4( + TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A); + } +} + +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, + int id, + float3 P, + InterpolationType interp) +{ + const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + + if (info.use_transform_3d) { + P = transform_point(&info.transform_3d, P); + } + + switch (info.data_type) { + case IMAGE_DATA_TYPE_HALF: + return TextureInterpolator<half>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_BYTE: + return TextureInterpolator<uchar>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_USHORT: + return TextureInterpolator<uint16_t>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_FLOAT: + return TextureInterpolator<float>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_HALF4: + return TextureInterpolator<half4>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_BYTE4: + return TextureInterpolator<uchar4>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_USHORT4: + return TextureInterpolator<ushort4>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_FLOAT4: + return TextureInterpolator<float4>::interp_3d(info, P.x, P.y, P.z, interp); +#ifdef WITH_NANOVDB + case IMAGE_DATA_TYPE_NANOVDB_FLOAT: + return NanoVDBInterpolator<float>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_NANOVDB_FLOAT3: + return NanoVDBInterpolator<nanovdb::Vec3f>::interp_3d(info, P.x, P.y, P.z, interp); +#endif + default: + assert(0); + return make_float4( + TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A); + } +} + +} /* Namespace. */ + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp new file mode 100644 index 00000000000..8519b77aa08 --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel.cpp @@ -0,0 +1,94 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. + */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) +/* do nothing */ +#endif + +#include "kernel/device/cpu/kernel.h" +#define KERNEL_ARCH cpu +#include "kernel/device/cpu/kernel_arch_impl.h" + +CCL_NAMESPACE_BEGIN + +/* Memory Copy */ + +void kernel_const_copy(KernelGlobalsCPU *kg, const char *name, void *host, size_t) +{ + if (strcmp(name, "__data") == 0) { + kg->__data = *(KernelData *)host; + } + else { + assert(0); + } +} + +void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem, size_t size) +{ + if (0) { + } + +#define KERNEL_TEX(type, tname) \ + else if (strcmp(name, #tname) == 0) \ + { \ + kg->tname.data = (type *)mem; \ + kg->tname.width = size; \ + } +#include "kernel/kernel_textures.h" + else { + assert(0); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h new file mode 100644 index 00000000000..28337a58898 --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel.h @@ -0,0 +1,62 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
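
The `#include "kernel/kernel_textures.h"` trick in `kernel_global_memory_copy` above is a classic X-macro: the header is a flat list of `KERNEL_TEX(type, name)` entries, and each includer defines `KERNEL_TEX` to stamp out what it needs (struct fields in `globals.h`, a name-matching else-if chain here). A reduced, self-contained sketch of the pattern follows; the list and all names are hypothetical stand-ins, not the real texture list.

```cpp
#include <cassert>
#include <cstddef>
#include <cstring>

/* Hypothetical stand-in for kernel/kernel_textures.h: a re-includable list. */
#define DEMO_TEX_LIST \
  DEMO_TEX(float, tri_verts) \
  DEMO_TEX(int, object_flag)

/* First expansion: one texture<T>-like member per entry. */
struct DemoGlobals {
#define DEMO_TEX(type, name) \
  struct { \
    type *data; \
    size_t width; \
  } name;
  DEMO_TEX_LIST
#undef DEMO_TEX
};

/* Second expansion: the name -> member else-if chain, as in
 * kernel_global_memory_copy above. */
static void demo_memory_copy(DemoGlobals *kg, const char *name, void *mem, size_t size)
{
  if (0) {
  }
#define DEMO_TEX(type, tname) \
  else if (strcmp(name, #tname) == 0) \
  { \
    kg->tname.data = (type *)mem; \
    kg->tname.width = size; \
  }
  DEMO_TEX_LIST
#undef DEMO_TEX
  else {
    assert(!"unknown texture name");
  }
}
```

A call like `demo_memory_copy(&g, "tri_verts", buffer, count)` binds the pointer the same way the real function does for every entry in the real list, without repeating the field names anywhere by hand.
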
+ */ + +#pragma once + +/* CPU Kernel Interface */ + +#include "util/util_types.h" + +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z +#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) +#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) + +struct IntegratorStateCPU; +struct KernelGlobalsCPU; +struct KernelData; + +KernelGlobalsCPU *kernel_globals_create(); +void kernel_globals_free(KernelGlobalsCPU *kg); + +void *kernel_osl_memory(const KernelGlobalsCPU *kg); +bool kernel_osl_use(const KernelGlobalsCPU *kg); + +void kernel_const_copy(KernelGlobalsCPU *kg, const char *name, void *host, size_t size); +void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem, size_t size); + +#define KERNEL_ARCH cpu +#include "kernel/device/cpu/kernel_arch.h" + +#define KERNEL_ARCH cpu_sse2 +#include "kernel/device/cpu/kernel_arch.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/device/cpu/kernel_arch.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/device/cpu/kernel_arch.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/device/cpu/kernel_arch.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/device/cpu/kernel_arch.h" + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h new file mode 100644 index 00000000000..432ac5e15a9 --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_arch.h @@ -0,0 +1,110 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common declaration part of all CPU kernels. */ + +/* -------------------------------------------------------------------- + * Integrator. 
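
The three-macro indirection in `kernel.h` above exists because macro arguments adjacent to the `##` paste operator are not expanded: the extra `KERNEL_NAME_EVAL` level forces `KERNEL_ARCH` to expand to its value before `KERNEL_NAME_JOIN` pastes the tokens. Traced by hand for one of the `#define KERNEL_ARCH` blocks (my expansion, not preprocessor output):

```cpp
#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)

#define KERNEL_ARCH cpu_avx2
/* KERNEL_FUNCTION_FULL_NAME(integrator_megakernel)
 *   -> KERNEL_NAME_EVAL(KERNEL_ARCH, integrator_megakernel)
 *      (KERNEL_ARCH expands here, before any ## is seen)
 *   -> KERNEL_NAME_JOIN(kernel, cpu_avx2, integrator_megakernel)
 *   -> kernel_cpu_avx2_integrator_megakernel
 * Pasting directly in one macro would instead produce the literal token
 * kernel_KERNEL_ARCH_integrator_megakernel. */
void KERNEL_FUNCTION_FULL_NAME(integrator_megakernel)();
```
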
+ */ + +#define KERNEL_INTEGRATOR_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \ + IntegratorStateCPU *state) + +#define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \ + IntegratorStateCPU *state, \ + ccl_global float *render_buffer) + +#define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \ + IntegratorStateCPU *state, \ + KernelWorkTile *tile, \ + ccl_global float *render_buffer) + +KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera); +KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake); +KERNEL_INTEGRATOR_FUNCTION(intersect_closest); +KERNEL_INTEGRATOR_FUNCTION(intersect_shadow); +KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface); +KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_background); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_light); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_shadow); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_surface); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_volume); +KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel); + +#undef KERNEL_INTEGRATOR_FUNCTION +#undef KERNEL_INTEGRATOR_INIT_FUNCTION +#undef KERNEL_INTEGRATOR_SHADE_FUNCTION + +/* -------------------------------------------------------------------- + * Shader evaluation. + */ + +void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg, + const KernelShaderEvalInput *input, + float *output, + const int offset); +void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg, + const KernelShaderEvalInput *input, + float *output, + const int offset); +void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)( + const KernelGlobalsCPU *kg, + const KernelShaderEvalInput *input, + float *output, + const int offset); + +/* -------------------------------------------------------------------- + * Adaptive sampling. + */ + +bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( + const KernelGlobalsCPU *kg, + ccl_global float *render_buffer, + int x, + int y, + float threshold, + bool reset, + int offset, + int stride); + +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride); +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride); + +/* -------------------------------------------------------------------- + * Cryptomatte. + */ + +void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg, + ccl_global float *render_buffer, + int pixel_index); + +#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h new file mode 100644 index 00000000000..ba777062113 --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h @@ -0,0 +1,243 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that each particular .cpp file sets the needed optimization
+ * flags and simply includes this file, without having to copy the actual
+ * implementation over.
+ */
+
+#pragma once
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+
+#ifndef KERNEL_STUB
+# include "kernel/device/cpu/globals.h"
+# include "kernel/device/cpu/image.h"
+
+# include "kernel/integrator/integrator_state.h"
+# include "kernel/integrator/integrator_state_flow.h"
+# include "kernel/integrator/integrator_state_util.h"
+
+# include "kernel/integrator/integrator_init_from_camera.h"
+# include "kernel/integrator/integrator_init_from_bake.h"
+# include "kernel/integrator/integrator_intersect_closest.h"
+# include "kernel/integrator/integrator_intersect_shadow.h"
+# include "kernel/integrator/integrator_intersect_subsurface.h"
+# include "kernel/integrator/integrator_intersect_volume_stack.h"
+# include "kernel/integrator/integrator_shade_background.h"
+# include "kernel/integrator/integrator_shade_light.h"
+# include "kernel/integrator/integrator_shade_shadow.h"
+# include "kernel/integrator/integrator_shade_surface.h"
+# include "kernel/integrator/integrator_shade_volume.h"
+# include "kernel/integrator/integrator_megakernel.h"
+
+# include "kernel/kernel_film.h"
+# include "kernel/kernel_adaptive_sampling.h"
+# include "kernel/kernel_bake.h"
+# include "kernel/kernel_id_passes.h"
+
+#else
+# define STUB_ASSERT(arch, name) \
+    assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif /* KERNEL_STUB */
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#ifdef KERNEL_STUB
+# define KERNEL_INVOKE(name, ...) (STUB_ASSERT(KERNEL_ARCH, name), 0)
+#else
+# define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__)
+#endif
+
+/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so
+ * that it does not contain unused fields.
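
`KERNEL_INVOKE` above is the pivot of the whole file: when an optimized translation unit is compiled on a toolchain that cannot emit the target ISA, it defines `KERNEL_STUB` and every generated entry point collapses into an assert while still linking (the trailing `, 0` keeps the `bool`-returning init kernels well-formed). A toy reduction of the two expansions, with simplified non-variadic macros that are my own, not the real ones:

```cpp
#include <cassert>

static void demo_impl()
{
  /* stands in for the shared kernel implementation */
}

/* Compile with -DDEMO_STUB to get the asserting variant. */
#ifdef DEMO_STUB
# define DEMO_INVOKE(name) (assert(!(#name " kernel stub was called!")), (void)0)
#else
# define DEMO_INVOKE(name) name##_impl()
#endif

void demo_entry_point()
{
  DEMO_INVOKE(demo); /* expands to demo_impl() or to a failing assert */
}
```
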
*/ +#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ + IntegratorStateCPU *state, \ + KernelWorkTile *tile, \ + ccl_global float *render_buffer) \ + { \ + return KERNEL_INVOKE( \ + name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \ + } + +#define DEFINE_INTEGRATOR_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ + IntegratorStateCPU *state) \ + { \ + KERNEL_INVOKE(name, kg, state); \ + } + +#define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \ + const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \ + { \ + KERNEL_INVOKE(name, kg, state, render_buffer); \ + } + +#define DEFINE_INTEGRATOR_SHADOW_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ + IntegratorStateCPU *state) \ + { \ + KERNEL_INVOKE(name, kg, &state->shadow); \ + } + +#define DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \ + const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \ + { \ + KERNEL_INVOKE(name, kg, &state->shadow, render_buffer); \ + } + +DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera) +DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake) +DEFINE_INTEGRATOR_KERNEL(intersect_closest) +DEFINE_INTEGRATOR_KERNEL(intersect_subsurface) +DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume) +DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel) +DEFINE_INTEGRATOR_SHADOW_KERNEL(intersect_shadow) +DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(shade_shadow) + +/* -------------------------------------------------------------------- + * Shader evaluation. + */ + +void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg, + const KernelShaderEvalInput *input, + float *output, + const int offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader_eval_displace); +#else + kernel_displace_evaluate(kg, input, output, offset); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg, + const KernelShaderEvalInput *input, + float *output, + const int offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader_eval_background); +#else + kernel_background_evaluate(kg, input, output, offset); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)( + const KernelGlobalsCPU *kg, + const KernelShaderEvalInput *input, + float *output, + const int offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader_eval_curve_shadow_transparency); +#else + kernel_curve_shadow_transparency_evaluate(kg, input, output, offset); +#endif +} + +/* -------------------------------------------------------------------- + * Adaptive sampling. 
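
For orientation, this is what one of the `DEFINE_INTEGRATOR_SHADE_KERNEL` lines above turns into once `KERNEL_FUNCTION_FULL_NAME` has done its pasting. Hand-expanded for `KERNEL_ARCH == cpu_avx2` in the non-stub case, so treat it as illustrative rather than verbatim preprocessor output:

```cpp
/* DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface), hand-expanded: */
void kernel_cpu_avx2_integrator_shade_surface(const KernelGlobalsCPU *kg,
                                              IntegratorStateCPU *state,
                                              ccl_global float *render_buffer)
{
  /* KERNEL_INVOKE(shade_surface, kg, state, render_buffer) */
  integrator_shade_surface(kg, state, render_buffer);
}
```
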
+ */ + +bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( + const KernelGlobalsCPU *kg, + ccl_global float *render_buffer, + int x, + int y, + float threshold, + bool reset, + int offset, + int stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check); + return false; +#else + return kernel_adaptive_sampling_convergence_check( + kg, render_buffer, x, y, threshold, reset, offset, stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x); +#else + kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y); +#else + kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride); +#endif +} + +/* -------------------------------------------------------------------- + * Cryptomatte. + */ + +void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg, + ccl_global float *render_buffer, + int pixel_index) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess); +#else + kernel_cryptomatte_post(kg, render_buffer, pixel_index); +#endif +} + +#undef KERNEL_INVOKE +#undef DEFINE_INTEGRATOR_KERNEL +#undef DEFINE_INTEGRATOR_SHADE_KERNEL +#undef DEFINE_INTEGRATOR_INIT_KERNEL + +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp new file mode 100644 index 00000000000..220768036ab --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug T36316. 
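
Each of these per-ISA translation units compiles the same `kernel_arch_impl.h` under different compiler flags, producing parallel symbol families (`kernel_cpu_sse2_*`, `kernel_cpu_avx_*`, and so on). The host side can then bind one family at runtime based on CPU feature detection. The function-pointer sketch below is hypothetical: the real Cycles dispatcher lives elsewhere and differs in detail, though `system_cpu_support_sse2()`/`system_cpu_support_avx2()` mirror the naming used by Cycles' util layer.

```cpp
/* Hypothetical dispatch sketch, not the actual Cycles device code. */
struct DemoKernelTable {
  void (*megakernel)(const KernelGlobalsCPU *kg,
                     IntegratorStateCPU *state,
                     ccl_global float *render_buffer);
};

DemoKernelTable demo_select_kernels()
{
  DemoKernelTable t;
  if (system_cpu_support_avx2()) {
    t.megakernel = kernel_cpu_avx2_integrator_megakernel;
  }
  else if (system_cpu_support_sse2()) {
    t.megakernel = kernel_cpu_sse2_integrator_megakernel;
  }
  else {
    t.megakernel = kernel_cpu_integrator_megakernel; /* plain baseline */
  }
  return t;
}
```
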
*/ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/device/cpu/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp new file mode 100644 index 00000000000..90c05113cbe --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug T36316. */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/device/cpu/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp new file mode 100644 index 00000000000..fb85ef5b0d0 --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
new file mode 100644
index 00000000000..87baf04258a
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
new file mode 100644
index 00000000000..bb421d58815
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316.
*/ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/device/cpu/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h new file mode 100644 index 00000000000..8a50eb1a3d5 --- /dev/null +++ b/intern/cycles/kernel/device/cuda/compat.h @@ -0,0 +1,141 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#define __KERNEL_GPU__ +#define __KERNEL_CUDA__ +#define CCL_NAMESPACE_BEGIN +#define CCL_NAMESPACE_END + +#ifndef ATTR_FALLTHROUGH +# define ATTR_FALLTHROUGH +#endif + +/* Manual definitions so we can compile without CUDA toolkit. */ + +#ifdef __CUDACC_RTC__ +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; +#else +# include <stdint.h> +#endif + +#ifdef CYCLES_CUBIN_CC +# define FLT_MIN 1.175494350822287507969e-38f +# define FLT_MAX 340282346638528859811704183484516925440.0f +# define FLT_EPSILON 1.192092896e-07F +#endif + +/* Qualifiers */ + +#define ccl_device __device__ __inline__ +#if __CUDA_ARCH__ < 500 +# define ccl_device_inline __device__ __forceinline__ +# define ccl_device_forceinline __device__ __forceinline__ +#else +# define ccl_device_inline __device__ __inline__ +# define ccl_device_forceinline __device__ __forceinline__ +#endif +#define ccl_device_noinline __device__ __noinline__ +#define ccl_device_noinline_cpu ccl_device +#define ccl_global +#define ccl_static_constant __constant__ +#define ccl_device_constant __constant__ __device__ +#define ccl_constant const +#define ccl_gpu_shared __shared__ +#define ccl_private +#define ccl_may_alias +#define ccl_restrict __restrict__ +#define ccl_loop_no_unroll +#define ccl_align(n) __align__(n) +#define ccl_optional_struct_init + +/* No assert supported for CUDA */ + +#define kernel_assert(cond) + +/* GPU thread, block, grid size and index */ + +#define ccl_gpu_thread_idx_x (threadIdx.x) +#define ccl_gpu_block_dim_x (blockDim.x) +#define ccl_gpu_block_idx_x (blockIdx.x) +#define ccl_gpu_grid_dim_x (gridDim.x) +#define ccl_gpu_warp_size (warpSize) + +#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) +#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) + +/* GPU warp synchronization. 
*/
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y)
+{
+  return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y,
+                                                    const float z)
+{
+  return tex3D<T>(texobj, x, y, z);
+}
+
+/* Use fast math functions */
+
+#define cosf(x) __cosf(((float)(x)))
+#define sinf(x) __sinf(((float)(x)))
+#define powf(x, y) __powf(((float)(x)), ((float)(y)))
+#define tanf(x) __tanf(((float)(x)))
+#define logf(x) __logf(((float)(x)))
+#define expf(x) __expf(((float)(x)))
+
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+  half val;
+  asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+  return val;
+}
+
+__device__ float __half2float(const half h)
+{
+  float val;
+  asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
+  return val;
+}
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h
new file mode 100644
index 00000000000..46196dcdb51
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Device data taken from CUDA occupancy calculator.
+ *
+ * Terminology
+ * - CUDA GPUs have multiple streaming multiprocessors
+ * - Each multiprocessor executes multiple thread blocks
+ * - Each thread block contains a number of threads, also known as the block size
+ * - Multiprocessors have a fixed number of registers, and the number of registers
+ *   used by each thread limits the number of threads per block.
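
The occupancy comment above feeds directly into the `ccl_gpu_kernel` macro defined further down in this header: the second `__launch_bounds__` argument is derived as registers-per-multiprocessor divided by (threads-per-block x registers-per-thread). Worked through for the sm_30/sm_35 numbers in this file, as a compile-time restatement of the same arithmetic rather than new logic:

```cpp
/* Same arithmetic as the ccl_gpu_kernel() macro, restated as constexpr. */
constexpr int kRegistersPerMultiprocessor = 65536; /* GPU_MULTIPRESSOR_MAX_REGISTERS */
constexpr int kThreadsPerBlock = 256;              /* GPU_KERNEL_BLOCK_NUM_THREADS */
constexpr int kRegistersPerThread = 63;            /* GPU_KERNEL_MAX_REGISTERS */

constexpr int kMinBlocksPerMultiprocessor =
    kRegistersPerMultiprocessor / (kThreadsPerBlock * kRegistersPerThread);

/* 65536 / (256 * 63) = 65536 / 16128 = 4 in integer math, so the kernel is
 * declared __launch_bounds__(256, 4): the compiler must cap register usage so
 * that at least four such blocks fit on one multiprocessor. */
static_assert(kMinBlocksPerMultiprocessor == 4, "matches __launch_bounds__(256, 4)");
```
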
+ */ + +/* 3.0 and 3.5 */ +#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +# define GPU_KERNEL_MAX_REGISTERS 63 + +/* 3.2 */ +#elif __CUDA_ARCH__ == 320 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 32768 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +# define GPU_KERNEL_MAX_REGISTERS 63 + +/* 3.7 */ +#elif __CUDA_ARCH__ == 370 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +# define GPU_KERNEL_MAX_REGISTERS 63 + +/* 5.x, 6.x */ +#elif __CUDA_ARCH__ <= 699 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of + * registers */ +# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600 +# define GPU_KERNEL_MAX_REGISTERS 64 +# else +# define GPU_KERNEL_MAX_REGISTERS 48 +# endif + +/* 7.x, 8.x */ +#elif __CUDA_ARCH__ <= 899 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 512 +# define GPU_KERNEL_MAX_REGISTERS 96 + +/* unknown architecture */ +#else +# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" +#endif + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. */ + +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ + extern "C" __global__ void __launch_bounds__(block_num_threads, \ + GPU_MULTIPRESSOR_MAX_REGISTERS / \ + (block_num_threads * thread_num_registers)) + +/* sanity checks */ + +#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \ + GPU_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h new file mode 100644 index 00000000000..2c187cf8a23 --- /dev/null +++ b/intern/cycles/kernel/device/cuda/globals.h @@ -0,0 +1,49 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +#include "kernel/integrator/integrator_state.h" + +CCL_NAMESPACE_BEGIN + +/* Not actually used, just a NULL pointer that gets passed everywhere, which we + * hope gets optimized out by the compiler. */ +struct KernelGlobalsGPU { + int unused[1]; +}; +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; + +/* Global scene data and textures */ +__constant__ KernelData __data; +#define KERNEL_TEX(type, name) const __constant__ __device__ type *name; +#include "kernel/kernel_textures.h" + +/* Integrator state */ +__constant__ IntegratorStateGPU __integrator_state; + +/* Abstraction macros */ +#define kernel_data __data +#define kernel_tex_fetch(t, index) t[(index)] +#define kernel_tex_array(t) (t) +#define kernel_integrator_state __integrator_state + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/cuda/kernel.cu b/intern/cycles/kernel/device/cuda/kernel.cu new file mode 100644 index 00000000000..e26fe243642 --- /dev/null +++ b/intern/cycles/kernel/device/cuda/kernel.cu @@ -0,0 +1,28 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CUDA kernel entry points */ + +#ifdef __CUDA_ARCH__ + +# include "kernel/device/cuda/compat.h" +# include "kernel/device/cuda/config.h" +# include "kernel/device/cuda/globals.h" + +# include "kernel/device/gpu/image.h" +# include "kernel/device/gpu/kernel.h" + +#endif diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h new file mode 100644 index 00000000000..95a37c693ae --- /dev/null +++ b/intern/cycles/kernel/device/gpu/image.h @@ -0,0 +1,278 @@ +/* + * Copyright 2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +#ifdef WITH_NANOVDB +# define NDEBUG /* Disable "assert" in device code */ +# define NANOVDB_USE_INTRINSICS +# include "nanovdb/NanoVDB.h" +# include "nanovdb/util/SampleFromVoxels.h" +#endif + +/* w0, w1, w2, and w3 are the four cubic B-spline basis functions. 
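+ *
+ * They form a partition of unity: w0(a) + w1(a) + w2(a) + w3(a) = 1 for any
+ * a in [0, 1], and all four weights are non-negative there, so the filtered
+ * value is a convex combination of the four neighboring texels.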
*/ +ccl_device float cubic_w0(float a) +{ + return (1.0f / 6.0f) * (a * (a * (-a + 3.0f) - 3.0f) + 1.0f); +} +ccl_device float cubic_w1(float a) +{ + return (1.0f / 6.0f) * (a * a * (3.0f * a - 6.0f) + 4.0f); +} +ccl_device float cubic_w2(float a) +{ + return (1.0f / 6.0f) * (a * (a * (-3.0f * a + 3.0f) + 3.0f) + 1.0f); +} +ccl_device float cubic_w3(float a) +{ + return (1.0f / 6.0f) * (a * a * a); +} + +/* g0 and g1 are the two amplitude functions. */ +ccl_device float cubic_g0(float a) +{ + return cubic_w0(a) + cubic_w1(a); +} +ccl_device float cubic_g1(float a) +{ + return cubic_w2(a) + cubic_w3(a); +} + +/* h0 and h1 are the two offset functions */ +ccl_device float cubic_h0(float a) +{ + return (cubic_w1(a) / cubic_g0(a)) - 1.0f; +} +ccl_device float cubic_h1(float a) +{ + return (cubic_w3(a) / cubic_g1(a)) + 1.0f; +} + +/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */ +template<typename T> +ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y) +{ + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + + x = (x * info.width) - 0.5f; + y = (y * info.height) - 0.5f; + + float px = floorf(x); + float py = floorf(y); + float fx = x - px; + float fy = y - py; + + float g0x = cubic_g0(fx); + float g1x = cubic_g1(fx); + /* Note +0.5 offset to compensate for CUDA linear filtering convention. */ + float x0 = (px + cubic_h0(fx) + 0.5f) / info.width; + float x1 = (px + cubic_h1(fx) + 0.5f) / info.width; + float y0 = (py + cubic_h0(fy) + 0.5f) / info.height; + float y1 = (py + cubic_h1(fy) + 0.5f) / info.height; + + return cubic_g0(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y0) + + g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y0)) + + cubic_g1(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y1) + + g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y1)); +} + +/* Fast tricubic texture lookup using 8 trilinear lookups. */ +template<typename T> +ccl_device_noinline T +kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z) +{ + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + + x = (x * info.width) - 0.5f; + y = (y * info.height) - 0.5f; + z = (z * info.depth) - 0.5f; + + float px = floorf(x); + float py = floorf(y); + float pz = floorf(z); + float fx = x - px; + float fy = y - py; + float fz = z - pz; + + float g0x = cubic_g0(fx); + float g1x = cubic_g1(fx); + float g0y = cubic_g0(fy); + float g1y = cubic_g1(fy); + float g0z = cubic_g0(fz); + float g1z = cubic_g1(fz); + + /* Note +0.5 offset to compensate for CUDA linear filtering convention. 
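+   *
+   * With linear filtering the hardware samples texel i at normalized
+   * coordinate (i + 0.5) / N. A single fetch at px + cubic_h0(fx) + 0.5 then
+   * returns (w0 * f(px - 1) + w1 * f(px)) / g0(fx), so weighting the two
+   * fetches per axis by g0 and g1 reconstructs the full 4-tap cubic sum.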
*/ + float x0 = (px + cubic_h0(fx) + 0.5f) / info.width; + float x1 = (px + cubic_h1(fx) + 0.5f) / info.width; + float y0 = (py + cubic_h0(fy) + 0.5f) / info.height; + float y1 = (py + cubic_h1(fy) + 0.5f) / info.height; + float z0 = (pz + cubic_h0(fz) + 0.5f) / info.depth; + float z1 = (pz + cubic_h1(fz) + 0.5f) / info.depth; + + return g0z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z0) + + g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z0)) + + g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z0) + + g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z0))) + + g1z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z1) + + g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z1)) + + g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z1) + + g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z1))); +} + +#ifdef WITH_NANOVDB +template<typename T, typename S> +ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, float z) +{ + float px = floorf(x); + float py = floorf(y); + float pz = floorf(z); + float fx = x - px; + float fy = y - py; + float fz = z - pz; + + float g0x = cubic_g0(fx); + float g1x = cubic_g1(fx); + float g0y = cubic_g0(fy); + float g1y = cubic_g1(fy); + float g0z = cubic_g0(fz); + float g1z = cubic_g1(fz); + + float x0 = px + cubic_h0(fx); + float x1 = px + cubic_h1(fx); + float y0 = py + cubic_h0(fy); + float y1 = py + cubic_h1(fy); + float z0 = pz + cubic_h0(fz); + float z1 = pz + cubic_h1(fz); + + using namespace nanovdb; + + return g0z * (g0y * (g0x * s(Vec3f(x0, y0, z0)) + g1x * s(Vec3f(x1, y0, z0))) + + g1y * (g0x * s(Vec3f(x0, y1, z0)) + g1x * s(Vec3f(x1, y1, z0)))) + + g1z * (g0y * (g0x * s(Vec3f(x0, y0, z1)) + g1x * s(Vec3f(x1, y0, z1))) + + g1y * (g0x * s(Vec3f(x0, y1, z1)) + g1x * s(Vec3f(x1, y1, z1)))); +} + +template<typename T> +ccl_device_noinline T kernel_tex_image_interp_nanovdb( + const TextureInfo &info, float x, float y, float z, uint interpolation) +{ + using namespace nanovdb; + + NanoGrid<T> *const grid = (NanoGrid<T> *)info.data; + typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType; + AccessorType acc = grid->getAccessor(); + + switch (interpolation) { + case INTERPOLATION_CLOSEST: + return SampleFromVoxels<AccessorType, 0, false>(acc)(Vec3f(x, y, z)); + case INTERPOLATION_LINEAR: + return SampleFromVoxels<AccessorType, 1, false>(acc)(Vec3f(x - 0.5f, y - 0.5f, z - 0.5f)); + default: + SampleFromVoxels<AccessorType, 1, false> s(acc); + return kernel_tex_image_interp_tricubic_nanovdb<T>(s, x - 0.5f, y - 0.5f, z - 0.5f); + } +} +#endif + +ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y) +{ + const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + + /* float4, byte4, ushort4 and half4 */ + const int texture_type = info.data_type; + if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) { + if (info.interpolation == INTERPOLATION_CUBIC) { + return kernel_tex_image_interp_bicubic<float4>(info, x, y); + } + else { + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + return ccl_gpu_tex_object_read_2D<float4>(tex, x, y); + } + } + /* float, byte and half */ + else { + float f; + + if (info.interpolation == INTERPOLATION_CUBIC) { + f = kernel_tex_image_interp_bicubic<float>(info, x, y); + } + else { + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + f = ccl_gpu_tex_object_read_2D<float>(tex, x, y); + } + + return 
make_float4(f, f, f, 1.0f); + } +} + +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, + int id, + float3 P, + InterpolationType interp) +{ + const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + + if (info.use_transform_3d) { + P = transform_point(&info.transform_3d, P); + } + + const float x = P.x; + const float y = P.y; + const float z = P.z; + + uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp; + const int texture_type = info.data_type; + +#ifdef WITH_NANOVDB + if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) { + float f = kernel_tex_image_interp_nanovdb<float>(info, x, y, z, interpolation); + return make_float4(f, f, f, 1.0f); + } + if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + nanovdb::Vec3f f = kernel_tex_image_interp_nanovdb<nanovdb::Vec3f>( + info, x, y, z, interpolation); + return make_float4(f[0], f[1], f[2], 1.0f); + } +#endif + if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) { + if (interpolation == INTERPOLATION_CUBIC) { + return kernel_tex_image_interp_tricubic<float4>(info, x, y, z); + } + else { + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + return ccl_gpu_tex_object_read_3D<float4>(tex, x, y, z); + } + } + else { + float f; + + if (interpolation == INTERPOLATION_CUBIC) { + f = kernel_tex_image_interp_tricubic<float>(info, x, y, z); + } + else { + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + f = ccl_gpu_tex_object_read_3D<float>(tex, x, y, z); + } + + return make_float4(f, f, f, 1.0f); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h new file mode 100644 index 00000000000..335cb1ec0c0 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -0,0 +1,910 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Common GPU kernels. 
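+ *
+ * These kernels are shared between the GPU backends: each backend's compat.h,
+ * config.h and globals.h must be included first to supply the ccl_gpu_*
+ * abstractions and launch bounds (see kernel/device/cuda/kernel.cu and
+ * kernel/device/hip/kernel.cpp). On CUDA, for example, a definition such as
+ *
+ *   ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ *       kernel_gpu_integrator_reset(int num_states)
+ *
+ * expands to an extern "C" __global__ entry point whose __launch_bounds__ are
+ * computed from the register budget in config.h.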
*/ + +#include "kernel/device/gpu/parallel_active_index.h" +#include "kernel/device/gpu/parallel_prefix_sum.h" +#include "kernel/device/gpu/parallel_sorted_index.h" + +#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/integrator_state_flow.h" +#include "kernel/integrator/integrator_state_util.h" + +#include "kernel/integrator/integrator_init_from_bake.h" +#include "kernel/integrator/integrator_init_from_camera.h" +#include "kernel/integrator/integrator_intersect_closest.h" +#include "kernel/integrator/integrator_intersect_shadow.h" +#include "kernel/integrator/integrator_intersect_subsurface.h" +#include "kernel/integrator/integrator_intersect_volume_stack.h" +#include "kernel/integrator/integrator_shade_background.h" +#include "kernel/integrator/integrator_shade_light.h" +#include "kernel/integrator/integrator_shade_shadow.h" +#include "kernel/integrator/integrator_shade_surface.h" +#include "kernel/integrator/integrator_shade_volume.h" + +#include "kernel/kernel_adaptive_sampling.h" +#include "kernel/kernel_bake.h" +#include "kernel/kernel_film.h" +#include "kernel/kernel_work_stealing.h" + +/* -------------------------------------------------------------------- + * Integrator. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_reset(int num_states) +{ + const int state = ccl_gpu_global_id_x(); + + if (state < num_states) { + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles, + const int num_tiles, + float *render_buffer, + const int max_tile_work_size) +{ + const int work_index = ccl_gpu_global_id_x(); + + if (work_index >= max_tile_work_size * num_tiles) { + return; + } + + const int tile_index = work_index / max_tile_work_size; + const int tile_work_index = work_index - tile_index * max_tile_work_size; + + const KernelWorkTile *tile = &tiles[tile_index]; + + if (tile_work_index >= tile->work_size) { + return; + } + + const int state = tile->path_index_offset + tile_work_index; + + uint x, y, sample; + get_work_pixel(tile, tile_work_index, &x, &y, &sample); + + integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample); +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles, + const int num_tiles, + float *render_buffer, + const int max_tile_work_size) +{ + const int work_index = ccl_gpu_global_id_x(); + + if (work_index >= max_tile_work_size * num_tiles) { + return; + } + + const int tile_index = work_index / max_tile_work_size; + const int tile_work_index = work_index - tile_index * max_tile_work_size; + + const KernelWorkTile *tile = &tiles[tile_index]; + + if (tile_work_index >= tile->work_size) { + return; + } + + const int state = tile->path_index_offset + tile_work_index; + + uint x, y, sample; + get_work_pixel(tile, tile_work_index, &x, &y, &sample); + + integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample); +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; + integrator_intersect_closest(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_intersect_shadow(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_intersect_subsurface(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_intersect_volume_stack(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_background(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_background(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_light(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_light(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_shadow(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_shadow(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_surface(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_surface(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; + integrator_shade_surface_raytrace(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_volume(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_volume(NULL, state, render_buffer); + } +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_queued_paths_array(int num_states, + int *indices, + int *num_indices, + int kernel) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [kernel](const int state) { + return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_queued_shadow_paths_array(int num_states, + int *indices, + int *num_indices, + int kernel) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [kernel](const int state) { + return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [](const int state) { + return (INTEGRATOR_STATE(state, path, queued_kernel) != 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_terminated_paths_array(int num_states, + int *indices, + int *num_indices, + int indices_offset) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices + indices_offset, num_indices, [](const int state) { + return (INTEGRATOR_STATE(state, path, queued_kernel) == 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_terminated_shadow_paths_array(int num_states, + int *indices, + int *num_indices, + int indices_offset) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices + indices_offset, num_indices, [](const int state) { + return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_sorted_paths_array(int num_states, + int num_states_limit, + int *indices, + int *num_indices, + int *key_counter, + int *key_prefix_sum, + int kernel) +{ + gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, + num_states_limit, + indices, + num_indices, + key_counter, + key_prefix_sum, + [kernel](const int state) { + return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ? 
+ INTEGRATOR_STATE(state, path, shader_sort_key) : + GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_paths_array(int num_states, + int *indices, + int *num_indices, + int num_active_paths) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [num_active_paths](const int state) { + return (state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_states(const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int from_state = active_terminated_states[active_states_offset + global_index]; + const int to_state = active_terminated_states[terminated_states_offset + global_index]; + + integrator_state_move(NULL, to_state, from_state); + } +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_shadow_paths_array(int num_states, + int *indices, + int *num_indices, + int num_active_paths) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [num_active_paths](const int state) { + return (state >= num_active_paths) && + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int from_state = active_terminated_states[active_states_offset + global_index]; + const int to_state = active_terminated_states[terminated_states_offset + global_index]; + + integrator_shadow_state_move(NULL, to_state, from_state); + } +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) + kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values) +{ + gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>( + counter, prefix_sum, num_values); +} + +/* -------------------------------------------------------------------- + * Adaptive sampling. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer, + int sx, + int sy, + int sw, + int sh, + float threshold, + bool reset, + int offset, + int stride, + uint *num_active_pixels) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / sw; + const int x = work_index - y * sw; + + bool converged = true; + + if (x < sw && y < sh) { + converged = kernel_adaptive_sampling_convergence_check( + nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride); + } + + /* NOTE: All threads specified in the mask must execute the intrinsic. 
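+   *
+   * This is why `converged` is initialized to true and there is no early
+   * return for out-of-range threads: every thread in the warp still reaches
+   * the ballot, and converged pixels simply contribute 0 bits to the mask.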
*/ + const uint num_active_pixels_mask = ccl_gpu_ballot(!converged); + const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; + if (lane_id == 0) { + atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask)); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_adaptive_sampling_filter_x( + float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) +{ + const int y = ccl_gpu_global_id_x(); + + if (y < sh) { + kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_adaptive_sampling_filter_y( + float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) +{ + const int x = ccl_gpu_global_id_x(); + + if (x < sw) { + kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride); + } +} + +/* -------------------------------------------------------------------- + * Cryptomatte. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels) +{ + const int pixel_index = ccl_gpu_global_id_x(); + + if (pixel_index < num_pixels) { + kernel_cryptomatte_post(nullptr, render_buffer, pixel_index); + } +} + +/* -------------------------------------------------------------------- + * Film. + */ + +/* Common implementation for float destination. */ +template<typename Processor> +ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert, + float *pixels, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int dst_offset, + int dst_stride, + const Processor &processor) +{ + const int render_pixel_index = ccl_gpu_global_id_x(); + if (render_pixel_index >= num_pixels) { + return; + } + + const int x = render_pixel_index % width; + const int y = render_pixel_index / width; + + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride + + y * stride * kfilm_convert->pass_stride; + + ccl_global float *pixel = pixels + + (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride; + + processor(kfilm_convert, buffer, pixel); +} + +/* Common implementation for half4 destination and 4-channel input pass. */ +template<typename Processor> +ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( + const KernelFilmConvert *kfilm_convert, + uchar4 *rgba, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int rgba_offset, + int rgba_stride, + const Processor &processor) +{ + const int render_pixel_index = ccl_gpu_global_id_x(); + if (render_pixel_index >= num_pixels) { + return; + } + + const int x = render_pixel_index % width; + const int y = render_pixel_index / width; + + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride + + y * stride * kfilm_convert->pass_stride; + + float pixel[4]; + processor(kfilm_convert, buffer, pixel); + + film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); + + ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; + *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); +} + +/* Common implementation for half4 destination and 3-channel input pass. 
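+ *
+ * Wraps the 4-channel variant above with a lambda that runs the 3-channel
+ * processor and then forces the alpha channel to 1.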
*/ +template<typename Processor> +ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb( + const KernelFilmConvert *kfilm_convert, + uchar4 *rgba, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int rgba_offset, + int rgba_stride, + const Processor &processor) +{ + kernel_gpu_film_convert_half_rgba_common_rgba( + kfilm_convert, + rgba, + render_buffer, + num_pixels, + width, + offset, + stride, + rgba_offset, + rgba_stride, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + processor(kfilm_convert, buffer, pixel_rgba); + pixel_rgba[3] = 1.0f; + }); +} + +/* Common implementation for half4 destination and single channel input pass. */ +template<typename Processor> +ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value( + const KernelFilmConvert *kfilm_convert, + uchar4 *rgba, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int rgba_offset, + int rgba_stride, + const Processor &processor) +{ + kernel_gpu_film_convert_half_rgba_common_rgba( + kfilm_convert, + rgba, + render_buffer, + num_pixels, + width, + offset, + stride, + rgba_offset, + rgba_stride, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + float value; + processor(kfilm_convert, buffer, &value); + + pixel_rgba[0] = value; + pixel_rgba[1] = value; + pixel_rgba[2] = value; + pixel_rgba[3] = 1.0f; + }); +} + +#define KERNEL_FILM_CONVERT_PROC(name) \ + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name + +#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \ + KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \ + (const KernelFilmConvert kfilm_convert, \ + float *pixels, \ + float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ + { \ + kernel_gpu_film_convert_common(&kfilm_convert, \ + pixels, \ + render_buffer, \ + num_pixels, \ + width, \ + offset, \ + stride, \ + rgba_offset, \ + rgba_stride, \ + film_get_pass_pixel_##variant); \ + } \ + KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \ + (const KernelFilmConvert kfilm_convert, \ + uchar4 *rgba, \ + float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ + { \ + kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \ + rgba, \ + render_buffer, \ + num_pixels, \ + width, \ + offset, \ + stride, \ + rgba_offset, \ + rgba_stride, \ + film_get_pass_pixel_##variant); \ + } + +KERNEL_FILM_CONVERT_DEFINE(depth, value) +KERNEL_FILM_CONVERT_DEFINE(mist, value) +KERNEL_FILM_CONVERT_DEFINE(sample_count, value) +KERNEL_FILM_CONVERT_DEFINE(float, value) + +KERNEL_FILM_CONVERT_DEFINE(light_path, rgb) +KERNEL_FILM_CONVERT_DEFINE(float3, rgb) + +KERNEL_FILM_CONVERT_DEFINE(motion, rgba) +KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba) +KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba) +KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba) +KERNEL_FILM_CONVERT_DEFINE(combined, rgba) +KERNEL_FILM_CONVERT_DEFINE(float4, rgba) + +#undef KERNEL_FILM_CONVERT_DEFINE +#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE +#undef KERNEL_FILM_CONVERT_PROC + +/* -------------------------------------------------------------------- + * Shader evaluation. 
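+ *
+ * One input entry is evaluated per thread; used for displacement and
+ * background baking and for curve shadow transparency precomputation.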
+ */ + +/* Displacement */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input, + float *output, + const int offset, + const int work_size) +{ + int i = ccl_gpu_global_id_x(); + if (i < work_size) { + kernel_displace_evaluate(NULL, input, output, offset + i); + } +} + +/* Background */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_shader_eval_background(KernelShaderEvalInput *input, + float *output, + const int offset, + const int work_size) +{ + int i = ccl_gpu_global_id_x(); + if (i < work_size) { + kernel_background_evaluate(NULL, input, output, offset + i); + } +} + +/* Curve Shadow Transparency */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_shader_eval_curve_shadow_transparency(KernelShaderEvalInput *input, + float *output, + const int offset, + const int work_size) +{ + int i = ccl_gpu_global_id_x(); + if (i < work_size) { + kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i); + } +} + +/* -------------------------------------------------------------------- + * Denoising. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_filter_color_preprocess(float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int pass_denoised) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / width; + const int x = work_index - y * width; + + if (x >= width || y >= height) { + return; + } + + const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; + float *buffer = render_buffer + render_pixel_index * pass_stride; + + float *color_out = buffer + pass_denoised; + color_out[0] = clamp(color_out[0], 0.0f, 10000.0f); + color_out[1] = clamp(color_out[1], 0.0f, 10000.0f); + color_out[2] = clamp(color_out[2], 0.0f, 10000.0f); +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_filter_guiding_preprocess(float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int guiding_pass_normal, + const float *render_buffer, + int render_offset, + int render_stride, + int render_pass_stride, + int render_pass_sample_count, + int render_pass_denoising_albedo, + int render_pass_denoising_normal, + int full_x, + int full_y, + int width, + int height, + int num_samples) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / width; + const int x = work_index - y * width; + + if (x >= width || y >= height) { + return; + } + + const uint64_t guiding_pixel_index = x + y * width; + float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; + + const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride; + const float *buffer = render_buffer + render_pixel_index * render_pass_stride; + + float pixel_scale; + if (render_pass_sample_count == PASS_UNUSED) { + pixel_scale = 1.0f / num_samples; + } + else { + pixel_scale = 1.0f / __float_as_uint(buffer[render_pass_sample_count]); + } + + /* Albedo pass. 
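+   *
+   * pixel_scale turns the accumulated pass into an average over samples,
+   * using the per-pixel sample count pass when adaptive sampling stored one.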
+   */
+  if (guiding_pass_albedo != PASS_UNUSED) {
+    kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
+
+    const float *albedo_in = buffer + render_pass_denoising_albedo;
+    float *albedo_out = guiding_pixel + guiding_pass_albedo;
+
+    albedo_out[0] = albedo_in[0] * pixel_scale;
+    albedo_out[1] = albedo_in[1] * pixel_scale;
+    albedo_out[2] = albedo_in[2] * pixel_scale;
+  }
+
+  /* Normal pass. */
+  if (guiding_pass_normal != PASS_UNUSED) {
+    kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
+
+    const float *normal_in = buffer + render_pass_denoising_normal;
+    float *normal_out = guiding_pixel + guiding_pass_normal;
+
+    normal_out[0] = normal_in[0] * pixel_scale;
+    normal_out[1] = normal_in[1] * pixel_scale;
+    normal_out[2] = normal_in[2] * pixel_scale;
+  }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+    kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer,
+                                              int guiding_pass_stride,
+                                              int guiding_pass_albedo,
+                                              int width,
+                                              int height)
+{
+  kernel_assert(guiding_pass_albedo != PASS_UNUSED);
+
+  const int work_index = ccl_gpu_global_id_x();
+  const int y = work_index / width;
+  const int x = work_index - y * width;
+
+  if (x >= width || y >= height) {
+    return;
+  }
+
+  const uint64_t guiding_pixel_index = x + y * width;
+  float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+
+  float *albedo_out = guiding_pixel + guiding_pass_albedo;
+
+  albedo_out[0] = 0.5f;
+  albedo_out[1] = 0.5f;
+  albedo_out[2] = 0.5f;
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+    kernel_gpu_filter_color_postprocess(float *render_buffer,
+                                        int full_x,
+                                        int full_y,
+                                        int width,
+                                        int height,
+                                        int offset,
+                                        int stride,
+                                        int pass_stride,
+                                        int num_samples,
+                                        int pass_noisy,
+                                        int pass_denoised,
+                                        int pass_sample_count,
+                                        int num_components,
+                                        bool use_compositing)
+{
+  const int work_index = ccl_gpu_global_id_x();
+  const int y = work_index / width;
+  const int x = work_index - y * width;
+
+  if (x >= width || y >= height) {
+    return;
+  }
+
+  const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
+  float *buffer = render_buffer + render_pixel_index * pass_stride;
+
+  float pixel_scale;
+  if (pass_sample_count == PASS_UNUSED) {
+    pixel_scale = num_samples;
+  }
+  else {
+    pixel_scale = __float_as_uint(buffer[pass_sample_count]);
+  }
+
+  float *denoised_pixel = buffer + pass_denoised;
+
+  denoised_pixel[0] *= pixel_scale;
+  denoised_pixel[1] *= pixel_scale;
+  denoised_pixel[2] *= pixel_scale;
+
+  if (num_components == 3) {
+    /* Pass without alpha channel. */
+  }
+  else if (!use_compositing) {
+    /* Currently compositing passes are either 3-component (derived by dividing light passes)
+     * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it
+     * simplifies logic and avoids extra memory allocation. */
+    const float *noisy_pixel = buffer + pass_noisy;
+    denoised_pixel[3] = noisy_pixel[3];
+  }
+  else {
+    /* Assigning to zero since this is a default alpha value for 3-component passes, and it
+     * is an opaque pixel for 4-component passes. */
+    denoised_pixel[3] = 0;
+  }
+}
+
+/* --------------------------------------------------------------------
+ * Shadow catcher.
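+ *
+ * Counts paths that may split when hitting a shadow catcher, using the same
+ * warp-ballot reduction as the adaptive sampling convergence check above.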
+ */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states, + uint *num_possible_splits) +{ + const int state = ccl_gpu_global_id_x(); + + bool can_split = false; + + if (state < num_states) { + can_split = kernel_shadow_catcher_path_can_split(nullptr, state); + } + + /* NOTE: All threads specified in the mask must execute the intrinsic. */ + const uint can_split_mask = ccl_gpu_ballot(can_split); + const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; + if (lane_id == 0) { + atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask)); + } +} diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h new file mode 100644 index 00000000000..db4a4bf71e0 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -0,0 +1,87 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Given an array of states, build an array of indices for which the states + * are active. + * + * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */ + +#include "util/util_atomic.h" + +#ifdef __HIP__ +# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 +#endif + +template<uint blocksize, typename IsActiveOp> +__device__ void gpu_parallel_active_index_array(const uint num_states, + int *indices, + int *num_indices, + IsActiveOp is_active_op) +{ + extern ccl_gpu_shared int warp_offset[]; + + const uint thread_index = ccl_gpu_thread_idx_x; + const uint thread_warp = thread_index % ccl_gpu_warp_size; + + const uint warp_index = thread_index / ccl_gpu_warp_size; + const uint num_warps = blocksize / ccl_gpu_warp_size; + + /* Test if state corresponding to this thread is active. */ + const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index; + const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; + + /* For each thread within a warp compute how many other active states precede it. */ + const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp); + const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask); + + /* Last thread in warp stores number of active states for each warp. */ + if (thread_warp == ccl_gpu_warp_size - 1) { + warp_offset[warp_index] = thread_offset + is_active; + } + + ccl_gpu_syncthreads(); + + /* Last thread in block converts per-warp sizes to offsets, increments global size of + * index array and gets offset to write to. */ + if (thread_index == blocksize - 1) { + /* TODO: parallelize this. 
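+     *
+     * num_warps is only blocksize / warp size, so this serial scan is a
+     * handful of iterations; a shared-memory scan could replace it if it
+     * ever shows up in profiles.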
*/ + int offset = 0; + for (int i = 0; i < num_warps; i++) { + int num_active = warp_offset[i]; + warp_offset[i] = offset; + offset += num_active; + } + + const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; + warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); + } + + ccl_gpu_syncthreads(); + + /* Write to index array. */ + if (is_active) { + const uint block_offset = warp_offset[num_warps]; + indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h new file mode 100644 index 00000000000..aabe6e2e27a --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -0,0 +1,52 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Parallel prefix sum. + * + * TODO: actually make this work in parallel. + * + * This is used for an array the size of the number of shaders in the scene + * which is not usually huge, so might not be a significant bottleneck. */ + +#include "util/util_atomic.h" + +#ifdef __HIP__ +# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 +#endif + +template<uint blocksize> +__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values) +{ + if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) { + return; + } + + int offset = 0; + for (int i = 0; i < num_values; i++) { + const int new_offset = offset + counter[i]; + prefix_sum[i] = offset; + counter[i] = 0; + offset = new_offset; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h new file mode 100644 index 00000000000..b60dceb2ed0 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h @@ -0,0 +1,87 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Parallel sum of array input_data with size n into output_sum. + * + * Adapted from "Optimizing Parallel Reduction in GPU", Mark Harris. + * + * This version adds multiple elements per thread sequentially. This reduces + * the overall cost of the algorithm while keeping the work complexity O(n) and + * the step complexity O(log n). 
+ * (Brent's Theorem optimization) */
+
+#ifdef __HIP__
+# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 1024
+#else
+# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
+#endif
+
+template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp>
+__device__ void gpu_parallel_sum(
+    const InputT *input_data, const uint n, OutputT *output_sum, OutputT zero, ConvertOp convert)
+{
+  extern ccl_gpu_shared OutputT shared_data[];
+
+  const uint tid = ccl_gpu_thread_idx_x;
+  const uint gridsize = blocksize * ccl_gpu_grid_dim_x;
+
+  OutputT sum = zero;
+  for (uint i = ccl_gpu_block_idx_x * blocksize + tid; i < n; i += gridsize) {
+    sum += convert(input_data[i]);
+  }
+  shared_data[tid] = sum;
+
+  ccl_gpu_syncthreads();
+
+  if (blocksize >= 512 && tid < 256) {
+    shared_data[tid] = sum = sum + shared_data[tid + 256];
+  }
+
+  ccl_gpu_syncthreads();
+
+  if (blocksize >= 256 && tid < 128) {
+    shared_data[tid] = sum = sum + shared_data[tid + 128];
+  }
+
+  ccl_gpu_syncthreads();
+
+  if (blocksize >= 128 && tid < 64) {
+    shared_data[tid] = sum = sum + shared_data[tid + 64];
+  }
+
+  ccl_gpu_syncthreads();
+
+  if (blocksize >= 64 && tid < 32) {
+    shared_data[tid] = sum = sum + shared_data[tid + 32];
+  }
+
+  ccl_gpu_syncthreads();
+
+  if (tid < 32) {
+    for (int offset = ccl_gpu_warp_size / 2; offset > 0; offset /= 2) {
+      sum += ccl_gpu_shfl_down_sync(0xFFFFFFFF, sum, offset);
+    }
+  }
+
+  if (tid == 0) {
+    output_sum[ccl_gpu_block_idx_x] = sum;
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
new file mode 100644
index 00000000000..7570c5a6bbd
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Given an array of states, build an array of indices for which the states
+ * are active and sorted by a given key. The prefix sum of the number of active
+ * states per key must have already been computed.
+ *
+ * TODO: there may be ways to optimize this to avoid this many atomic ops? */
+
+#include "util/util_atomic.h"
+
+#ifdef __HIP__
+# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
+#else
+# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#endif
+#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
+
+template<uint blocksize, typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+                                                const int num_states_limit,
+                                                int *indices,
+                                                int *num_indices,
+                                                int *key_counter,
+                                                int *key_prefix_sum,
+                                                GetKeyOp get_key_op)
+{
+  const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
+  const int key = (state_index < num_states) ? get_key_op(state_index) :
+                                               GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
+
+  if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
+    const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
+    if (index < num_states_limit) {
+      /* Assign state index. */
+      indices[index] = state_index;
+    }
+    else {
+      /* Can't process this state now, increase the counter again so that
+       * it will be handled in another iteration. */
+      atomic_fetch_and_add_uint32(&key_counter[key], 1);
+    }
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
new file mode 100644
index 00000000000..089976d84e4
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_GPU__
+#define __KERNEL_HIP__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
+#endif
+
+#ifdef __HIPCC_RTC__
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+#else
+# include <stdint.h>
+#endif
+
+#ifdef CYCLES_HIPBIN_CC
+# define FLT_MIN 1.175494350822287507969e-38f
+# define FLT_MAX 340282346638528859811704183484516925440.0f
+# define FLT_EPSILON 1.192092896e-07F
+#endif
+
+/* Qualifiers */
+
+#define ccl_device __device__ __inline__
+#define ccl_device_inline __device__ __inline__
+#define ccl_device_forceinline __device__ __forceinline__
+#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_global
+#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
+#define ccl_constant const
+#define ccl_gpu_shared __shared__
+#define ccl_private
+#define ccl_may_alias
+#define ccl_restrict __restrict__
+#define ccl_loop_no_unroll
+#define ccl_align(n) __align__(n)
+#define ccl_optional_struct_init
+
+#define kernel_assert(cond)
+
+/* Types */
+#ifdef __HIP__
+# include "hip/hip_fp16.h"
+# include "hip/hip_runtime.h"
+#endif
+
+#ifdef _MSC_VER
+# include <immintrin.h>
+#endif
+
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
+
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
+
+/* GPU warp synchronization */
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot(predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down(var, delta)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+typedef hipTextureObject_t ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y)
+{
+  return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y,
+                                                    const float z)
+{
+  return tex3D<T>(texobj, x, y, z);
+}
+
+/* Use fast math functions */
+
+#define cosf(x) __cosf(((float)(x)))
+#define sinf(x) __sinf(((float)(x)))
+#define powf(x, y) __powf(((float)(x)), ((float)(y)))
+#define tanf(x) __tanf(((float)(x)))
+#define logf(x) __logf(((float)(x)))
+#define expf(x) __expf(((float)(x)))
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h
new file mode 100644
index 00000000000..2fde0d46015
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/config.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Device data taken from HIP occupancy calculator.
+ *
+ * Terminology
+ * - HIP GPUs have multiple streaming multiprocessors
+ * - Each multiprocessor executes multiple thread blocks
+ * - Each thread block contains a number of threads, also known as the block size
+ * - Multiprocessors have a fixed number of registers, and the number of registers
+ *   used by each thread limits the number of threads per block.
+ */
+
+/* Launch Bound Definitions */
+#define GPU_MULTIPROCESSOR_MAX_REGISTERS 65536
+#define GPU_MULTIPROCESSOR_MAX_BLOCKS 64
+#define GPU_BLOCK_MAX_THREADS 1024
+#define GPU_THREAD_MAX_REGISTERS 255
+
+#define GPU_KERNEL_BLOCK_NUM_THREADS 1024
+#define GPU_KERNEL_MAX_REGISTERS 64
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads, \
+                                               GPU_MULTIPROCESSOR_MAX_REGISTERS / \
+                                                   (block_num_threads * thread_num_registers))
+
+/* sanity checks */
+
+#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
+# error "Maximum number of threads per block exceeded"
+#endif
+
+#if GPU_MULTIPROCESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
+    GPU_MULTIPROCESSOR_MAX_BLOCKS
+# error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
+# error "Maximum number of registers per thread exceeded"
+#endif
diff --git a/intern/cycles/kernel/device/hip/globals.h b/intern/cycles/kernel/device/hip/globals.h
new file mode 100644
index 00000000000..28e1cc4282f
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/globals.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +#include "kernel/integrator/integrator_state.h" + +CCL_NAMESPACE_BEGIN + +/* Not actually used, just a NULL pointer that gets passed everywhere, which we + * hope gets optimized out by the compiler. */ +struct KernelGlobalsGPU { + int unused[1]; +}; +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; + +/* Global scene data and textures */ +__constant__ KernelData __data; +#define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name; +#include "kernel/kernel_textures.h" + +/* Integrator state */ +__constant__ IntegratorStateGPU __integrator_state; + +/* Abstraction macros */ +#define kernel_data __data +#define kernel_tex_fetch(t, index) t[(index)] +#define kernel_tex_array(t) (t) +#define kernel_integrator_state __integrator_state + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/hip/kernel.cpp b/intern/cycles/kernel/device/hip/kernel.cpp new file mode 100644 index 00000000000..c801320a2e1 --- /dev/null +++ b/intern/cycles/kernel/device/hip/kernel.cpp @@ -0,0 +1,28 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* HIP kernel entry points */ + +#ifdef __HIP_DEVICE_COMPILE__ + +# include "kernel/device/hip/compat.h" +# include "kernel/device/hip/config.h" +# include "kernel/device/hip/globals.h" + +# include "kernel/device/gpu/image.h" +# include "kernel/device/gpu/kernel.h" + +#endif diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h new file mode 100644 index 00000000000..77cea30914c --- /dev/null +++ b/intern/cycles/kernel/device/metal/compat.h @@ -0,0 +1,126 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
new file mode 100644
index 00000000000..77cea30914c
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_GPU__
+#define __KERNEL_METAL__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
+#endif
+
+#include <metal_atomic>
+#include <metal_pack>
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wsign-compare"
+
+/* Qualifiers */
+
+#define ccl_device
+#define ccl_device_inline ccl_device
+#define ccl_device_forceinline ccl_device
+#define ccl_device_noinline ccl_device __attribute__((noinline))
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_global device
+#define ccl_static_constant static constant constexpr
+#define ccl_device_constant constant
+#define ccl_constant const device
+#define ccl_gpu_shared threadgroup
+#define ccl_private thread
+#define ccl_may_alias
+#define ccl_restrict __restrict
+#define ccl_loop_no_unroll
+#define ccl_align(n) alignas(n)
+#define ccl_optional_struct_init
+
+/* No assert supported for Metal */
+
+#define kernel_assert(cond)
+
+/* make_type definitions with Metal-style element initializers. */
+#ifdef make_float2
+# undef make_float2
+#endif
+#ifdef make_float3
+# undef make_float3
+#endif
+#ifdef make_float4
+# undef make_float4
+#endif
+#ifdef make_int2
+# undef make_int2
+#endif
+#ifdef make_int3
+# undef make_int3
+#endif
+#ifdef make_int4
+# undef make_int4
+#endif
+#ifdef make_uchar4
+# undef make_uchar4
+#endif
+
+#define make_float2(x, y) float2(x, y)
+#define make_float3(x, y, z) float3(x, y, z)
+#define make_float4(x, y, z, w) float4(x, y, z, w)
+#define make_int2(x, y) int2(x, y)
+#define make_int3(x, y, z) int3(x, y, z)
+#define make_int4(x, y, z, w) int4(x, y, z, w)
+#define make_uchar4(x, y, z, w) uchar4(x, y, z, w)
+
+/* Math functions */
+
+#define __uint_as_float(x) as_type<float>(x)
+#define __float_as_uint(x) as_type<uint>(x)
+#define __int_as_float(x) as_type<float>(x)
+#define __float_as_int(x) as_type<int>(x)
+#define __float2half(x) half(x)
+#define powf(x, y) pow(float(x), float(y))
+#define fabsf(x) fabs(float(x))
+#define copysignf(x, y) copysign(float(x), float(y))
+#define asinf(x) asin(float(x))
+#define acosf(x) acos(float(x))
+#define atanf(x) atan(float(x))
+#define floorf(x) floor(float(x))
+#define ceilf(x) ceil(float(x))
+#define hypotf(x, y) hypot(float(x), float(y))
+#define atan2f(x, y) atan2(float(x), float(y))
+#define fmaxf(x, y) fmax(float(x), float(y))
+#define fminf(x, y) fmin(float(x), float(y))
+#define fmodf(x, y) fmod(float(x), float(y))
+#define sinhf(x) sinh(float(x))
+#define coshf(x) cosh(float(x))
+#define tanhf(x) tanh(float(x))
+
+/* Use native functions with possibly lower precision for performance;
+ * no issues found so far. */
+#define trigmode fast
+#define sinf(x) trigmode::sin(float(x))
+#define cosf(x) trigmode::cos(float(x))
+#define tanf(x) trigmode::tan(float(x))
+#define expf(x) trigmode::exp(float(x))
+#define sqrtf(x) trigmode::sqrt(float(x))
+#define logf(x) trigmode::log(float(x))
+
+#define NULL 0
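Unlike CUDA and HIP, Metal requires an explicit address space on every pointer, which is what ccl_global, ccl_private and ccl_gpu_shared resolve to above. A minimal sketch of how one portable signature is seen by the different compilers (editorial; copy_value_example is an invented name):

/* Illustrative only: a portable helper written against the macros above. */
ccl_device void copy_value_example(ccl_global float *dst, ccl_private const float *src)
{
  *dst = *src;
}

/* After preprocessing, Metal sees:
 *   void copy_value_example(device float *dst, thread const float *src)
 * while CUDA/OptiX, where ccl_global and ccl_private are empty, see:
 *   __device__ __forceinline__ void copy_value_example(float *dst, const float *src)
 */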
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
new file mode 100644
index 00000000000..d27b7d55475
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define OPTIX_DONT_INCLUDE_CUDA
+#include <optix.h>
+
+#define __KERNEL_GPU__
+#define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */
+#define __KERNEL_OPTIX__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
+#endif
+
+/* Manual definitions so we can compile without CUDA toolkit. */
+
+#ifdef __CUDACC_RTC__
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+#else
+# include <stdint.h>
+#endif
+
+#ifdef CYCLES_CUBIN_CC
+# define FLT_MIN 1.175494350822287507969e-38f
+# define FLT_MAX 340282346638528859811704183484516925440.0f
+# define FLT_EPSILON 1.192092896e-07F
+#endif
+
+#define ccl_device \
+  __device__ __forceinline__  // Function calls are bad for OptiX performance, so inline everything
+#define ccl_device_inline ccl_device
+#define ccl_device_forceinline ccl_device
+#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_global
+#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
+#define ccl_constant const
+#define ccl_gpu_shared __shared__
+#define ccl_private
+#define ccl_may_alias
+#define ccl_restrict __restrict__
+#define ccl_loop_no_unroll
+#define ccl_align(n) __align__(n)
+
+/* Zero-initialize structs to help the compiler figure out scoping. */
+#define ccl_optional_struct_init = {}
+
+/* No assert supported for CUDA */
+
+#define kernel_assert(cond)
+
+/* GPU thread, block, grid size and index */
+
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
+
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
+
+/* GPU warp synchronization. */
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y)
+{
+  return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y,
+                                                    const float z)
+{
+  return tex3D<T>(texobj, x, y, z);
+}
+
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+  half val;
+  asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+  return val;
+}
+
+__device__ float __half2float(const half h)
+{
+  float val;
+  asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
+  return val;
+}
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
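The warp-synchronization wrappers above map one-to-one onto the mask-taking CUDA intrinsics, always with all 32 lanes enabled (0xFFFFFFFF). A hedged sketch of the reduction pattern these primitives exist for (editorial; warp_reduce_sum_example is an invented name, and it assumes every lane in the warp is active):

/* Illustrative only: sum a value across a full warp with the wrappers above. */
ccl_device_forceinline float warp_reduce_sum_example(float value)
{
  /* Halve the shuffle distance each step: 16, 8, 4, 2, 1. */
  for (int delta = ccl_gpu_warp_size / 2; delta > 0; delta /= 2) {
    value += ccl_gpu_shfl_down_sync(0xFFFFFFFF, value, delta);
  }
  return value; /* Lane 0 now holds the warp total. */
}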
diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h
new file mode 100644
index 00000000000..7b8ebfe50e6
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/globals.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used; just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobalsGPU {
+  int unused[1];
+};
+typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
+
+/* Launch parameters */
+struct KernelParamsOptiX {
+  /* Kernel arguments */
+  const int *path_index_array;
+  float *render_buffer;
+
+  /* Global scene data and textures */
+  KernelData data;
+#define KERNEL_TEX(type, name) const type *name;
+#include "kernel/kernel_textures.h"
+
+  /* Integrator state */
+  IntegratorStateGPU __integrator_state;
+};
+
+#ifdef __NVCC__
+extern "C" static __constant__ KernelParamsOptiX __params;
+#endif
+
+/* Abstraction macros */
+#define kernel_data __params.data
+#define kernel_tex_array(t) __params.t
+#define kernel_tex_fetch(t, index) __params.t[(index)]
+#define kernel_integrator_state __params.__integrator_state
+
+CCL_NAMESPACE_END
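Where HIP declares one __constant__ variable per texture, optix/globals.h packs everything (scene data, per-texture pointers and integrator state) into the single KernelParamsOptiX launch-parameter struct, and the abstraction macros route every access through __params. A sketch of what that means for a lookup (editorial; fetch_example is an invented name, while __tri_verts is a real texture slot used later in this patch):

/* Illustrative only: on OptiX, kernel_tex_fetch(__tri_verts, i) expands to
 * __params.__tri_verts[i], a read through the launch-parameter struct that
 * the host fills in before each optixLaunch. */
ccl_device float4 fetch_example(const int i)
{
  return kernel_tex_fetch(__tri_verts, i);
}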
diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu
new file mode 100644
index 00000000000..a3bafb9846c
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -0,0 +1,406 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// clang-format off
+#include "kernel/device/optix/compat.h"
+#include "kernel/device/optix/globals.h"
+
+#include "kernel/device/gpu/image.h" /* Texture lookup uses normal CUDA intrinsics. */
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+#include "kernel/integrator/integrator_state_util.h"
+
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+
+// clang-format on
+
+template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
+{
+  return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1());
+}
+template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
+{
+  return pointer_unpack_from_uint<T>(optixGetPayload_2(), optixGetPayload_3());
+}
+
+ccl_device_forceinline int get_object_id()
+{
+#ifdef __OBJECT_MOTION__
+  /* Always get the instance ID from the TLAS.
+   * There might be a motion transform node between TLAS and BLAS which does not have one. */
+  return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
+#else
+  return optixGetInstanceId();
+#endif
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+                                                       global_index;
+  integrator_intersect_closest(nullptr, path_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+                                                       global_index;
+  integrator_intersect_shadow(nullptr, path_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_subsurface()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+                                                       global_index;
+  integrator_intersect_subsurface(nullptr, path_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_stack()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+                                                       global_index;
+  integrator_intersect_volume_stack(nullptr, path_index);
+}
+
+extern "C" __global__ void __miss__kernel_optix_miss()
+{
+  /* 'kernel_path_lamp_emission' checks intersection distance, so we need to set it even on a
+   * miss. */
+  optixSetPayload_0(__float_as_uint(optixGetRayTmax()));
+  optixSetPayload_5(PRIMITIVE_NONE);
+}
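OptiX payload slots are 32-bit registers, so the get_payload_ptr_* helpers above reassemble a 64-bit pointer from two of them via pointer_unpack_from_uint. A reconstruction of the underlying packing scheme (an editorial sketch with invented names; the real helpers live in the Cycles utility headers):

#include <cstdint>

/* Split a 64-bit pointer into two 32-bit halves that fit in payload slots... */
static inline uint32_t pointer_pack_lo_example(void *ptr)
{
  return (uint32_t)((uintptr_t)ptr & 0xFFFFFFFFu);
}

static inline uint32_t pointer_pack_hi_example(void *ptr)
{
  return (uint32_t)((uintptr_t)ptr >> 32);
}

/* ...and reassemble them on the other side of the trace call. */
template<typename T> static inline T *pointer_unpack_example(uint32_t lo, uint32_t hi)
{
  return (T *)(((uint64_t)hi << 32) | (uint64_t)lo);
}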
+
+extern "C" __global__ void __anyhit__kernel_optix_local_hit()
+{
+#ifdef __HAIR__
+  if (!optixIsTriangleHit()) {
+    /* Ignore curves. */
+    return optixIgnoreIntersection();
+  }
+#endif
+
+#ifdef __BVH_LOCAL__
+  const int object = get_object_id();
+  if (object != optixGetPayload_4() /* local_object */) {
+    /* Only intersect with matching object. */
+    return optixIgnoreIntersection();
+  }
+
+  const uint max_hits = optixGetPayload_5();
+  if (max_hits == 0) {
+    /* Special case when no hit information is requested: just report that something was hit. */
+    optixSetPayload_5(true);
+    return optixTerminateRay();
+  }
+
+  int hit = 0;
+  uint *const lcg_state = get_payload_ptr_0<uint>();
+  LocalIntersection *const local_isect = get_payload_ptr_2<LocalIntersection>();
+
+  if (lcg_state) {
+    for (int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
+      if (optixGetRayTmax() == local_isect->hits[i].t) {
+        return optixIgnoreIntersection();
+      }
+    }
+
+    hit = local_isect->num_hits++;
+
+    if (local_isect->num_hits > max_hits) {
+      hit = lcg_step_uint(lcg_state) % local_isect->num_hits;
+      if (hit >= max_hits) {
+        return optixIgnoreIntersection();
+      }
+    }
+  }
+  else {
+    if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
+      /* Record closest intersection only.
+       * Do not terminate ray here, since there is no guarantee about distance ordering in any-hit.
+       */
+      return optixIgnoreIntersection();
+    }
+
+    local_isect->num_hits = 1;
+  }
+
+  const int prim = optixGetPrimitiveIndex();
+
+  Intersection *isect = &local_isect->hits[hit];
+  isect->t = optixGetRayTmax();
+  isect->prim = prim;
+  isect->object = get_object_id();
+  isect->type = kernel_tex_fetch(__objects, isect->object).primitive_type;
+
+  const float2 barycentrics = optixGetTriangleBarycentrics();
+  isect->u = 1.0f - barycentrics.y - barycentrics.x;
+  isect->v = barycentrics.x;
+
+  /* Record geometric normal. */
+  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
+  const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0));
+  const float3 tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1));
+  const float3 tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
+  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
+
+  /* Continue tracing (without this the trace call would return after the first hit). */
+  optixIgnoreIntersection();
+#endif
+}
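The lcg_state branch of __anyhit__kernel_optix_local_hit above is reservoir sampling: once the buffer is full, hit number n claims a random slot with probability max_hits / n, so every hit along the ray has an equal chance of being among those kept. The same scheme in isolation (an editorial sketch; lcg_next_example stands in for Cycles' lcg_step_uint and uses illustrative LCG constants):

#include <cstdint>

/* Stand-in linear congruential generator. */
static inline uint32_t lcg_next_example(uint32_t *state)
{
  *state = *state * 1664525u + 1013904223u;
  return *state;
}

/* Keep a uniform random subset of `capacity` items from a stream of unknown
 * length: item n lands in a surviving slot with probability capacity / n. */
static inline void reservoir_record_example(
    const int item, int *reservoir, const int capacity, int *num_seen, uint32_t *rng_state)
{
  int slot = (*num_seen)++;
  if (*num_seen > capacity) {
    slot = (int)(lcg_next_example(rng_state) % (uint32_t)*num_seen);
    if (slot >= capacity) {
      return; /* Drop this item. */
    }
  }
  reservoir[slot] = item;
}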
+
+extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
+{
+#ifdef __SHADOW_RECORD_ALL__
+  int prim = optixGetPrimitiveIndex();
+  const uint object = get_object_id();
+# ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+    return optixIgnoreIntersection();
+  }
+# endif
+
+  float u = 0.0f, v = 0.0f;
+  int type = 0;
+  if (optixIsTriangleHit()) {
+    const float2 barycentrics = optixGetTriangleBarycentrics();
+    u = 1.0f - barycentrics.y - barycentrics.x;
+    v = barycentrics.x;
+    type = kernel_tex_fetch(__objects, object).primitive_type;
+  }
+# ifdef __HAIR__
+  else {
+    u = __uint_as_float(optixGetAttribute_0());
+    v = __uint_as_float(optixGetAttribute_1());
+
+    const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
+    type = segment.type;
+    prim = segment.prim;
+
+    /* Filter out curve endcaps. */
+    if (u == 0.0f || u == 1.0f) {
+      return optixIgnoreIntersection();
+    }
+  }
+# endif
+
+# ifndef __TRANSPARENT_SHADOWS__
+  /* No transparent shadow support compiled in, so treat the surface as opaque. */
+  optixSetPayload_5(true);
+  return optixTerminateRay();
+# else
+  const uint max_hits = optixGetPayload_3();
+  const uint num_hits_packed = optixGetPayload_2();
+  const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed);
+  const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed);
+
+  /* If no transparent shadows, all light is blocked and we can stop immediately. */
+  if (num_hits >= max_hits ||
+      !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
+    optixSetPayload_5(true);
+    return optixTerminateRay();
+  }
+
+  /* Always use baked shadow transparency for curves. */
+  if (type & PRIMITIVE_ALL_CURVE) {
+    float throughput = __uint_as_float(optixGetPayload_1());
+    throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u);
+    optixSetPayload_1(__float_as_uint(throughput));
+    optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1));
+
+    if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
+      optixSetPayload_4(true);
+      return optixTerminateRay();
+    }
+    else {
+      /* Continue tracing. */
+      optixIgnoreIntersection();
+      return;
+    }
+  }
+
+  /* Record transparent intersection. */
+  optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1));
+
+  uint record_index = num_recorded_hits;
+
+  const IntegratorShadowState state = optixGetPayload_0();
+
+  const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+  if (record_index >= max_record_hits) {
+    /* If the maximum number of hits is reached, find a hit to replace. */
+    float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
+    uint max_recorded_hit = 0;
+
+    for (int i = 1; i < max_record_hits; i++) {
+      const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
+      if (isect_t > max_recorded_t) {
+        max_recorded_t = isect_t;
+        max_recorded_hit = i;
+      }
+    }
+
+    if (optixGetRayTmax() >= max_recorded_t) {
+      /* Accept the hit, so that OptiX won't consider any further hits beyond
+       * the distance of the current one. */
+      return;
+    }
+
+    record_index = max_recorded_hit;
+  }
+
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax();
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
+
+  /* Continue tracing. */
+  optixIgnoreIntersection();
+# endif /* __TRANSPARENT_SHADOWS__ */
+#endif /* __SHADOW_RECORD_ALL__ */
+}
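When the shadow intersection array fills up, the code above keeps the nearest hits: a new hit overwrites whichever recorded hit is farthest, and a candidate at or beyond that farthest distance is accepted instead of ignored, so OptiX clamps the ray extent and stops reporting anything farther. The replacement policy in isolation (an editorial sketch; record_nearest_example is an invented name):

/* Maintain the `capacity` nearest hit distances seen so far. Returns true
 * when traversal beyond `t` can be cut off (the hit is accepted rather than
 * ignored). */
static inline bool record_nearest_example(const float t,
                                          float *hits,
                                          const int capacity,
                                          int *num_recorded)
{
  if (*num_recorded < capacity) {
    hits[(*num_recorded)++] = t; /* Room left: just record the hit. */
    return false;                /* Keep traversing the full ray extent. */
  }

  /* Buffer full: locate the farthest recorded hit. */
  int farthest = 0;
  for (int i = 1; i < capacity; i++) {
    if (hits[i] > hits[farthest]) {
      farthest = i;
    }
  }

  if (t >= hits[farthest]) {
    return true; /* Nothing at or beyond the farthest kept hit is needed. */
  }

  hits[farthest] = t; /* Replace the farthest hit with this nearer one. */
  return false;
}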
+
+extern "C" __global__ void __anyhit__kernel_optix_volume_test()
+{
+#ifdef __HAIR__
+  if (!optixIsTriangleHit()) {
+    /* Ignore curves. */
+    return optixIgnoreIntersection();
+  }
+#endif
+
+  const uint object = get_object_id();
+#ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+    return optixIgnoreIntersection();
+  }
+#endif
+
+  if ((kernel_tex_fetch(__object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) {
+    return optixIgnoreIntersection();
+  }
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
+{
+#ifdef __HAIR__
+  if (!optixIsTriangleHit()) {
+    /* Filter out curve endcaps. */
+    const float u = __uint_as_float(optixGetAttribute_0());
+    if (u == 0.0f || u == 1.0f) {
+      return optixIgnoreIntersection();
+    }
+  }
+#endif
+
+#ifdef __VISIBILITY_FLAG__
+  const uint object = get_object_id();
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+    return optixIgnoreIntersection();
+  }
+
+  /* Shadow ray early termination. */
+  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
+    return optixTerminateRay();
+  }
+#endif
+}
+
+extern "C" __global__ void __closesthit__kernel_optix_hit()
+{
+  const int object = get_object_id();
+  const int prim = optixGetPrimitiveIndex();
+
+  optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */
+  optixSetPayload_4(object);
+
+  if (optixIsTriangleHit()) {
+    const float2 barycentrics = optixGetTriangleBarycentrics();
+    optixSetPayload_1(__float_as_uint(1.0f - barycentrics.y - barycentrics.x));
+    optixSetPayload_2(__float_as_uint(barycentrics.x));
+    optixSetPayload_3(prim);
+    optixSetPayload_5(kernel_tex_fetch(__objects, object).primitive_type);
+  }
+  else {
+    const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
+    optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */
+    optixSetPayload_2(optixGetAttribute_1());
+    optixSetPayload_3(segment.prim);
+    optixSetPayload_5(segment.type);
+  }
+}
+
+#ifdef __HAIR__
+ccl_device_inline void optix_intersection_curve(const int prim, const int type)
+{
+  const int object = get_object_id();
+
+# ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+    return;
+  }
+# endif
+
+  float3 P = optixGetObjectRayOrigin();
+  float3 dir = optixGetObjectRayDirection();
+
+  /* The direction is not normalized by default, but the curve intersection routine expects a
+   * normalized direction. */
+  float len;
+  dir = normalize_len(dir, &len);
+
+# ifdef __OBJECT_MOTION__
+  const float time = optixGetRayTime();
+# else
+  const float time = 0.0f;
+# endif
+
+  Intersection isect;
+  isect.t = optixGetRayTmax();
+  /* Transform maximum distance into object space. */
+  if (isect.t != FLT_MAX)
+    isect.t *= len;
+
+  if (curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+    optixReportIntersection(isect.t / len,
+                            type & PRIMITIVE_ALL,
+                            __float_as_int(isect.u), /* Attribute_0 */
+                            __float_as_int(isect.v)); /* Attribute_1 */
+  }
+}
+
+extern "C" __global__ void __intersection__curve_ribbon()
+{
+  const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, optixGetPrimitiveIndex());
+  const int prim = segment.prim;
+  const int type = segment.type;
+  if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
+    optix_intersection_curve(prim, type);
+  }
+}
+#endif
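optix_intersection_curve() above converts the ray parameter between two parameterizations of the same ray: OptiX measures t along the unnormalized object-space direction, while curve_intersect() expects a unit direction, so t is multiplied by len going in and divided by len when the hit is reported. The identity behind the two conversions (an editorial sketch; the function names are invented):

/* With len = |dir| and unit = dir / len, the same point satisfies
 *   P + t_optix * dir == P + (t_optix * len) * unit
 * so converting the ray parameter is a single multiply or divide. */
static inline float optix_t_to_unit_dir_t(const float t_optix, const float len)
{
  return t_optix * len; /* Applied before calling curve_intersect(). */
}

static inline float unit_dir_t_to_optix_t(const float t_unit, const float len)
{
  return t_unit / len; /* Applied when reporting the hit back to OptiX. */
}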
diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
new file mode 100644
index 00000000000..bf787e29eaa
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2021, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Copy of the regular kernels with an additional shader ray-tracing kernel that takes
+ * much longer to compile. This is only loaded when needed by the scene. */
+
+#include "kernel/device/optix/kernel.cu"
+#include "kernel/integrator/integrator_shade_surface.h"
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytrace()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+                                                       global_index;
+  integrator_shade_surface_raytrace(nullptr, path_index, __params.render_buffer);
+}
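Every __raygen__ entry point in these files, including the one above, resolves its work item the same way: the launch index addresses the path state directly unless the host supplied path_index_array, in which case it selects from that sorted or compacted list of active paths. The idiom in isolation (an editorial sketch; resolve_path_index_example is an invented name):

/* Map a launch index to a path-state index, optionally through a
 * host-provided remap table of active paths. */
static inline int resolve_path_index_example(const int *path_index_array, const int global_index)
{
  return path_index_array ? path_index_array[global_index] : global_index;
}

Keeping the remap optional lets the same kernels serve both a dense launch over every path and a launch over a subset selected by the host-side scheduler.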