diff options
author | Michael Jones <michael_p_jones@apple.com> | 2021-11-18 16:25:05 +0300 |
---|---|---|
committer | Brecht Van Lommel <brecht@blender.org> | 2021-11-18 16:38:02 +0300 |
commit | d19e35873f67c90b251ca38e007a83aa1eada211 (patch) | |
tree | 7e8659acea7f12b188077ada225e113b5df35e60 /intern/cycles/kernel | |
parent | c0d52db783eb3a6288c9af04298b2358fec76357 (diff) |
Cycles: several small fixes and additions for MSL
This patch contains many small leftover fixes and additions that are
required for Metal-enablement:
- Address space fixes and a few other small compile fixes
- Addition of missing functionality to the Metal adapter headers
- Addition of various scattered `__KERNEL_METAL__` blocks (e.g. for
atomic support & maths functions)
Ref T92212
Differential Revision: https://developer.blender.org/D13263
Diffstat (limited to 'intern/cycles/kernel')
-rw-r--r-- | intern/cycles/kernel/bvh/util.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/device/cuda/compat.h | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/device/gpu/kernel.h | 4 | ||||
-rw-r--r-- | intern/cycles/kernel/device/gpu/parallel_active_index.h | 4 | ||||
-rw-r--r-- | intern/cycles/kernel/device/hip/compat.h | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/device/metal/compat.h | 10 | ||||
-rw-r--r-- | intern/cycles/kernel/device/metal/globals.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/device/optix/compat.h | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/film/accumulate.h | 3 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/attribute.h | 7 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/subd_triangle.h | 7 | ||||
-rw-r--r-- | intern/cycles/kernel/sample/lcg.h | 6 | ||||
-rw-r--r-- | intern/cycles/kernel/sample/pattern.h | 13 | ||||
-rw-r--r-- | intern/cycles/kernel/svm/svm.h | 2 |
14 files changed, 28 insertions, 35 deletions
diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h index 8686f887021..26ba136dd79 100644 --- a/intern/cycles/kernel/bvh/util.h +++ b/intern/cycles/kernel/bvh/util.h @@ -97,7 +97,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection * swapped = false; for (int j = 0; j < num_hits - 1; ++j) { if (hits[j].t > hits[j + 1].t) { - struct Intersection tmp_hit = hits[j]; + Intersection tmp_hit = hits[j]; float3 tmp_Ng = Ng[j]; hits[j] = hits[j + 1]; Ng[j] = Ng[j + 1]; diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h index ba3aefa43bf..7f901510329 100644 --- a/intern/cycles/kernel/device/cuda/compat.h +++ b/intern/cycles/kernel/device/cuda/compat.h @@ -86,7 +86,6 @@ typedef unsigned long long uint64_t; #define ccl_gpu_syncthreads() __syncthreads() #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla) -#define ccl_gpu_popc(x) __popc(x) /* GPU texture objects */ diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index dd0c6dd6893..60332af752c 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -464,7 +464,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const auto num_active_pixels_mask = ccl_gpu_ballot(!converged); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { - atomic_fetch_and_add_uint32(num_active_pixels, ccl_gpu_popc(num_active_pixels_mask)); + atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask)); } } @@ -892,6 +892,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const auto can_split_mask = ccl_gpu_ballot(can_split); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { - atomic_fetch_and_add_uint32(num_possible_splits, ccl_gpu_popc(can_split_mask)); + atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask)); } } diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index f667ede2712..a5320edcb3c 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -85,8 +85,8 @@ __device__ void gpu_parallel_active_index_array(const uint num_states, const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; /* For each thread within a warp compute how many other active states precede it. */ - const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & - ccl_gpu_thread_mask(thread_warp)); + const uint thread_offset = popcount(ccl_gpu_ballot(is_active) & + ccl_gpu_thread_mask(thread_warp)); /* Last thread in warp stores number of active states for each warp. */ if (thread_warp == ccl_gpu_warp_size - 1) { diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h index b58179e12ff..39bf2131c22 100644 --- a/intern/cycles/kernel/device/hip/compat.h +++ b/intern/cycles/kernel/device/hip/compat.h @@ -85,7 +85,6 @@ typedef unsigned long long uint64_t; #define ccl_gpu_syncthreads() __syncthreads() #define ccl_gpu_ballot(predicate) __ballot(predicate) #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla) -#define ccl_gpu_popc(x) __popc(x) /* GPU texture objects */ typedef hipTextureObject_t ccl_gpu_tex_object; diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index 19358e063d8..080109e3b83 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -34,6 +34,7 @@ using namespace metal; #pragma clang diagnostic ignored "-Wunused-variable" #pragma clang diagnostic ignored "-Wsign-compare" +#pragma clang diagnostic ignored "-Wuninitialized" /* Qualifiers */ @@ -65,7 +66,7 @@ using namespace metal; #define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1) #define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate))) -#define ccl_gpu_popc(x) popcount(x) +#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup); // clang-format off @@ -124,7 +125,6 @@ kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \ uint simd_group_index [[simdgroup_index_in_threadgroup]], \ uint num_simd_groups [[simdgroups_per_threadgroup]]) { \ MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \ - INIT_DEBUG_BUFFER \ params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \ } \ void kernel_gpu_##name::run(thread MetalKernelContext& context, \ @@ -230,6 +230,7 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \ #define sinhf(x) sinh(float(x)) #define coshf(x) cosh(float(x)) #define tanhf(x) tanh(float(x)) +#define saturatef(x) saturate(float(x)) /* Use native functions with possibly lower precision for performance, * no issues found so far. */ @@ -243,6 +244,8 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \ #define NULL 0 +#define __device__ + /* texture bindings and sampler setup */ struct Texture2DParamsMetal { @@ -257,6 +260,9 @@ struct MetalAncillaries { device Texture3DParamsMetal *textures_3d; }; +#include "util/half.h" +#include "util/types.h" + enum SamplerType { SamplerFilterNearest_AddressRepeat, SamplerFilterNearest_AddressClampEdge, diff --git a/intern/cycles/kernel/device/metal/globals.h b/intern/cycles/kernel/device/metal/globals.h index b4963518b63..1aea36589d0 100644 --- a/intern/cycles/kernel/device/metal/globals.h +++ b/intern/cycles/kernel/device/metal/globals.h @@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN typedef struct KernelParamsMetal { -#define KERNEL_TEX(type, name) ccl_constant type *name; +#define KERNEL_TEX(type, name) ccl_global const type *name; #include "kernel/textures.h" #undef KERNEL_TEX diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h index c7a7be7309a..bebb1e458eb 100644 --- a/intern/cycles/kernel/device/optix/compat.h +++ b/intern/cycles/kernel/device/optix/compat.h @@ -87,7 +87,6 @@ typedef unsigned long long uint64_t; #define ccl_gpu_syncthreads() __syncthreads() #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla) -#define ccl_gpu_popc(x) __popc(x) /* GPU texture objects */ diff --git a/intern/cycles/kernel/film/accumulate.h b/intern/cycles/kernel/film/accumulate.h index c9303088e3f..9ee0d27cc8c 100644 --- a/intern/cycles/kernel/film/accumulate.h +++ b/intern/cycles/kernel/film/accumulate.h @@ -160,7 +160,8 @@ ccl_device_inline int kernel_accum_sample(KernelGlobals kg, ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer); - return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1) + + return atomic_fetch_and_add_uint32( + (ccl_global uint *)(buffer) + kernel_data.film.pass_sample_count, 1) + sample_offset; } diff --git a/intern/cycles/kernel/geom/attribute.h b/intern/cycles/kernel/geom/attribute.h index ae96e7b76ef..a7ac2bd926f 100644 --- a/intern/cycles/kernel/geom/attribute.h +++ b/intern/cycles/kernel/geom/attribute.h @@ -27,7 +27,12 @@ CCL_NAMESPACE_BEGIN * Lookup of attributes is different between OSL and SVM, as OSL is ustring * based while for SVM we use integer ids. */ -ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd); +/* Patch index for triangle, -1 if not subdivision triangle */ + +ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd) +{ + return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0; +} ccl_device_inline uint attribute_primitive_type(KernelGlobals kg, ccl_private const ShaderData *sd) { diff --git a/intern/cycles/kernel/geom/subd_triangle.h b/intern/cycles/kernel/geom/subd_triangle.h index e3b5c9afb91..1b693a915bf 100644 --- a/intern/cycles/kernel/geom/subd_triangle.h +++ b/intern/cycles/kernel/geom/subd_triangle.h @@ -20,13 +20,6 @@ CCL_NAMESPACE_BEGIN -/* Patch index for triangle, -1 if not subdivision triangle */ - -ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd) -{ - return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0; -} - /* UV coords of triangle within patch */ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals kg, diff --git a/intern/cycles/kernel/sample/lcg.h b/intern/cycles/kernel/sample/lcg.h index 92cfff639b4..e8c4915813e 100644 --- a/intern/cycles/kernel/sample/lcg.h +++ b/intern/cycles/kernel/sample/lcg.h @@ -19,14 +19,16 @@ CCL_NAMESPACE_BEGIN /* Linear Congruential Generator */ -ccl_device uint lcg_step_uint(uint *rng) +/* This is templated to handle multiple address spaces on Metal. */ +template<class T> ccl_device uint lcg_step_uint(T rng) { /* implicit mod 2^32 */ *rng = (1103515245 * (*rng) + 12345); return *rng; } -ccl_device float lcg_step_float(uint *rng) +/* This is templated to handle multiple address spaces on Metal. */ +template<class T> ccl_device float lcg_step_float(T rng) { /* implicit mod 2^32 */ *rng = (1103515245 * (*rng) + 12345); diff --git a/intern/cycles/kernel/sample/pattern.h b/intern/cycles/kernel/sample/pattern.h index 0c27992c7f6..adc8493badd 100644 --- a/intern/cycles/kernel/sample/pattern.h +++ b/intern/cycles/kernel/sample/pattern.h @@ -163,18 +163,7 @@ ccl_device_inline bool sample_is_even(int pattern, int sample) /* See Section 10.2.1, "Progressive Multi-Jittered Sample Sequences", Christensen et al. * We can use this to get divide sample sequence into two classes for easier variance * estimation. */ -#if defined(__GNUC__) && !defined(__KERNEL_GPU__) - return __builtin_popcount(sample & 0xaaaaaaaa) & 1; -#elif defined(__NVCC__) - return __popc(sample & 0xaaaaaaaa) & 1; -#else - /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */ - int i = sample & 0xaaaaaaaa; - i = i - ((i >> 1) & 0x55555555); - i = (i & 0x33333333) + ((i >> 2) & 0x33333333); - i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; - return i & 1; -#endif + return popcount(uint(sample) & 0xaaaaaaaa) & 1; } else { /* TODO(Stefan): Are there reliable ways of dividing CMJ and Sobol into two classes? */ diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index 62ba5bf04e3..ce32e1a520f 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -220,7 +220,7 @@ CCL_NAMESPACE_BEGIN template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState> ccl_device void svm_eval_nodes(KernelGlobals kg, ConstIntegratorGenericState state, - ShaderData *sd, + ccl_private ShaderData *sd, ccl_global float *render_buffer, uint32_t path_flag) { |