Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Jones <michael_p_jones@apple.com>2021-11-18 16:25:05 +0300
committerBrecht Van Lommel <brecht@blender.org>2021-11-18 16:38:02 +0300
commitd19e35873f67c90b251ca38e007a83aa1eada211 (patch)
tree7e8659acea7f12b188077ada225e113b5df35e60 /intern/cycles/kernel
parentc0d52db783eb3a6288c9af04298b2358fec76357 (diff)
Cycles: several small fixes and additions for MSL
This patch contains many small leftover fixes and additions that are required for Metal-enablement: - Address space fixes and a few other small compile fixes - Addition of missing functionality to the Metal adapter headers - Addition of various scattered `__KERNEL_METAL__` blocks (e.g. for atomic support & maths functions) Ref T92212 Differential Revision: https://developer.blender.org/D13263
Diffstat (limited to 'intern/cycles/kernel')
-rw-r--r--intern/cycles/kernel/bvh/util.h2
-rw-r--r--intern/cycles/kernel/device/cuda/compat.h1
-rw-r--r--intern/cycles/kernel/device/gpu/kernel.h4
-rw-r--r--intern/cycles/kernel/device/gpu/parallel_active_index.h4
-rw-r--r--intern/cycles/kernel/device/hip/compat.h1
-rw-r--r--intern/cycles/kernel/device/metal/compat.h10
-rw-r--r--intern/cycles/kernel/device/metal/globals.h2
-rw-r--r--intern/cycles/kernel/device/optix/compat.h1
-rw-r--r--intern/cycles/kernel/film/accumulate.h3
-rw-r--r--intern/cycles/kernel/geom/attribute.h7
-rw-r--r--intern/cycles/kernel/geom/subd_triangle.h7
-rw-r--r--intern/cycles/kernel/sample/lcg.h6
-rw-r--r--intern/cycles/kernel/sample/pattern.h13
-rw-r--r--intern/cycles/kernel/svm/svm.h2
14 files changed, 28 insertions, 35 deletions
diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h
index 8686f887021..26ba136dd79 100644
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -97,7 +97,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection *
swapped = false;
for (int j = 0; j < num_hits - 1; ++j) {
if (hits[j].t > hits[j + 1].t) {
- struct Intersection tmp_hit = hits[j];
+ Intersection tmp_hit = hits[j];
float3 tmp_Ng = Ng[j];
hits[j] = hits[j + 1];
Ng[j] = Ng[j + 1];
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
index ba3aefa43bf..7f901510329 100644
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -86,7 +86,6 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_syncthreads() __syncthreads()
#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
-#define ccl_gpu_popc(x) __popc(x)
/* GPU texture objects */
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index dd0c6dd6893..60332af752c 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -464,7 +464,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
if (lane_id == 0) {
- atomic_fetch_and_add_uint32(num_active_pixels, ccl_gpu_popc(num_active_pixels_mask));
+ atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask));
}
}
@@ -892,6 +892,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const auto can_split_mask = ccl_gpu_ballot(can_split);
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
if (lane_id == 0) {
- atomic_fetch_and_add_uint32(num_possible_splits, ccl_gpu_popc(can_split_mask));
+ atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask));
}
}
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index f667ede2712..a5320edcb3c 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -85,8 +85,8 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
/* For each thread within a warp compute how many other active states precede it. */
- const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) &
- ccl_gpu_thread_mask(thread_warp));
+ const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
+ ccl_gpu_thread_mask(thread_warp));
/* Last thread in warp stores number of active states for each warp. */
if (thread_warp == ccl_gpu_warp_size - 1) {
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
index b58179e12ff..39bf2131c22 100644
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -85,7 +85,6 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_syncthreads() __syncthreads()
#define ccl_gpu_ballot(predicate) __ballot(predicate)
#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla)
-#define ccl_gpu_popc(x) __popc(x)
/* GPU texture objects */
typedef hipTextureObject_t ccl_gpu_tex_object;
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index 19358e063d8..080109e3b83 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -34,6 +34,7 @@ using namespace metal;
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wsign-compare"
+#pragma clang diagnostic ignored "-Wuninitialized"
/* Qualifiers */
@@ -65,7 +66,7 @@ using namespace metal;
#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1)
#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
-#define ccl_gpu_popc(x) popcount(x)
+#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup);
// clang-format off
@@ -124,7 +125,6 @@ kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
uint simd_group_index [[simdgroup_index_in_threadgroup]], \
uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
- INIT_DEBUG_BUFFER \
params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
} \
void kernel_gpu_##name::run(thread MetalKernelContext& context, \
@@ -230,6 +230,7 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
#define sinhf(x) sinh(float(x))
#define coshf(x) cosh(float(x))
#define tanhf(x) tanh(float(x))
+#define saturatef(x) saturate(float(x))
/* Use native functions with possibly lower precision for performance,
* no issues found so far. */
@@ -243,6 +244,8 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
#define NULL 0
+#define __device__
+
/* texture bindings and sampler setup */
struct Texture2DParamsMetal {
@@ -257,6 +260,9 @@ struct MetalAncillaries {
device Texture3DParamsMetal *textures_3d;
};
+#include "util/half.h"
+#include "util/types.h"
+
enum SamplerType {
SamplerFilterNearest_AddressRepeat,
SamplerFilterNearest_AddressClampEdge,
diff --git a/intern/cycles/kernel/device/metal/globals.h b/intern/cycles/kernel/device/metal/globals.h
index b4963518b63..1aea36589d0 100644
--- a/intern/cycles/kernel/device/metal/globals.h
+++ b/intern/cycles/kernel/device/metal/globals.h
@@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN
typedef struct KernelParamsMetal {
-#define KERNEL_TEX(type, name) ccl_constant type *name;
+#define KERNEL_TEX(type, name) ccl_global const type *name;
#include "kernel/textures.h"
#undef KERNEL_TEX
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
index c7a7be7309a..bebb1e458eb 100644
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -87,7 +87,6 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_syncthreads() __syncthreads()
#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
-#define ccl_gpu_popc(x) __popc(x)
/* GPU texture objects */
diff --git a/intern/cycles/kernel/film/accumulate.h b/intern/cycles/kernel/film/accumulate.h
index c9303088e3f..9ee0d27cc8c 100644
--- a/intern/cycles/kernel/film/accumulate.h
+++ b/intern/cycles/kernel/film/accumulate.h
@@ -160,7 +160,8 @@ ccl_device_inline int kernel_accum_sample(KernelGlobals kg,
ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
- return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
+ return atomic_fetch_and_add_uint32(
+ (ccl_global uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
sample_offset;
}
diff --git a/intern/cycles/kernel/geom/attribute.h b/intern/cycles/kernel/geom/attribute.h
index ae96e7b76ef..a7ac2bd926f 100644
--- a/intern/cycles/kernel/geom/attribute.h
+++ b/intern/cycles/kernel/geom/attribute.h
@@ -27,7 +27,12 @@ CCL_NAMESPACE_BEGIN
* Lookup of attributes is different between OSL and SVM, as OSL is ustring
* based while for SVM we use integer ids. */
-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd);
+/* Patch index for triangle, -1 if not subdivision triangle */
+
+ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+ return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
+}
ccl_device_inline uint attribute_primitive_type(KernelGlobals kg, ccl_private const ShaderData *sd)
{
diff --git a/intern/cycles/kernel/geom/subd_triangle.h b/intern/cycles/kernel/geom/subd_triangle.h
index e3b5c9afb91..1b693a915bf 100644
--- a/intern/cycles/kernel/geom/subd_triangle.h
+++ b/intern/cycles/kernel/geom/subd_triangle.h
@@ -20,13 +20,6 @@
CCL_NAMESPACE_BEGIN
-/* Patch index for triangle, -1 if not subdivision triangle */
-
-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
- return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
-}
-
/* UV coords of triangle within patch */
ccl_device_inline void subd_triangle_patch_uv(KernelGlobals kg,
diff --git a/intern/cycles/kernel/sample/lcg.h b/intern/cycles/kernel/sample/lcg.h
index 92cfff639b4..e8c4915813e 100644
--- a/intern/cycles/kernel/sample/lcg.h
+++ b/intern/cycles/kernel/sample/lcg.h
@@ -19,14 +19,16 @@ CCL_NAMESPACE_BEGIN
/* Linear Congruential Generator */
-ccl_device uint lcg_step_uint(uint *rng)
+/* This is templated to handle multiple address spaces on Metal. */
+template<class T> ccl_device uint lcg_step_uint(T rng)
{
/* implicit mod 2^32 */
*rng = (1103515245 * (*rng) + 12345);
return *rng;
}
-ccl_device float lcg_step_float(uint *rng)
+/* This is templated to handle multiple address spaces on Metal. */
+template<class T> ccl_device float lcg_step_float(T rng)
{
/* implicit mod 2^32 */
*rng = (1103515245 * (*rng) + 12345);
diff --git a/intern/cycles/kernel/sample/pattern.h b/intern/cycles/kernel/sample/pattern.h
index 0c27992c7f6..adc8493badd 100644
--- a/intern/cycles/kernel/sample/pattern.h
+++ b/intern/cycles/kernel/sample/pattern.h
@@ -163,18 +163,7 @@ ccl_device_inline bool sample_is_even(int pattern, int sample)
/* See Section 10.2.1, "Progressive Multi-Jittered Sample Sequences", Christensen et al.
* We can use this to get divide sample sequence into two classes for easier variance
* estimation. */
-#if defined(__GNUC__) && !defined(__KERNEL_GPU__)
- return __builtin_popcount(sample & 0xaaaaaaaa) & 1;
-#elif defined(__NVCC__)
- return __popc(sample & 0xaaaaaaaa) & 1;
-#else
- /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
- int i = sample & 0xaaaaaaaa;
- i = i - ((i >> 1) & 0x55555555);
- i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
- i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
- return i & 1;
-#endif
+ return popcount(uint(sample) & 0xaaaaaaaa) & 1;
}
else {
/* TODO(Stefan): Are there reliable ways of dividing CMJ and Sobol into two classes? */
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 62ba5bf04e3..ce32e1a520f 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -220,7 +220,7 @@ CCL_NAMESPACE_BEGIN
template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState>
ccl_device void svm_eval_nodes(KernelGlobals kg,
ConstIntegratorGenericState state,
- ShaderData *sd,
+ ccl_private ShaderData *sd,
ccl_global float *render_buffer,
uint32_t path_flag)
{