git.blender.org/blender.git

Diffstat (limited to 'intern/cycles/kernel/device/gpu')
-rw-r--r--  intern/cycles/kernel/device/gpu/image.h                    4
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h                  14
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h  100
3 files changed, 104 insertions, 14 deletions
diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h
index 29d851ae478..a8c72645569 100644
--- a/intern/cycles/kernel/device/gpu/image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -181,7 +181,7 @@ ccl_device_noinline typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_in
ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
{
- ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+ ccl_global const TextureInfo &info = kernel_data_fetch(texture_info, id);
/* float4, byte4, ushort4 and half4 */
const int texture_type = info.data_type;
@@ -216,7 +216,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
float3 P,
InterpolationType interp)
{
- ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+ ccl_global const TextureInfo &info = kernel_data_fetch(texture_info, id);
if (info.use_transform_3d) {
P = transform_point(&info.transform_3d, P);
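
The image.h hunks are a pure rename, from kernel_tex_fetch(__texture_info, id) to kernel_data_fetch(texture_info, id); the macro itself lives in the per-device compat headers, which are outside this diff, and may expand differently per device. As a rough sketch of the idea, assuming a plain-pointer layout (sketch_data_fetch, KernelGlobalsSketch and TextureInfoSketch are made-up names, not the real Cycles macro or types):

/* Illustrative only: substitute the requested field name as a member of the
 * globals struct and index into it. */
struct TextureInfoSketch {
  int data_type;
  /* ... further fields ... */
};

struct KernelGlobalsSketch {
  const TextureInfoSketch *texture_info; /* one entry per image slot */
};

#define sketch_data_fetch(kg, name, index) ((kg).name[(index)])

inline int texture_data_type(const KernelGlobalsSketch &kg, int id)
{
  const TextureInfoSketch &info = sketch_data_fetch(kg, texture_info, id);
  return info.data_type;
}
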
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index d657571a5fa..d7d2000775f 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -14,6 +14,8 @@
#ifdef __KERNEL_METAL__
# include "kernel/device/metal/context_begin.h"
+#elif defined(__KERNEL_ONEAPI__)
+# include "kernel/device/oneapi/context_begin.h"
#endif
#include "kernel/device/gpu/work_stealing.h"
@@ -40,6 +42,8 @@
#ifdef __KERNEL_METAL__
# include "kernel/device/metal/context_end.h"
+#elif defined(__KERNEL_ONEAPI__)
+# include "kernel/device/oneapi/context_end.h"
#endif
#include "kernel/film/read.h"
@@ -242,7 +246,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
ccl_gpu_kernel_postfix
#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
-constant int __dummy_constant [[function_constant(0)]];
+constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]];
#endif
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
@@ -522,7 +526,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
bool converged = true;
if (x < sw && y < sh) {
- converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check(
+ converged = ccl_gpu_kernel_call(film_adaptive_sampling_convergence_check(
nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
}
@@ -549,7 +553,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
if (y < sh) {
ccl_gpu_kernel_call(
- kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
+ film_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
}
}
ccl_gpu_kernel_postfix
@@ -568,7 +572,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
if (x < sw) {
ccl_gpu_kernel_call(
- kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
+ film_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
}
}
ccl_gpu_kernel_postfix
@@ -585,7 +589,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int pixel_index = ccl_gpu_global_id_x();
if (pixel_index < num_pixels) {
- ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index));
+ ccl_gpu_kernel_call(film_cryptomatte_post(nullptr, render_buffer, pixel_index));
}
}
ccl_gpu_kernel_postfix
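
The kernel.h hunks only rename the called helpers (kernel_* to film_*) and wire in the oneAPI context headers; the kernel bodies keep the usual shape of mapping one GPU thread to one pixel and bounds-checking before doing any work. A minimal serial sketch of that shape, with the loops standing in for the thread grid and convergence_check_sketch as a hypothetical stand-in for film_adaptive_sampling_convergence_check:

#include <cstdint>

/* Hypothetical stand-in for the per-pixel convergence check. */
static bool convergence_check_sketch(const float *render_buffer, int x, int y,
                                     float threshold, int offset, int stride)
{
  (void)render_buffer; (void)x; (void)y; (void)threshold; (void)offset; (void)stride;
  return true;
}

/* The nested loops stand in for ccl_gpu_global_id_x()/_y(); each iteration
 * mirrors one thread's bounds-checked body in the kernel above. */
uint32_t count_unconverged_sketch(const float *render_buffer,
                                  int sx, int sy, int sw, int sh,
                                  float threshold, int offset, int stride)
{
  uint32_t num_unconverged = 0;
  for (int y = 0; y < sh; y++) {
    for (int x = 0; x < sw; x++) {
      bool converged = true;
      if (x < sw && y < sh) { /* always true serially; required on the GPU grid */
        converged = convergence_check_sketch(render_buffer, sx + x, sy + y,
                                             threshold, offset, stride);
      }
      if (!converged) {
        num_unconverged++; /* sketch-only tally */
      }
    }
  }
  return num_unconverged;
}
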
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index 7d7266d5edf..c1df49c4f49 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
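
The bulk of the commit lands in this header: next to the existing CUDA/HIP/Metal path it adds a oneAPI (SYCL) variant of gpu_parallel_active_index_array_impl, plus a serial fallback for SYCL host execution. Every variant computes the same thing: compact the indices of all active states into indices and store their count in num_indices, using an exclusive prefix sum of the per-thread is_active flags to give each active state its write position. A serial C++ sketch of that scan-based compaction, with std::exclusive_scan standing in for the warp and block scans used on the device (names with a _sketch suffix are illustrative, not Cycles code):

/* Serial sketch of scan-based stream compaction, the operation that
 * gpu_parallel_active_index_array_impl performs in parallel. */
#include <cstdint>
#include <numeric>
#include <vector>

template<typename IsActiveOp>
int active_index_array_sketch(const uint32_t num_states, int *indices, IsActiveOp is_active_op)
{
  /* Per-state active flags (one per GPU thread in the real kernel). */
  std::vector<int> is_active(num_states);
  for (uint32_t i = 0; i < num_states; i++) {
    is_active[i] = is_active_op(i) ? 1 : 0;
  }

  /* Exclusive prefix sum gives each active state its write offset. */
  std::vector<int> offset(num_states);
  std::exclusive_scan(is_active.begin(), is_active.end(), offset.begin(), 0);

  for (uint32_t i = 0; i < num_states; i++) {
    if (is_active[i]) {
      indices[offset[i]] = (int)i;
    }
  }

  /* Number of active states. */
  return num_states ? offset[num_states - 1] + is_active[num_states - 1] : 0;
}

On the device the same sum is built in two levels, as the hunks below show: an exclusive scan of is_active within each warp / sub-group, then a scan of the per-warp totals in shared local memory, with a single atomic_fetch_and_add_uint32 on num_indices reserving the block's output range.
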
@@ -18,15 +18,68 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
-#ifndef __KERNEL_METAL__
+/* TODO: abstract more device differences, define ccl_gpu_local_syncthreads,
+ * ccl_gpu_thread_warp, ccl_gpu_warp_index, ccl_gpu_num_warps for all devices
+ * and keep device specific code in compat.h */
+
+#ifdef __KERNEL_ONEAPI__
+# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+template<typename IsActiveOp>
+void cpu_serial_active_index_array_impl(const uint num_states,
+ ccl_global int *ccl_restrict indices,
+ ccl_global int *ccl_restrict num_indices,
+ IsActiveOp is_active_op)
+{
+ int write_index = 0;
+ for (int state_index = 0; state_index < num_states; state_index++) {
+ if (is_active_op(state_index))
+ indices[write_index++] = state_index;
+ }
+ *num_indices = write_index;
+ return;
+}
+# endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */
+
+template<typename IsActiveOp>
+void gpu_parallel_active_index_array_impl(const uint num_states,
+ ccl_global int *ccl_restrict indices,
+ ccl_global int *ccl_restrict num_indices,
+ IsActiveOp is_active_op)
+{
+ const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>();
+ const uint blocksize = item_id.get_local_range(0);
+
+ sycl::multi_ptr<int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1],
+ sycl::access::address_space::local_space>
+ ptr = sycl::ext::oneapi::group_local_memory<
+ int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1]>(item_id.get_group());
+ int *warp_offset = *ptr;
+
+ /* NOTE(@nsirgien): Here we calculate the same values as the generic branch below, but
+ * in a way that is faster for DPC++: CUDA appears to already optimize the "%", "/" and
+ * "*" based calculations below into something faster, while DPC++ does not, so it is
+ * better to query the needed parameters directly. Switching from these direct queries
+ * to the computation below causes a 2.5x performance slowdown. */
+ const uint thread_index = item_id.get_local_id(0);
+ const uint thread_warp = item_id.get_sub_group().get_local_id();
+
+ const uint warp_index = item_id.get_sub_group().get_group_id();
+ const uint num_warps = item_id.get_sub_group().get_group_range()[0];
+
+ const uint state_index = item_id.get_global_id(0);
+
+ /* Test if state corresponding to this thread is active. */
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#else /* !__KERNEL_ONEAPI__ */
+# ifndef __KERNEL_METAL__
template<uint blocksize, typename IsActiveOp>
__device__
-#endif
+# endif
void
gpu_parallel_active_index_array_impl(const uint num_states,
ccl_global int *indices,
ccl_global int *num_indices,
-#ifdef __KERNEL_METAL__
+# ifdef __KERNEL_METAL__
const uint is_active,
const uint blocksize,
const int thread_index,
@@ -37,7 +90,7 @@ __device__
const int num_warps,
threadgroup int *warp_offset)
{
-#else
+# else
IsActiveOp is_active_op)
{
extern ccl_gpu_shared int warp_offset[];
@@ -52,18 +105,33 @@ __device__
/* Test if state corresponding to this thread is active. */
const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
-#endif
-
+# endif
+#endif /* !__KERNEL_ONEAPI__ */
/* For each thread within a warp compute how many other active states precede it. */
+#ifdef __KERNEL_ONEAPI__
+ const uint thread_offset = sycl::exclusive_scan_over_group(
+ item_id.get_sub_group(), is_active, std::plus<>());
+#else
const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
ccl_gpu_thread_mask(thread_warp));
+#endif
/* Last thread in warp stores number of active states for each warp. */
+#ifdef __KERNEL_ONEAPI__
+ if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) {
+#else
if (thread_warp == ccl_gpu_warp_size - 1) {
+#endif
warp_offset[warp_index] = thread_offset + is_active;
}
+#ifdef __KERNEL_ONEAPI__
+ /* NOTE(@nsirgien): Only the local memory write (warp_offset) matters here,
+ * so the faster local barrier can be used. */
+ ccl_gpu_local_syncthreads();
+#else
ccl_gpu_syncthreads();
+#endif
/* Last thread in block converts per-warp sizes to offsets, increments global size of
* index array and gets offset to write to. */
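
Both branches above compute the same per-thread offset: an exclusive prefix sum of the is_active flags within a warp / sub-group, via sycl::exclusive_scan_over_group on oneAPI and via popcount of a masked ballot on the other devices. A small host-side illustration of that equivalence for one simulated 32-wide warp; the mask convention is assumed to match ccl_gpu_thread_mask, i.e. the bits of all lower-numbered lanes:

/* Check that popcount(ballot & lower_lane_mask) equals an exclusive prefix
 * sum of the active flags within a 32-wide warp. Illustration only. */
#include <bit>
#include <cassert>
#include <cstdint>

int main()
{
  const int warp_size = 32;
  uint32_t active_flags[32];
  for (int t = 0; t < warp_size; t++) {
    active_flags[t] = (t % 3 == 0) ? 1u : 0u; /* arbitrary activity pattern */
  }

  /* ccl_gpu_ballot(is_active): one bit per active lane in the warp. */
  uint32_t ballot = 0;
  for (int t = 0; t < warp_size; t++) {
    ballot |= active_flags[t] << t;
  }

  uint32_t running_sum = 0; /* exclusive scan, as exclusive_scan_over_group produces */
  for (int t = 0; t < warp_size; t++) {
    /* Assumed mask of all lanes below lane t, like ccl_gpu_thread_mask(t). */
    const uint32_t lane_mask = (t == 0) ? 0u : (~0u >> (warp_size - t));
    const uint32_t offset_from_ballot = (uint32_t)std::popcount(ballot & lane_mask);
    assert(offset_from_ballot == running_sum);
    running_sum += active_flags[t];
  }
  return 0;
}
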
@@ -80,7 +148,13 @@ __device__
warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
}
+#ifdef __KERNEL_ONEAPI__
+ /* NOTE(@nsirgien): Only the local memory write (warp_offset) matters here,
+ * so the faster local barrier can be used. */
+ ccl_gpu_local_syncthreads();
+#else
ccl_gpu_syncthreads();
+#endif
/* Write to index array. */
if (is_active) {
@@ -107,7 +181,19 @@ __device__
simd_group_index, \
num_simd_groups, \
simdgroup_offset)
-
+#elif defined(__KERNEL_ONEAPI__)
+# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+# define gpu_parallel_active_index_array( \
+ blocksize, num_states, indices, num_indices, is_active_op) \
+ if (ccl_gpu_global_size_x() == 1) \
+ cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
+ else \
+ gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
+# else
+# define gpu_parallel_active_index_array( \
+ blocksize, num_states, indices, num_indices, is_active_op) \
+ gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
+# endif
#else
# define gpu_parallel_active_index_array( \