Diffstat (limited to 'intern/cycles/kernel/device/gpu')
-rw-r--r--  intern/cycles/kernel/device/gpu/image.h                 |   4
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h                |  14
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h | 100
3 files changed, 104 insertions, 14 deletions
diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h
index 29d851ae478..a8c72645569 100644
--- a/intern/cycles/kernel/device/gpu/image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -181,7 +181,7 @@ ccl_device_noinline typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_in
 
 ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
 {
-  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_data_fetch(texture_info, id);
 
   /* float4, byte4, ushort4 and half4 */
   const int texture_type = info.data_type;
@@ -216,7 +216,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
                                              float3 P,
                                              InterpolationType interp)
 {
-  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_data_fetch(texture_info, id);
 
   if (info.use_transform_3d) {
     P = transform_point(&info.transform_3d, P);
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index d657571a5fa..d7d2000775f 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -14,6 +14,8 @@
 
 #ifdef __KERNEL_METAL__
 #  include "kernel/device/metal/context_begin.h"
+#elif defined(__KERNEL_ONEAPI__)
+#  include "kernel/device/oneapi/context_begin.h"
 #endif
 
 #include "kernel/device/gpu/work_stealing.h"
@@ -40,6 +42,8 @@
 
 #ifdef __KERNEL_METAL__
 #  include "kernel/device/metal/context_end.h"
+#elif defined(__KERNEL_ONEAPI__)
+#  include "kernel/device/oneapi/context_end.h"
 #endif
 
 #include "kernel/film/read.h"
@@ -242,7 +246,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 ccl_gpu_kernel_postfix
 
 #if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
-constant int __dummy_constant [[function_constant(0)]];
+constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]];
 #endif
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
@@ -522,7 +526,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 
   bool converged = true;
   if (x < sw && y < sh) {
-    converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check(
+    converged = ccl_gpu_kernel_call(film_adaptive_sampling_convergence_check(
         nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
   }
 
@@ -549,7 +553,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 
   if (y < sh) {
     ccl_gpu_kernel_call(
-        kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
+        film_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
   }
 }
 ccl_gpu_kernel_postfix
@@ -568,7 +572,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 
   if (x < sw) {
     ccl_gpu_kernel_call(
-        kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
+        film_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
  }
 }
 ccl_gpu_kernel_postfix
@@ -585,7 +589,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   const int pixel_index = ccl_gpu_global_id_x();
 
   if (pixel_index < num_pixels) {
-    ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index));
+    ccl_gpu_kernel_call(film_cryptomatte_post(nullptr, render_buffer, pixel_index));
   }
 }
 ccl_gpu_kernel_postfix
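The new oneapi/context_begin.h and context_end.h includes above hook into the same mechanism the Metal backend already uses: the shared kernel sources are #included between a pair of headers that open and close a wrapper scope, so per-dispatch state can be carried without true device globals. The following is a minimal host-compilable sketch of that wrapper pattern; every name in it is hypothetical and the real Cycles headers differ per backend.

#include <cstdio>

/* context_begin.h (sketch): open a wrapper struct so that kernel functions
 * #included after this point become member functions with access to
 * per-dispatch state instead of global variables. */
struct ExampleKernelContext {
  const int *launch_params; /* stand-in for the real per-device parameters */

  /* A "kernel" function that would normally be #included between the two
   * headers; it reads state through the context rather than a global. */
  int fetch_param(int id) const
  {
    return launch_params[id];
  }

/* context_end.h (sketch): close the wrapper struct. */
};

int main()
{
  const int params[3] = {10, 20, 30};
  ExampleKernelContext ctx = {params};
  printf("%d\n", ctx.fetch_param(1)); /* prints 20 */
  return 0;
}

Metal needs this because the shading language forbids mutable program-scope variables; the oneAPI headers presumably slot into the same include points to inject their own per-kernel state.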
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index 7d7266d5edf..c1df49c4f49 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -18,15 +18,68 @@ CCL_NAMESPACE_BEGIN
 
 #  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
 #endif
 
-#ifndef __KERNEL_METAL__
+/* TODO: abstract more device differences, define ccl_gpu_local_syncthreads,
+ * ccl_gpu_thread_warp, ccl_gpu_warp_index, ccl_gpu_num_warps for all devices
+ * and keep device specific code in compat.h */
+
+#ifdef __KERNEL_ONEAPI__
+#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+template<typename IsActiveOp>
+void cpu_serial_active_index_array_impl(const uint num_states,
+                                        ccl_global int *ccl_restrict indices,
+                                        ccl_global int *ccl_restrict num_indices,
+                                        IsActiveOp is_active_op)
+{
+  int write_index = 0;
+  for (int state_index = 0; state_index < num_states; state_index++) {
+    if (is_active_op(state_index))
+      indices[write_index++] = state_index;
+  }
+  *num_indices = write_index;
+  return;
+}
+#  endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */
+
+template<typename IsActiveOp>
+void gpu_parallel_active_index_array_impl(const uint num_states,
+                                          ccl_global int *ccl_restrict indices,
+                                          ccl_global int *ccl_restrict num_indices,
+                                          IsActiveOp is_active_op)
+{
+  const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>();
+  const uint blocksize = item_id.get_local_range(0);
+
+  sycl::multi_ptr<int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1],
+                  sycl::access::address_space::local_space>
+      ptr = sycl::ext::oneapi::group_local_memory<
+          int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1]>(item_id.get_group());
+  int *warp_offset = *ptr;
+
+  /* NOTE(@nsirgien): Here we calculate the same value as below but
+   * faster for DPC++ : seems CUDA converting "%", "/", "*" based calculations below into
+   * something faster already but DPC++ doesn't, so it's better to use
+   * direct request of needed parameters - switching from this computation to computation below
+   * will cause 2.5x performance slowdown. */
+  const uint thread_index = item_id.get_local_id(0);
+  const uint thread_warp = item_id.get_sub_group().get_local_id();
+
+  const uint warp_index = item_id.get_sub_group().get_group_id();
+  const uint num_warps = item_id.get_sub_group().get_group_range()[0];
+
+  const uint state_index = item_id.get_global_id(0);
+
+  /* Test if state corresponding to this thread is active. */
+  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#else /* !__KERNEL__ONEAPI__ */
+#  ifndef __KERNEL_METAL__
 template<uint blocksize, typename IsActiveOp>
 __device__
-#endif
+#  endif
 void gpu_parallel_active_index_array_impl(const uint num_states,
                                           ccl_global int *indices,
                                           ccl_global int *num_indices,
-#ifdef __KERNEL_METAL__
+#  ifdef __KERNEL_METAL__
                                           const uint is_active,
                                           const uint blocksize,
                                           const int thread_index,
@@ -37,7 +90,7 @@ __device__
                                           const int num_warps,
                                           threadgroup int *warp_offset)
 {
-#else
+#  else
                                           IsActiveOp is_active_op)
 {
   extern ccl_gpu_shared int warp_offset[];
@@ -52,18 +105,33 @@ __device__
 
   /* Test if state corresponding to this thread is active. */
   const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
-#endif
-
+#  endif
+#endif /* !__KERNEL_ONEAPI__ */
   /* For each thread within a warp compute how many other active states precede it. */
+#ifdef __KERNEL_ONEAPI__
+  const uint thread_offset = sycl::exclusive_scan_over_group(
+      item_id.get_sub_group(), is_active, std::plus<>());
+#else
   const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
                                       ccl_gpu_thread_mask(thread_warp));
+#endif
 
   /* Last thread in warp stores number of active states for each warp. */
+#ifdef __KERNEL_ONEAPI__
+  if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) {
+#else
   if (thread_warp == ccl_gpu_warp_size - 1) {
+#endif
     warp_offset[warp_index] = thread_offset + is_active;
   }
 
+#ifdef __KERNEL_ONEAPI__
+  /* NOTE(@nsirgien): For us here only local memory writing (warp_offset) is important,
+   * so faster local barriers can be used. */
+  ccl_gpu_local_syncthreads();
+#else
   ccl_gpu_syncthreads();
+#endif
 
   /* Last thread in block converts per-warp sizes to offsets, increments global size of
    * index array and gets offset to write to. */
@@ -80,7 +148,13 @@ __device__
     warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
   }
 
+#ifdef __KERNEL_ONEAPI__
+  /* NOTE(@nsirgien): For us here only important local memory writing (warp_offset),
+   * so faster local barriers can be used. */
+  ccl_gpu_local_syncthreads();
+#else
   ccl_gpu_syncthreads();
+#endif
 
   /* Write to index array. */
   if (is_active) {
@@ -107,7 +181,19 @@ __device__
                                        simd_group_index, \
                                        num_simd_groups, \
                                        simdgroup_offset)
-
+#elif defined(__KERNEL_ONEAPI__)
+#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+#    define gpu_parallel_active_index_array( \
+        blocksize, num_states, indices, num_indices, is_active_op) \
+      if (ccl_gpu_global_size_x() == 1) \
+        cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
+      else \
+        gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
+#  else
+#    define gpu_parallel_active_index_array( \
+        blocksize, num_states, indices, num_indices, is_active_op) \
+      gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
+#  endif
 #else
 #  define gpu_parallel_active_index_array( \
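For reference, the compaction that gpu_parallel_active_index_array_impl performs above is: a per-warp exclusive scan of the active flags (popcount over a masked ballot on CUDA/HIP/Metal, sycl::exclusive_scan_over_group on oneAPI), warp totals converted to block-level offsets, one atomic add to reserve space in the output array, then a scattered write per active state. The same scheme can be emulated serially on the host; the sketch below is illustrative only, not Cycles code, and the warp/block sizes and all names in it are assumptions.

#include <cstdint>
#include <cstdio>
#include <vector>

static constexpr uint32_t kWarpSize = 32;   /* assumed warp width */
static constexpr uint32_t kBlockSize = 512; /* mirrors GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE */

template<typename IsActiveOp>
void serial_emulation_active_index_array(const uint32_t num_states,
                                         std::vector<int> &indices,
                                         int *num_indices,
                                         IsActiveOp is_active_op)
{
  const uint32_t num_warps = kBlockSize / kWarpSize;

  /* Process one "block" of states at a time, as a GPU thread block would. */
  for (uint32_t block_start = 0; block_start < num_states; block_start += kBlockSize) {
    std::vector<int> thread_offset(kBlockSize, 0);
    std::vector<int> warp_offset(num_warps, 0);

    /* Phase 1: per-warp exclusive scan of active flags (the ballot/popcount
     * or sub-group scan step), plus the per-warp total. */
    for (uint32_t w = 0; w < num_warps; w++) {
      int running = 0;
      for (uint32_t lane = 0; lane < kWarpSize; lane++) {
        const uint32_t state_index = block_start + w * kWarpSize + lane;
        const int is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
        thread_offset[w * kWarpSize + lane] = running; /* exclusive scan value */
        running += is_active;
      }
      warp_offset[w] = running; /* number of active states in this warp */
    }

    /* Phase 2: "last thread in block" converts warp sizes to offsets and
     * reserves room in the global index array (an atomic add on the GPU). */
    int block_num_active = 0;
    for (uint32_t w = 0; w < num_warps; w++) {
      const int size = warp_offset[w];
      warp_offset[w] = block_num_active;
      block_num_active += size;
    }
    const int write_base = *num_indices;
    *num_indices += block_num_active;

    /* Phase 3: every active thread writes its state index to the array. */
    for (uint32_t t = 0; t < kBlockSize; t++) {
      const uint32_t state_index = block_start + t;
      if (state_index < num_states && is_active_op(state_index)) {
        indices[write_base + warp_offset[t / kWarpSize] + thread_offset[t]] = state_index;
      }
    }
  }
}

int main()
{
  const uint32_t num_states = 100;
  std::vector<int> indices(num_states, -1);
  int num_indices = 0;
  /* Example predicate: states with an odd index are "active". */
  serial_emulation_active_index_array(
      num_states, indices, &num_indices, [](uint32_t i) { return int(i & 1); });
  printf("%d active states, first is %d\n", num_indices, indices[0]); /* 50, 1 */
  return 0;
}

On the device the three phases are separated by barriers (ccl_gpu_syncthreads, or the cheaper ccl_gpu_local_syncthreads on oneAPI since only local memory is involved); the serial loops above make those ordering points explicit.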