Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHans Goudey <h.goudey@me.com>2022-11-11 04:14:48 +0300
committerHans Goudey <h.goudey@me.com>2022-11-11 04:41:13 +0300
commit9465b109af0b70244a36f9e68493e316d9f8b56f (patch)
treec4d933bf09f8313071556063156538ec3356b24c /intern/cycles/device/metal/kernel.mm
parent026d21a225521670c6b5083da9da61227da69e65 (diff)
parentca1642cd0c5cdf634fe2022c955d93983de95934 (diff)
Merge branch 'master' into refactor-mesh-position-genericrefactor-mesh-position-generic
Diffstat (limited to 'intern/cycles/device/metal/kernel.mm')
-rw-r--r--intern/cycles/device/metal/kernel.mm61
1 files changed, 49 insertions, 12 deletions
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 55938d1a03a..dc8af9a5358 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -45,6 +45,36 @@ bool kernel_has_intersection(DeviceKernel device_kernel)
struct ShaderCache {
ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
{
+ /* Initialize occupancy tuning LUT. */
+ if (MetalInfo::get_device_vendor(mtlDevice) == METAL_GPU_APPLE) {
+ switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) {
+ default:
+ case APPLE_M2:
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024};
+ break;
+ case APPLE_M1:
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832};
+ break;
+ }
+ }
}
~ShaderCache();
@@ -73,6 +103,11 @@ struct ShaderCache {
std::function<void(MetalKernelPipeline *)> completionHandler;
};
+ struct OccupancyTuningParameters {
+ int threads_per_threadgroup = 0;
+ int num_threads_per_block = 0;
+ } occupancy_tuning[DEVICE_KERNEL_NUM];
+
std::mutex cache_mutex;
PipelineCollection pipelines[DEVICE_KERNEL_NUM];
@@ -230,6 +265,13 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
request.pipeline->device_kernel = device_kernel;
request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
+ if (occupancy_tuning[device_kernel].threads_per_threadgroup) {
+ request.pipeline->threads_per_threadgroup =
+ occupancy_tuning[device_kernel].threads_per_threadgroup;
+ request.pipeline->num_threads_per_block =
+ occupancy_tuning[device_kernel].num_threads_per_block;
+ }
+
/* metalrt options */
request.pipeline->use_metalrt = device->use_metalrt;
request.pipeline->metalrt_hair = device->use_metalrt &&
@@ -374,13 +416,6 @@ void MetalKernelPipeline::compile()
const std::string function_name = std::string("cycles_metal_") +
device_kernel_as_string(device_kernel);
- int threads_per_threadgroup = this->threads_per_threadgroup;
- if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL &&
- device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) {
- /* Always use 512 for the sorting kernels */
- threads_per_threadgroup = 512;
- }
-
NSString *entryPoint = [@(function_name.c_str()) copy];
NSError *error = NULL;
@@ -644,12 +679,14 @@ void MetalKernelPipeline::compile()
return;
}
- int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
- computePipelineState.threadExecutionWidth);
- num_threads_per_block = std::max(num_threads_per_block,
- (int)computePipelineState.threadExecutionWidth);
+ if (!num_threads_per_block) {
+ num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
+ computePipelineState.threadExecutionWidth);
+ num_threads_per_block = std::max(num_threads_per_block,
+ (int)computePipelineState.threadExecutionWidth);
+ }
+
this->pipeline = computePipelineState;
- this->num_threads_per_block = num_threads_per_block;
if (@available(macOS 11.0, *)) {
if (creating_new_archive || recreate_archive) {