git.blender.org/blender.git
 intern/cycles/device/metal/device_impl.mm        |  6
 intern/cycles/device/metal/queue.h               |  1
 intern/cycles/device/metal/queue.mm              | 21
 intern/cycles/device/metal/util.h                |  1
 intern/cycles/device/metal/util.mm               | 18
 intern/cycles/device/queue.h                     |  8
 intern/cycles/integrator/path_trace.cpp          |  2
 intern/cycles/integrator/path_trace_work_gpu.cpp | 21
 intern/cycles/integrator/path_trace_work_gpu.h   |  3
 intern/cycles/kernel/integrator/state.h          |  3
 intern/cycles/kernel/integrator/state_flow.h     | 12
 11 files changed, 82 insertions(+), 14 deletions(-)
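
What this change does: before sorting active path states by material, Cycles' Metal backend now partitions the state indices into contiguous chunks and sorts within each chunk, trading some material coherence for better integrator-state fetch locality. Per the comment added in util.mm below, chunks of 65536 elements give up to a 15% overall render time speedup on Apple M1/M2 GPUs.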
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 87c83242240..ba9317e3204 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -217,6 +217,10 @@ string MetalDevice::get_source(const uint kernel_features)
build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
}
+ if (MetalInfo::optimal_sort_partition_elements(mtlDevice) > 0) {
+ build_options += " -D__KERNEL_SORT_PARTITIONING__ ";
+ }
+
if (use_metalrt) {
build_options += "-D__METALRT__ ";
if (motion_blur) {
@@ -652,7 +656,7 @@ void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
/* Update data storage pointers in launch parameters. */
if (strcmp(name, "integrator_state") == 0) {
/* IntegratorStateGPU is contiguous pointers */
- const size_t pointer_block_size = sizeof(IntegratorStateGPU);
+ const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
update_launch_pointers(
offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
}
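
The sizeof -> offsetof change above is needed because IntegratorStateGPU no longer consists solely of device pointers: the new sort_partition_divisor member is plain data placed after the pointer block, so pointer patching must stop at its offset. A minimal sketch of the layout assumption (hypothetical, abbreviated struct; not the real Blender definition):

    #include <cstddef>

    /* A run of device pointers followed by plain-old-data, mirroring the
     * IntegratorStateGPU convention that const_copy_to relies on. */
    struct ExampleStateGPU {
      int *queue_counter;              /* device pointers ... */
      int *next_main_path_index;       /* ... last pointer in the block */
      unsigned sort_partition_divisor; /* plain data, not a pointer */
    };

    /* The pointer block ends where the first non-pointer member begins. */
    constexpr size_t pointer_block_size =
        offsetof(ExampleStateGPU, sort_partition_divisor);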
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index b0bd487c86d..836289172f7 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -24,6 +24,7 @@ class MetalDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t) const override;
virtual int num_concurrent_busy_states() const override;
+ virtual int num_sort_partitions(const size_t) const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 03e60b6bb6e..6a9cc552098 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -293,6 +293,23 @@ int MetalDeviceQueue::num_concurrent_busy_states() const
return result;
}
+int MetalDeviceQueue::num_sort_partitions(const size_t state_size) const
+{
+ /* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
+ * more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
+ */
+ if (metal_device_->launch_params.data.max_shaders >= 300) {
+ return 1;
+ }
+
+ const int optimal_partition_elements = MetalInfo::optimal_sort_partition_elements(
+ metal_device_->mtlDevice);
+ if (optimal_partition_elements) {
+ return num_concurrent_states(state_size) / optimal_partition_elements;
+ }
+ return 1;
+}
+
void MetalDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
@@ -359,7 +376,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */
/* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */
size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) +
- sizeof(IntegratorStateGPU);
+ offsetof(IntegratorStateGPU, sort_partition_divisor);
size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
(uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset,
@@ -416,7 +433,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* this relies on IntegratorStateGPU layout being contiguous device_ptrs */
const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) +
- sizeof(IntegratorStateGPU);
+ offsetof(IntegratorStateGPU, sort_partition_divisor);
for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
int pointer_index = int(offset / sizeof(device_ptr));
MetalDevice::MetalMem *mmem = *(
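
For intuition, num_sort_partitions() is just the concurrent-state capacity divided by the optimal partition size, with two fallbacks to a single partition: scenes with 300 or more shaders, and devices where optimal_sort_partition_elements() returns 0. A sketch of the arithmetic with illustrative numbers (not measured values from any particular GPU):

    /* Suppose the queue keeps 1048576 integrator states in flight and the
     * optimal partition is the 65536-element M1/M2 value from util.mm. */
    const int num_concurrent_states = 1048576;
    const int optimal_partition_elements = 65536;

    /* 1048576 / 65536 = 16 partitions, each small enough for its paths'
     * state fetches to stay cache-friendly while shading. */
    const int num_sort_partitions =
        num_concurrent_states / optimal_partition_elements;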
diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h
index fd32d8a260f..a988d01d361 100644
--- a/intern/cycles/device/metal/util.h
+++ b/intern/cycles/device/metal/util.h
@@ -37,6 +37,7 @@ struct MetalInfo {
static int get_apple_gpu_core_count(id<MTLDevice> device);
static MetalGPUVendor get_device_vendor(id<MTLDevice> device);
static AppleGPUArchitecture get_apple_gpu_architecture(id<MTLDevice> device);
+ static int optimal_sort_partition_elements(id<MTLDevice> device);
static string get_device_name(id<MTLDevice> device);
};
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
index a7a5b596b8f..c336dc310c8 100644
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -72,6 +72,24 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
return METAL_GPU_UNKNOWN;
}
+int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device)
+{
+ if (auto str = getenv("CYCLES_METAL_SORT_PARTITION_ELEMENTS")) {
+ return atoi(str);
+ }
+
+ /* On M1 and M2 GPUs, we see better cache utilization if we partition the active indices before
+ * sorting each partition by material. Partitioning into chunks of 65536 elements results in an
+ * overall render time speedup of up to 15%. */
+ if (get_device_vendor(device) == METAL_GPU_APPLE) {
+ AppleGPUArchitecture arch = get_apple_gpu_architecture(device);
+ if (arch == APPLE_M1 || arch == APPLE_M2) {
+ return 65536;
+ }
+ }
+ return 0;
+}
+
vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
{
static vector<id<MTLDevice>> usable_devices;
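
Note the override semantics: setting CYCLES_METAL_SORT_PARTITION_ELEMENTS in the environment bypasses the heuristic entirely, and a value of 0 disables partitioning (callers treat a 0 return as "use a single partition" and skip the __KERNEL_SORT_PARTITIONING__ define). When unset, the 65536 default applies only to Apple M1/M2 GPUs; all other devices get 0.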
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 14a5db3a204..20308e4a106 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -105,6 +105,14 @@ class DeviceQueue {
* value. */
virtual int num_concurrent_busy_states() const = 0;
+ /* Number of partitions within which active indices are sorted by material ID.
+ * Using more partitions lets us trade off material coherence for better integrator state fetch
+ * locality. */
+ virtual int num_sort_partitions(const size_t /*state_size*/) const
+ {
+ return 1;
+ }
+
/* Initialize execution of kernels on this queue.
*
* Will, for example, load all data required by the kernels from Device to global or path state.
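
Because the base-class default returns 1, queues that do not override this method (every backend except Metal in this change) keep a single partition, so the kernel-side sort key reduces to the plain material key and existing behavior is unchanged.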
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 6912bf928cd..ed278821b46 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -373,7 +373,7 @@ void PathTrace::path_trace(RenderWork &render_work)
work_balance_infos_[i].time_spent += work_time;
work_balance_infos_[i].occupancy = statistics.occupancy;
- VLOG_WORK << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+ VLOG_INFO << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
<< work_time / num_samples
<< " seconds per sample), occupancy: " << statistics.occupancy;
});
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index e262c252ce3..d51e8a28bb4 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -182,18 +182,19 @@ void PathTraceWorkGPU::alloc_integrator_queue()
void PathTraceWorkGPU::alloc_integrator_sorting()
{
/* Allocate arrays for shader sorting. */
- const int max_shaders = device_scene_->data.max_shaders;
- if (integrator_shader_sort_counter_.size() < max_shaders) {
- integrator_shader_sort_counter_.alloc(max_shaders);
+ num_sort_partitions_ = queue_->num_sort_partitions(estimate_single_state_size());
+ const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
+ if (integrator_shader_sort_counter_.size() < sort_buckets) {
+ integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
- integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
+ integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
- integrator_shader_mnee_sort_counter_.alloc(max_shaders);
+ integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
- integrator_shader_sort_prefix_sum_.alloc(max_shaders);
+ integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
@@ -237,6 +238,10 @@ void PathTraceWorkGPU::init_execution()
{
queue_->init_execution();
+ /* Setup sort partitioning divisor for better cache utilization. */
+ integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
+ num_sort_partitions_);
+
/* Copy to device side struct in constant memory. */
device_->const_copy_to(
"integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
@@ -486,9 +491,9 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
/* Compute prefix sum of number of active paths with each shader. */
{
const int work_size = 1;
- int max_shaders = device_scene_->data.max_shaders;
+ int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
- DeviceKernelArguments args(&d_counter, &d_prefix_sum, &max_shaders);
+ DeviceKernelArguments args(&d_counter, &d_prefix_sum, &sort_buckets);
queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
}
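
Putting the host-side pieces together with the same illustrative numbers: the counter buffers are sized for one bucket per (partition, material) pair, and the divisor copied into constant memory maps a state index to its partition. A sketch, assuming divide_up() is integer division rounded up:

    const int max_shaders = 128;        /* scene-dependent, illustrative */
    const int num_sort_partitions = 16; /* from num_sort_partitions() */
    const int max_num_paths = 1048576;

    /* One sort counter per (partition, material) pair. */
    const int sort_buckets = max_shaders * num_sort_partitions; /* 2048 */

    /* divide_up(a, b) == (a + b - 1) / b. */
    const unsigned sort_partition_divisor =
        (max_num_paths + num_sort_partitions - 1) / num_sort_partitions; /* 65536 */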
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index 4c10a221a30..a805258d1b5 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -156,6 +156,9 @@ class PathTraceWorkGPU : public PathTraceWork {
bool interop_use_checked_ = false;
bool interop_use_ = false;
+ /* Number of partitions to sort state indices into prior to material sort. */
+ int num_sort_partitions_;
+
/* Maximum number of concurrent integrator states. */
int max_num_paths_;
diff --git a/intern/cycles/kernel/integrator/state.h b/intern/cycles/kernel/integrator/state.h
index d6fef27f344..d10d31e930e 100644
--- a/intern/cycles/kernel/integrator/state.h
+++ b/intern/cycles/kernel/integrator/state.h
@@ -127,6 +127,9 @@ typedef struct IntegratorStateGPU {
/* Index of main path which will be used by a next shadow catcher split. */
ccl_global int *next_main_path_index;
+
+ /* Divisor used to partition active indices by locality when sorting by material. */
+ uint sort_partition_divisor;
} IntegratorStateGPU;
/* Abstraction
diff --git a/intern/cycles/kernel/integrator/state_flow.h b/intern/cycles/kernel/integrator/state_flow.h
index fed74d49434..d397ef385e7 100644
--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -67,9 +67,17 @@ CCL_NAMESPACE_BEGIN
&kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+# ifdef __KERNEL_SORT_PARTITIONING__
+/* Sort first by truncated state index (for good locality), then by key (for good coherence). */
+# define INTEGRATOR_SORT_KEY(key, state) \
+ (key + kernel_data.max_shaders * (state / kernel_integrator_state.sort_partition_divisor))
+# else
+# define INTEGRATOR_SORT_KEY(key, state) (key)
+# endif
+
# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
{ \
- const int key_ = key; \
+ const int key_ = INTEGRATOR_SORT_KEY(key, state); \
atomic_fetch_and_add_uint32( \
&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
@@ -79,7 +87,7 @@ CCL_NAMESPACE_BEGIN
}
# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
{ \
- const int key_ = key; \
+ const int key_ = INTEGRATOR_SORT_KEY(key, state); \
atomic_fetch_and_sub_uint32( \
&kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
atomic_fetch_and_add_uint32( \
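
A worked example of the composite key produced by INTEGRATOR_SORT_KEY (values are made up; max_shaders and sort_partition_divisor continue the illustrative numbers above):

    const int max_shaders = 128;
    const unsigned sort_partition_divisor = 65536;

    const int key = 5;        /* material/shader sort key */
    const int state = 200000; /* integrator state index */

    /* 200000 / 65536 = partition 3, so key_ = 5 + 128 * 3 = 389.
     * All keys from partition 3 land in [384, 512), so paths are grouped
     * first by state-array chunk (locality), then by material (coherence). */
    const int key_ = key + max_shaders * (state / sort_partition_divisor);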