diff options
Diffstat (limited to 'intern/cycles/device/metal/util.mm')
-rw-r--r-- | intern/cycles/device/metal/util.mm | 74 |
1 file changed, 65 insertions, 9 deletions
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
index a6bd593bcb6..65c67c400fe 100644
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -10,26 +10,83 @@
 #  include "util/string.h"
 #  include "util/time.h"
 
+#  include <IOKit/IOKitLib.h>
 #  include <pwd.h>
 #  include <sys/shm.h>
 #  include <time.h>
 
 CCL_NAMESPACE_BEGIN
 
-MetalGPUVendor MetalInfo::get_vendor_from_device_name(string const &device_name)
+string MetalInfo::get_device_name(id<MTLDevice> device)
 {
-  if (device_name.find("Intel") != string::npos) {
+  string device_name = [device.name UTF8String];
+  if (get_device_vendor(device) == METAL_GPU_APPLE) {
+    /* Append the GPU core count so we can distinguish between GPU variants in benchmarks. */
+    int gpu_core_count = get_apple_gpu_core_count(device);
+    device_name += string_printf(gpu_core_count ? " (GPU - %d cores)" : " (GPU)", gpu_core_count);
+  }
+  return device_name;
+}
+
+int MetalInfo::get_apple_gpu_core_count(id<MTLDevice> device)
+{
+  int core_count = 0;
+  if (@available(macos 12.0, *)) {
+    io_service_t gpu_service = IOServiceGetMatchingService(
+        kIOMainPortDefault, IORegistryEntryIDMatching(device.registryID));
+    if (CFNumberRef numberRef = (CFNumberRef)IORegistryEntryCreateCFProperty(
+            gpu_service, CFSTR("gpu-core-count"), 0, 0)) {
+      if (CFGetTypeID(numberRef) == CFNumberGetTypeID()) {
+        CFNumberGetValue(numberRef, kCFNumberSInt32Type, &core_count);
+      }
+      CFRelease(numberRef);
+    }
+  }
+  return core_count;
+}
+
+AppleGPUArchitecture MetalInfo::get_apple_gpu_architecture(id<MTLDevice> device)
+{
+  const char *device_name = [device.name UTF8String];
+  if (strstr(device_name, "M1")) {
+    return APPLE_M1;
+  }
+  else if (strstr(device_name, "M2")) {
+    return APPLE_M2;
+  }
+  return APPLE_UNKNOWN;
+}
+
+MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
+{
+  const char *device_name = [device.name UTF8String];
+  if (strstr(device_name, "Intel")) {
     return METAL_GPU_INTEL;
   }
-  else if (device_name.find("AMD") != string::npos) {
+  else if (strstr(device_name, "AMD")) {
     return METAL_GPU_AMD;
   }
-  else if (device_name.find("Apple") != string::npos) {
+  else if (strstr(device_name, "Apple")) {
     return METAL_GPU_APPLE;
   }
   return METAL_GPU_UNKNOWN;
 }
 
+int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device)
+{
+  if (auto str = getenv("CYCLES_METAL_SORT_PARTITION_ELEMENTS")) {
+    return atoi(str);
+  }
+
+  /* On M1 and M2 GPUs, we see better cache utilization if we partition the active indices before
+   * sorting each partition by material. Partitioning into chunks of 65536 elements results in an
+   * overall render time speedup of up to 15%. */
+  if (get_device_vendor(device) == METAL_GPU_APPLE) {
+    return 65536;
+  }
+  return 0;
+}
+
 vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
 {
   static vector<id<MTLDevice>> usable_devices;
@@ -41,9 +98,8 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
 
   metal_printf("Usable Metal devices:\n");
   for (id<MTLDevice> device in MTLCopyAllDevices()) {
-    const char *device_name = [device.name UTF8String];
-
-    MetalGPUVendor vendor = get_vendor_from_device_name(device_name);
+    string device_name = get_device_name(device);
+    MetalGPUVendor vendor = get_device_vendor(device);
     bool usable = false;
 
     if (@available(macos 12.2, *)) {
@@ -55,12 +111,12 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
     }
 
     if (usable) {
-      metal_printf("- %s\n", device_name);
+      metal_printf("- %s\n", device_name.c_str());
       [device retain];
       usable_devices.push_back(device);
     }
     else {
-      metal_printf("   (skipping \"%s\")\n", device_name);
+      metal_printf("   (skipping \"%s\")\n", device_name.c_str());
     }
   }
   if (usable_devices.empty()) {