git.blender.org/blender.git
author    Michael Jones <michael_p_jones@apple.com>  2021-12-07 18:11:35 +0300
committer Michael Jones <michael_p_jones@apple.com>  2021-12-07 18:52:21 +0300
commit    9558fa5196033390111a2348caa66ab18b8a4f89 (patch)
tree      acc3ed446f709390abfef5f97f82c1ed9abe0100 /intern
parent    565b33c0ad31966b860123837d2c4b5a8cbedad2 (diff)
Cycles: Metal host-side code

This patch adds the Metal host-side code:

- Add all core host-side Metal backend files (device_impl, queue, etc)
- Add MetalRT BVH setup files
- Integrate with Cycles device enumeration code
- Revive `path_source_replace_includes` in util/path (required for MSL compilation)

This patch also includes a couple of small kernel-side fixes:

- Add an implementation of `lgammaf` for Metal
  [Nemes, Gergő (2010), "New asymptotic expansion for the Gamma function",
  Archiv der Mathematik](https://users.renyi.hu/~gergonemes/)
- Include "work_stealing.h" inside the Metal context class because it accesses
  state now

Ref T92212

Reviewed By: brecht

Maniphest Tasks: T92212

Differential Revision: https://developer.blender.org/D13423
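
For reference, the asymptotic expansion behind the new `lgammaf` is Nemes'
formula. A minimal standalone sketch of that approximation (hypothetical
helper name; the committed kernel code in util/math.h may differ in form):

    #include <cmath>

    /* Nemes (2010): ln Gamma(x) ~= 0.5*ln(2*pi/x) + x*(ln(x + 1/(12x - 1/(10x))) - 1).
     * Spot check: x = 5 gives ~3.17807 vs. the exact ln(24) = 3.17805. */
    inline float lgammaf_nemes(float x)
    {
      const float two_pi = 6.2831853f;
      const float inner = x + 1.0f / (12.0f * x - 1.0f / (10.0f * x));
      return 0.5f * logf(two_pi / x) + x * (logf(inner) - 1.0f);
    }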
Diffstat (limited to 'intern')
-rw-r--r--  intern/cycles/blender/CMakeLists.txt            |    5
-rw-r--r--  intern/cycles/blender/addon/engine.py           |    2
-rw-r--r--  intern/cycles/blender/addon/properties.py       |   12
-rw-r--r--  intern/cycles/blender/addon/ui.py               |    5
-rw-r--r--  intern/cycles/blender/device.cpp                |    4
-rw-r--r--  intern/cycles/blender/python.cpp                |    9
-rw-r--r--  intern/cycles/bvh/CMakeLists.txt                |    2
-rw-r--r--  intern/cycles/bvh/bvh.cpp                       |   10
-rw-r--r--  intern/cycles/bvh/metal.h                       |   35
-rw-r--r--  intern/cycles/bvh/metal.mm                      |   33
-rw-r--r--  intern/cycles/cmake/external_libs.cmake         |   14
-rw-r--r--  intern/cycles/device/CMakeLists.txt             |   49
-rw-r--r--  intern/cycles/device/device.cpp                 |   40
-rw-r--r--  intern/cycles/device/device.h                   |    3
-rw-r--r--  intern/cycles/device/memory.h                   |    1
-rw-r--r--  intern/cycles/device/metal/bvh.h                |   66
-rw-r--r--  intern/cycles/device/metal/bvh.mm               |  813
-rw-r--r--  intern/cycles/device/metal/device.h             |   37
-rw-r--r--  intern/cycles/device/metal/device.mm            |  136
-rw-r--r--  intern/cycles/device/metal/device_impl.h        |  166
-rw-r--r--  intern/cycles/device/metal/device_impl.mm       | 1008
-rw-r--r--  intern/cycles/device/metal/kernel.h             |  168
-rw-r--r--  intern/cycles/device/metal/kernel.mm            |  523
-rw-r--r--  intern/cycles/device/metal/queue.h              |   97
-rw-r--r--  intern/cycles/device/metal/queue.mm             |  602
-rw-r--r--  intern/cycles/device/metal/util.h               |  101
-rw-r--r--  intern/cycles/device/metal/util.mm              |  241
-rw-r--r--  intern/cycles/device/multi/device.cpp           |    3
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h        |    7
-rw-r--r--  intern/cycles/kernel/device/metal/compat.h      |    2
-rw-r--r--  intern/cycles/kernel/device/metal/kernel.metal  |   12
-rw-r--r--  intern/cycles/util/math.h                       |   14
-rw-r--r--  intern/cycles/util/path.cpp                     |  164
-rw-r--r--  intern/cycles/util/path.h                       |    3
34 files changed, 4355 insertions, 32 deletions
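
With the device-enumeration changes below, Metal shows up alongside CUDA,
OptiX and HIP once Cycles is built with WITH_CYCLES_DEVICE_METAL, and the
flag extended in engine.py can then select it from the command line. A
hypothetical invocation (scene path is a placeholder):

    # Render frame 1 on the Metal device; append '+CPU' for hybrid rendering.
    blender -b scene.blend -E CYCLES -f 1 -- --cycles-device METAL
    blender -b scene.blend -E CYCLES -f 1 -- --cycles-device METAL+CPU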
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index f0540486656..b4a4d487355 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -101,6 +101,11 @@ add_definitions(${GL_DEFINITIONS})
if(WITH_CYCLES_DEVICE_HIP)
add_definitions(-DWITH_HIP)
endif()
+
+if(WITH_CYCLES_DEVICE_METAL)
+ add_definitions(-DWITH_METAL)
+endif()
+
if(WITH_MOD_FLUID)
add_definitions(-DWITH_FLUID)
endif()
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index e5bb77a834a..910ac4a373e 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -28,7 +28,7 @@ def _configure_argument_parser():
action='store_true')
parser.add_argument("--cycles-device",
help="Set the device to use for Cycles, overriding user preferences and the scene setting."
- "Valid options are 'CPU', 'CUDA', 'OPTIX', or 'HIP'"
+ "Valid options are 'CPU', 'CUDA', 'OPTIX', 'HIP' or 'METAL'."
"Additionally, you can append '+CPU' to any GPU type for hybrid rendering.",
default=None)
return parser
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 0de936ddb11..8569cb7d946 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -111,7 +111,8 @@ enum_device_type = (
('CPU', "CPU", "CPU", 0),
('CUDA', "CUDA", "CUDA", 1),
('OPTIX', "OptiX", "OptiX", 3),
- ("HIP", "HIP", "HIP", 4)
+ ('HIP', "HIP", "HIP", 4),
+ ('METAL', "Metal", "Metal", 5)
)
enum_texture_limit = (
@@ -1312,8 +1313,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def get_device_types(self, context):
import _cycles
- has_cuda, has_optix, has_hip = _cycles.get_device_types()
-
+ has_cuda, has_optix, has_hip, has_metal = _cycles.get_device_types()
list = [('NONE', "None", "Don't use compute device", 0)]
if has_cuda:
list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
@@ -1321,6 +1321,8 @@ class CyclesPreferences(bpy.types.AddonPreferences):
list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3))
if has_hip:
list.append(('HIP', "HIP", "Use HIP for GPU acceleration", 4))
+ if has_metal:
+ list.append(('METAL', "Metal", "Use Metal for GPU acceleration", 5))
return list
@@ -1346,7 +1348,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def update_device_entries(self, device_list):
for device in device_list:
- if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP'}:
+ if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP', 'METAL'}:
continue
# Try to find existing Device entry
entry = self.find_existing_device_entry(device)
@@ -1390,7 +1392,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
import _cycles
# Ensure `self.devices` is not re-allocated when the second call to
# get_devices_for_type is made, freeing items from the first list.
- for device_type in ('CUDA', 'OPTIX', 'HIP'):
+ for device_type in ('CUDA', 'OPTIX', 'HIP', 'METAL'):
self.update_device_entries(_cycles.available_devices(device_type))
# Deprecated: use refresh_devices instead.
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 6fd21db38ae..fd86d75a301 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -97,6 +97,11 @@ def use_cpu(context):
return (get_device_type(context) == 'NONE' or cscene.device == 'CPU')
+def use_metal(context):
+ cscene = context.scene.cycles
+
+ return (get_device_type(context) == 'METAL' and cscene.device == 'GPU')
+
def use_cuda(context):
cscene = context.scene.cycles
diff --git a/intern/cycles/blender/device.cpp b/intern/cycles/blender/device.cpp
index 9fabc33a96b..d39381ac6f1 100644
--- a/intern/cycles/blender/device.cpp
+++ b/intern/cycles/blender/device.cpp
@@ -27,6 +27,7 @@ enum ComputeDevice {
COMPUTE_DEVICE_CUDA = 1,
COMPUTE_DEVICE_OPTIX = 3,
COMPUTE_DEVICE_HIP = 4,
+ COMPUTE_DEVICE_METAL = 5,
COMPUTE_DEVICE_NUM
};
@@ -85,6 +86,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
else if (compute_device == COMPUTE_DEVICE_HIP) {
mask |= DEVICE_MASK_HIP;
}
+ else if (compute_device == COMPUTE_DEVICE_METAL) {
+ mask |= DEVICE_MASK_METAL;
+ }
vector<DeviceInfo> devices = Device::available_devices(mask);
/* Match device preferences and available devices. */
diff --git a/intern/cycles/blender/python.cpp b/intern/cycles/blender/python.cpp
index 012122cf9e3..024dae306b0 100644
--- a/intern/cycles/blender/python.cpp
+++ b/intern/cycles/blender/python.cpp
@@ -906,16 +906,18 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
{
vector<DeviceType> device_types = Device::available_types();
- bool has_cuda = false, has_optix = false, has_hip = false;
+ bool has_cuda = false, has_optix = false, has_hip = false, has_metal = false;
foreach (DeviceType device_type, device_types) {
has_cuda |= (device_type == DEVICE_CUDA);
has_optix |= (device_type == DEVICE_OPTIX);
has_hip |= (device_type == DEVICE_HIP);
+ has_metal |= (device_type == DEVICE_METAL);
}
- PyObject *list = PyTuple_New(3);
+ PyObject *list = PyTuple_New(4);
PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip));
+ PyTuple_SET_ITEM(list, 3, PyBool_FromLong(has_metal));
return list;
}
@@ -944,6 +946,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg)
else if (override == "HIP") {
BlenderSession::device_override = DEVICE_MASK_HIP;
}
+ else if (override == "METAL") {
+ BlenderSession::device_override = DEVICE_MASK_METAL;
+ }
else {
printf("\nError: %s is not a valid Cycles device.\n", override.c_str());
Py_RETURN_FALSE;
diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt
index 9edc30cf9c4..cdaa6628be2 100644
--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -31,6 +31,7 @@ set(SRC
sort.cpp
split.cpp
unaligned.cpp
+ metal.mm
)
set(SRC_HEADERS
@@ -46,6 +47,7 @@ set(SRC_HEADERS
sort.h
split.h
unaligned.h
+ metal.h
)
set(LIB
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index d3c8e4db6d0..540bf52f7ac 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -19,6 +19,7 @@
#include "bvh/bvh2.h"
#include "bvh/embree.h"
+#include "bvh/metal.h"
#include "bvh/multi.h"
#include "bvh/optix.h"
@@ -106,12 +107,17 @@ BVH *BVH::create(const BVHParams &params,
(void)device;
break;
#endif
+ case BVH_LAYOUT_METAL:
+#ifdef WITH_METAL
+ return bvh_metal_create(params, geometry, objects, device);
+#else
+ (void)device;
+ break;
+#endif
case BVH_LAYOUT_MULTI_OPTIX:
case BVH_LAYOUT_MULTI_OPTIX_EMBREE:
case BVH_LAYOUT_MULTI_METAL_EMBREE:
return new BVHMulti(params, geometry, objects);
- case BVH_LAYOUT_METAL:
- /* host-side changes for BVH_LAYOUT_METAL are imminent */
case BVH_LAYOUT_NONE:
case BVH_LAYOUT_ALL:
break;
diff --git a/intern/cycles/bvh/metal.h b/intern/cycles/bvh/metal.h
new file mode 100644
index 00000000000..8de07927e61
--- /dev/null
+++ b/intern/cycles/bvh/metal.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BVH_METAL_H__
+#define __BVH_METAL_H__
+
+#ifdef WITH_METAL
+
+# include "bvh/bvh.h"
+
+CCL_NAMESPACE_BEGIN
+
+BVH *bvh_metal_create(const BVHParams &params,
+ const vector<Geometry *> &geometry,
+ const vector<Object *> &objects,
+ Device *device);
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
+
+#endif /* __BVH_METAL_H__ */
diff --git a/intern/cycles/bvh/metal.mm b/intern/cycles/bvh/metal.mm
new file mode 100644
index 00000000000..90a52012f12
--- /dev/null
+++ b/intern/cycles/bvh/metal.mm
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+# include "device/metal/bvh.h"
+
+CCL_NAMESPACE_BEGIN
+
+BVH *bvh_metal_create(const BVHParams &params,
+ const vector<Geometry *> &geometry,
+ const vector<Object *> &objects,
+ Device *device)
+{
+ return new BVHMetal(params, geometry, objects, device);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index c1244ab740b..9967a775184 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -551,4 +551,18 @@ if(NOT WITH_HIP_DYNLOAD)
set(WITH_HIP_DYNLOAD ON)
endif()
+###########################################################################
+# Metal
+###########################################################################
+
+if(WITH_CYCLES_DEVICE_METAL)
+ FIND_LIBRARY(METAL_LIBRARY Metal)
+ if (METAL_LIBRARY)
+ message(STATUS "Found Metal: ${METAL_LIBRARY}")
+ else()
+ message(STATUS "Metal not found, disabling WITH_CYCLES_DEVICE_METAL")
+ set(WITH_CYCLES_DEVICE_METAL OFF)
+ endif()
+endif()
+
unset(_cycles_lib_dir)
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 99b1fc8135d..7f1e9ff3d0f 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -43,7 +43,7 @@ if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
add_definitions(-DWITH_HIP_DYNLOAD)
endif()
-set(SRC
+set(SRC_BASE
device.cpp
denoise.cpp
graphics_interop.cpp
@@ -104,6 +104,21 @@ set(SRC_MULTI
multi/device.h
)
+set(SRC_METAL
+ metal/bvh.mm
+ metal/bvh.h
+ metal/device.mm
+ metal/device.h
+ metal/device_impl.mm
+ metal/device_impl.h
+ metal/kernel.mm
+ metal/kernel.h
+ metal/queue.mm
+ metal/queue.h
+ metal/util.mm
+ metal/util.h
+)
+
set(SRC_OPTIX
optix/device.cpp
optix/device.h
@@ -123,6 +138,17 @@ set(SRC_HEADERS
queue.h
)
+set(SRC
+ ${SRC_BASE}
+ ${SRC_CPU}
+ ${SRC_CUDA}
+ ${SRC_HIP}
+ ${SRC_DUMMY}
+ ${SRC_MULTI}
+ ${SRC_OPTIX}
+ ${SRC_HEADERS}
+)
+
set(LIB
cycles_kernel
cycles_util
@@ -158,6 +184,15 @@ endif()
if(WITH_CYCLES_DEVICE_OPTIX)
add_definitions(-DWITH_OPTIX)
endif()
+if(WITH_CYCLES_DEVICE_METAL)
+ list(APPEND LIB
+ ${METAL_LIBRARY}
+ )
+ add_definitions(-DWITH_METAL)
+ list(APPEND SRC
+ ${SRC_METAL}
+ )
+endif()
if(WITH_OPENIMAGEDENOISE)
list(APPEND LIB
@@ -168,20 +203,12 @@ endif()
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
-cycles_add_library(cycles_device "${LIB}"
- ${SRC}
- ${SRC_CPU}
- ${SRC_CUDA}
- ${SRC_HIP}
- ${SRC_DUMMY}
- ${SRC_MULTI}
- ${SRC_OPTIX}
- ${SRC_HEADERS}
-)
+cycles_add_library(cycles_device "${LIB}" ${SRC})
source_group("cpu" FILES ${SRC_CPU})
source_group("cuda" FILES ${SRC_CUDA})
source_group("dummy" FILES ${SRC_DUMMY})
source_group("multi" FILES ${SRC_MULTI})
+source_group("metal" FILES ${SRC_METAL})
source_group("optix" FILES ${SRC_OPTIX})
source_group("common" FILES ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index bfbcdb20d5e..2b067d57158 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -27,6 +27,7 @@
#include "device/cuda/device.h"
#include "device/dummy/device.h"
#include "device/hip/device.h"
+#include "device/metal/device.h"
#include "device/multi/device.h"
#include "device/optix/device.h"
@@ -49,6 +50,7 @@ vector<DeviceInfo> Device::cuda_devices;
vector<DeviceInfo> Device::optix_devices;
vector<DeviceInfo> Device::cpu_devices;
vector<DeviceInfo> Device::hip_devices;
+vector<DeviceInfo> Device::metal_devices;
uint Device::devices_initialized_mask = 0;
/* Device */
@@ -105,6 +107,12 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
break;
#endif
+#ifdef WITH_METAL
+ case DEVICE_METAL:
+ if (device_metal_init())
+ device = device_metal_create(info, stats, profiler);
+ break;
+#endif
default:
break;
}
@@ -128,6 +136,8 @@ DeviceType Device::type_from_string(const char *name)
return DEVICE_MULTI;
else if (strcmp(name, "HIP") == 0)
return DEVICE_HIP;
+ else if (strcmp(name, "METAL") == 0)
+ return DEVICE_METAL;
return DEVICE_NONE;
}
@@ -144,6 +154,8 @@ string Device::string_from_type(DeviceType type)
return "MULTI";
else if (type == DEVICE_HIP)
return "HIP";
+ else if (type == DEVICE_METAL)
+ return "METAL";
return "";
}
@@ -161,7 +173,9 @@ vector<DeviceType> Device::available_types()
#ifdef WITH_HIP
types.push_back(DEVICE_HIP);
#endif
-
+#ifdef WITH_METAL
+ types.push_back(DEVICE_METAL);
+#endif
return types;
}
@@ -227,6 +241,20 @@ vector<DeviceInfo> Device::available_devices(uint mask)
}
}
+#ifdef WITH_METAL
+ if (mask & DEVICE_MASK_METAL) {
+ if (!(devices_initialized_mask & DEVICE_MASK_METAL)) {
+ if (device_metal_init()) {
+ device_metal_info(metal_devices);
+ }
+ devices_initialized_mask |= DEVICE_MASK_METAL;
+ }
+ foreach (DeviceInfo &info, metal_devices) {
+ devices.push_back(info);
+ }
+ }
+#endif
+
return devices;
}
@@ -266,6 +294,15 @@ string Device::device_capabilities(uint mask)
}
#endif
+#ifdef WITH_METAL
+ if (mask & DEVICE_MASK_METAL) {
+ if (device_metal_init()) {
+ capabilities += "\nMetal device capabilities:\n";
+ capabilities += device_metal_capabilities();
+ }
+ }
+#endif
+
return capabilities;
}
@@ -354,6 +391,7 @@ void Device::free_memory()
optix_devices.free_memory();
hip_devices.free_memory();
cpu_devices.free_memory();
+ metal_devices.free_memory();
}
unique_ptr<DeviceQueue> Device::gpu_queue_create()
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 346632de314..c032773ddd0 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -52,6 +52,7 @@ enum DeviceType {
DEVICE_MULTI,
DEVICE_OPTIX,
DEVICE_HIP,
+ DEVICE_METAL,
DEVICE_DUMMY,
};
@@ -60,6 +61,7 @@ enum DeviceTypeMask {
DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
DEVICE_MASK_HIP = (1 << DEVICE_HIP),
+ DEVICE_MASK_METAL = (1 << DEVICE_METAL),
DEVICE_MASK_ALL = ~0
};
@@ -281,6 +283,7 @@ class Device {
static vector<DeviceInfo> optix_devices;
static vector<DeviceInfo> cpu_devices;
static vector<DeviceInfo> hip_devices;
+ static vector<DeviceInfo> metal_devices;
static uint devices_initialized_mask;
};
diff --git a/intern/cycles/device/memory.h b/intern/cycles/device/memory.h
index d3ebebb34b4..2db3ac9a440 100644
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@@ -263,6 +263,7 @@ class device_memory {
friend class CUDADevice;
friend class OptiXDevice;
friend class HIPDevice;
+ friend class MetalDevice;
/* Only create through subclasses. */
device_memory(Device *device, const char *name, MemoryType type);
diff --git a/intern/cycles/device/metal/bvh.h b/intern/cycles/device/metal/bvh.h
new file mode 100644
index 00000000000..cbc5ca7d2c3
--- /dev/null
+++ b/intern/cycles/device/metal/bvh.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_METAL
+
+# include "bvh/bvh.h"
+# include "bvh/params.h"
+# include "device/memory.h"
+
+# include <Metal/Metal.h>
+
+CCL_NAMESPACE_BEGIN
+
+class BVHMetal : public BVH {
+ public:
+ API_AVAILABLE(macos(11.0))
+ id<MTLAccelerationStructure> accel_struct = nil;
+ bool accel_struct_building = false;
+
+ API_AVAILABLE(macos(11.0))
+ vector<id<MTLAccelerationStructure>> blas_array;
+
+ bool motion_blur = false;
+
+ Stats &stats;
+
+ bool build(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
+
+ BVHMetal(const BVHParams &params,
+ const vector<Geometry *> &geometry,
+ const vector<Object *> &objects,
+ Device *device);
+ virtual ~BVHMetal();
+
+ bool build_BLAS(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
+ bool build_BLAS_mesh(Progress &progress,
+ id<MTLDevice> device,
+ id<MTLCommandQueue> queue,
+ Geometry *const geom,
+ bool refit);
+ bool build_BLAS_hair(Progress &progress,
+ id<MTLDevice> device,
+ id<MTLCommandQueue> queue,
+ Geometry *const geom,
+ bool refit);
+ bool build_TLAS(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm
new file mode 100644
index 00000000000..7c79196845a
--- /dev/null
+++ b/intern/cycles/device/metal/bvh.mm
@@ -0,0 +1,813 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+# include "scene/hair.h"
+# include "scene/mesh.h"
+# include "scene/object.h"
+
+# include "util/progress.h"
+
+# include "device/metal/bvh.h"
+
+CCL_NAMESPACE_BEGIN
+
+# define BVH_status(...) \
+ { \
+ string str = string_printf(__VA_ARGS__); \
+ progress.set_substatus(str); \
+ }
+
+BVHMetal::BVHMetal(const BVHParams &params_,
+ const vector<Geometry *> &geometry_,
+ const vector<Object *> &objects_,
+ Device *device)
+ : BVH(params_, geometry_, objects_), stats(device->stats)
+{
+}
+
+BVHMetal::~BVHMetal()
+{
+ if (@available(macos 12.0, *)) {
+ if (accel_struct) {
+ stats.mem_free(accel_struct.allocatedSize);
+ [accel_struct release];
+ }
+ }
+}
+
+bool BVHMetal::build_BLAS_mesh(Progress &progress,
+ id<MTLDevice> device,
+ id<MTLCommandQueue> queue,
+ Geometry *const geom,
+ bool refit)
+{
+ if (@available(macos 12.0, *)) {
+ /* Build BLAS for triangle primitives */
+ Mesh *const mesh = static_cast<Mesh *const>(geom);
+ if (mesh->num_triangles() == 0) {
+ return false;
+ }
+
+ /*------------------------------------------------*/
+ BVH_status(
+ "Building mesh BLAS | %7d tris | %s", (int)mesh->num_triangles(), geom->name.c_str());
+ /*------------------------------------------------*/
+
+ const bool use_fast_trace_bvh = (params.bvh_type == BVH_TYPE_STATIC);
+
+ const array<float3> &verts = mesh->get_verts();
+ const array<int> &tris = mesh->get_triangles();
+ const size_t num_verts = verts.size();
+ const size_t num_indices = tris.size();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
+ num_motion_steps = mesh->get_motion_steps();
+ }
+
+ MTLResourceOptions storage_mode;
+ if (device.hasUnifiedMemory) {
+ storage_mode = MTLResourceStorageModeShared;
+ }
+ else {
+ storage_mode = MTLResourceStorageModeManaged;
+ }
+
+ /* Upload the mesh data to the GPU */
+ id<MTLBuffer> posBuf = nil;
+ id<MTLBuffer> indexBuf = [device newBufferWithBytes:tris.data()
+ length:num_indices * sizeof(tris.data()[0])
+ options:storage_mode];
+
+ if (num_motion_steps == 1) {
+ posBuf = [device newBufferWithBytes:verts.data()
+ length:num_verts * sizeof(verts.data()[0])
+ options:storage_mode];
+ }
+ else {
+ posBuf = [device newBufferWithLength:num_verts * num_motion_steps * sizeof(verts.data()[0])
+ options:storage_mode];
+ float3 *dest_data = (float3 *)[posBuf contents];
+ size_t center_step = (num_motion_steps - 1) / 2;
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ const float3 *verts = mesh->get_verts().data();
+
+ /* The center step for motion vertices is not stored in the attribute. */
+ if (step != center_step) {
+ verts = motion_keys->data_float3() + (step > center_step ? step - 1 : step) * num_verts;
+ }
+ memcpy(dest_data + num_verts * step, verts, num_verts * sizeof(float3));
+ }
+ if (storage_mode == MTLResourceStorageModeManaged) {
+ [posBuf didModifyRange:NSMakeRange(0, posBuf.length)];
+ }
+ }
+
+ /* Create an acceleration structure. */
+ MTLAccelerationStructureGeometryDescriptor *geomDesc;
+ if (num_motion_steps > 1) {
+ std::vector<MTLMotionKeyframeData *> vertex_ptrs;
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ MTLMotionKeyframeData *k = [MTLMotionKeyframeData data];
+ k.buffer = posBuf;
+ k.offset = num_verts * step * sizeof(float3);
+ vertex_ptrs.push_back(k);
+ }
+
+ MTLAccelerationStructureMotionTriangleGeometryDescriptor *geomDescMotion =
+ [MTLAccelerationStructureMotionTriangleGeometryDescriptor descriptor];
+ geomDescMotion.vertexBuffers = [NSArray arrayWithObjects:vertex_ptrs.data()
+ count:vertex_ptrs.size()];
+ geomDescMotion.vertexStride = sizeof(verts.data()[0]);
+ geomDescMotion.indexBuffer = indexBuf;
+ geomDescMotion.indexBufferOffset = 0;
+ geomDescMotion.indexType = MTLIndexTypeUInt32;
+ geomDescMotion.triangleCount = num_indices / 3;
+ geomDescMotion.intersectionFunctionTableOffset = 0;
+
+ geomDesc = geomDescMotion;
+ }
+ else {
+ MTLAccelerationStructureTriangleGeometryDescriptor *geomDescNoMotion =
+ [MTLAccelerationStructureTriangleGeometryDescriptor descriptor];
+ geomDescNoMotion.vertexBuffer = posBuf;
+ geomDescNoMotion.vertexBufferOffset = 0;
+ geomDescNoMotion.vertexStride = sizeof(verts.data()[0]);
+ geomDescNoMotion.indexBuffer = indexBuf;
+ geomDescNoMotion.indexBufferOffset = 0;
+ geomDescNoMotion.indexType = MTLIndexTypeUInt32;
+ geomDescNoMotion.triangleCount = num_indices / 3;
+ geomDescNoMotion.intersectionFunctionTableOffset = 0;
+
+ geomDesc = geomDescNoMotion;
+ }
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly */
+ /* (Match optix behaviour: unsigned int build_flags =
+ * OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;) */
+ geomDesc.allowDuplicateIntersectionFunctionInvocation = false;
+
+ MTLPrimitiveAccelerationStructureDescriptor *accelDesc =
+ [MTLPrimitiveAccelerationStructureDescriptor descriptor];
+ accelDesc.geometryDescriptors = @[ geomDesc ];
+ if (num_motion_steps > 1) {
+ accelDesc.motionStartTime = 0.0f;
+ accelDesc.motionEndTime = 1.0f;
+ accelDesc.motionStartBorderMode = MTLMotionBorderModeClamp;
+ accelDesc.motionEndBorderMode = MTLMotionBorderModeClamp;
+ accelDesc.motionKeyframeCount = num_motion_steps;
+ }
+
+ if (!use_fast_trace_bvh) {
+ accelDesc.usage |= (MTLAccelerationStructureUsageRefit |
+ MTLAccelerationStructureUsagePreferFastBuild);
+ }
+
+ MTLAccelerationStructureSizes accelSizes = [device
+ accelerationStructureSizesWithDescriptor:accelDesc];
+ id<MTLAccelerationStructure> accel_uncompressed = [device
+ newAccelerationStructureWithSize:accelSizes.accelerationStructureSize];
+ id<MTLBuffer> scratchBuf = [device newBufferWithLength:accelSizes.buildScratchBufferSize
+ options:MTLResourceStorageModePrivate];
+ id<MTLBuffer> sizeBuf = [device newBufferWithLength:8 options:MTLResourceStorageModeShared];
+ id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
+ id<MTLAccelerationStructureCommandEncoder> accelEnc =
+ [accelCommands accelerationStructureCommandEncoder];
+ if (refit) {
+ [accelEnc refitAccelerationStructure:accel_struct
+ descriptor:accelDesc
+ destination:accel_uncompressed
+ scratchBuffer:scratchBuf
+ scratchBufferOffset:0];
+ }
+ else {
+ [accelEnc buildAccelerationStructure:accel_uncompressed
+ descriptor:accelDesc
+ scratchBuffer:scratchBuf
+ scratchBufferOffset:0];
+ }
+ if (use_fast_trace_bvh) {
+ [accelEnc writeCompactedAccelerationStructureSize:accel_uncompressed
+ toBuffer:sizeBuf
+ offset:0
+ sizeDataType:MTLDataTypeULong];
+ }
+ [accelEnc endEncoding];
+ [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
+ /* free temp resources */
+ [scratchBuf release];
+ [indexBuf release];
+ [posBuf release];
+
+ if (use_fast_trace_bvh) {
+ /* Compact the accel structure */
+ uint64_t compressed_size = *(uint64_t *)sizeBuf.contents;
+
+ dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+ id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
+ id<MTLAccelerationStructureCommandEncoder> accelEnc =
+ [accelCommands accelerationStructureCommandEncoder];
+ id<MTLAccelerationStructure> accel = [device
+ newAccelerationStructureWithSize:compressed_size];
+ [accelEnc copyAndCompactAccelerationStructure:accel_uncompressed
+ toAccelerationStructure:accel];
+ [accelEnc endEncoding];
+ [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
+ uint64_t allocated_size = [accel allocatedSize];
+ stats.mem_alloc(allocated_size);
+ accel_struct = accel;
+ [accel_uncompressed release];
+ accel_struct_building = false;
+ }];
+ [accelCommands commit];
+ });
+ }
+ else {
+ /* set our acceleration structure to the uncompressed structure */
+ accel_struct = accel_uncompressed;
+
+ uint64_t allocated_size = [accel_struct allocatedSize];
+ stats.mem_alloc(allocated_size);
+ accel_struct_building = false;
+ }
+ [sizeBuf release];
+ }];
+
+ accel_struct_building = true;
+ [accelCommands commit];
+
+ return true;
+ }
+ return false;
+}
+
+bool BVHMetal::build_BLAS_hair(Progress &progress,
+ id<MTLDevice> device,
+ id<MTLCommandQueue> queue,
+ Geometry *const geom,
+ bool refit)
+{
+ if (@available(macos 12.0, *)) {
+ /* Build BLAS for hair curves */
+ Hair *hair = static_cast<Hair *>(geom);
+ if (hair->num_curves() == 0) {
+ return false;
+ }
+
+ /*------------------------------------------------*/
+ BVH_status(
+ "Building hair BLAS | %7d curves | %s", (int)hair->num_curves(), geom->name.c_str());
+ /*------------------------------------------------*/
+
+ const bool use_fast_trace_bvh = (params.bvh_type == BVH_TYPE_STATIC);
+ const size_t num_segments = hair->num_segments();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
+ num_motion_steps = hair->get_motion_steps();
+ }
+
+ const size_t num_aabbs = num_segments * num_motion_steps;
+
+ MTLResourceOptions storage_mode;
+ if (device.hasUnifiedMemory) {
+ storage_mode = MTLResourceStorageModeShared;
+ }
+ else {
+ storage_mode = MTLResourceStorageModeManaged;
+ }
+
+ /* Allocate a GPU buffer for the AABB data and populate it */
+ id<MTLBuffer> aabbBuf = [device
+ newBufferWithLength:num_aabbs * sizeof(MTLAxisAlignedBoundingBox)
+ options:storage_mode];
+ MTLAxisAlignedBoundingBox *aabb_data = (MTLAxisAlignedBoundingBox *)[aabbBuf contents];
+
+ /* Get AABBs for each motion step */
+ size_t center_step = (num_motion_steps - 1) / 2;
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ /* The center step for motion vertices is not stored in the attribute */
+ const float3 *keys = hair->get_curve_keys().data();
+ if (step != center_step) {
+ size_t attr_offset = (step > center_step) ? step - 1 : step;
+ /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4) */
+ keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
+ }
+
+ for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
+ const Hair::Curve curve = hair->get_curve(j);
+
+ for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
+ {
+ BoundBox bounds = BoundBox::empty;
+ curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
+
+ const size_t index = step * num_segments + i;
+ aabb_data[index].min = (MTLPackedFloat3 &)bounds.min;
+ aabb_data[index].max = (MTLPackedFloat3 &)bounds.max;
+ }
+ }
+ }
+ }
+
+ if (storage_mode == MTLResourceStorageModeManaged) {
+ [aabbBuf didModifyRange:NSMakeRange(0, aabbBuf.length)];
+ }
+
+# if 0
+ for (size_t i=0; i<num_aabbs && i < 400; i++) {
+ MTLAxisAlignedBoundingBox& bb = aabb_data[i];
+ printf(" %d: %.1f,%.1f,%.1f -- %.1f,%.1f,%.1f\n", int(i), bb.min.x, bb.min.y, bb.min.z, bb.max.x, bb.max.y, bb.max.z);
+ }
+# endif
+
+ MTLAccelerationStructureGeometryDescriptor *geomDesc;
+ if (motion_blur) {
+ std::vector<MTLMotionKeyframeData *> aabb_ptrs;
+ aabb_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ MTLMotionKeyframeData *k = [MTLMotionKeyframeData data];
+ k.buffer = aabbBuf;
+ k.offset = step * num_segments * sizeof(MTLAxisAlignedBoundingBox);
+ aabb_ptrs.push_back(k);
+ }
+
+ MTLAccelerationStructureMotionBoundingBoxGeometryDescriptor *geomDescMotion =
+ [MTLAccelerationStructureMotionBoundingBoxGeometryDescriptor descriptor];
+ geomDescMotion.boundingBoxBuffers = [NSArray arrayWithObjects:aabb_ptrs.data()
+ count:aabb_ptrs.size()];
+ geomDescMotion.boundingBoxCount = num_segments;
+ geomDescMotion.boundingBoxStride = sizeof(aabb_data[0]);
+ geomDescMotion.intersectionFunctionTableOffset = 1;
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly */
+ /* (Match optix behaviour: unsigned int build_flags =
+ * OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;) */
+ geomDescMotion.allowDuplicateIntersectionFunctionInvocation = false;
+ geomDescMotion.opaque = true;
+ geomDesc = geomDescMotion;
+ }
+ else {
+ MTLAccelerationStructureBoundingBoxGeometryDescriptor *geomDescNoMotion =
+ [MTLAccelerationStructureBoundingBoxGeometryDescriptor descriptor];
+ geomDescNoMotion.boundingBoxBuffer = aabbBuf;
+ geomDescNoMotion.boundingBoxBufferOffset = 0;
+ geomDescNoMotion.boundingBoxCount = int(num_aabbs);
+ geomDescNoMotion.boundingBoxStride = sizeof(aabb_data[0]);
+ geomDescNoMotion.intersectionFunctionTableOffset = 1;
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly */
+ /* (Match optix behaviour: unsigned int build_flags =
+ * OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;) */
+ geomDescNoMotion.allowDuplicateIntersectionFunctionInvocation = false;
+ geomDescNoMotion.opaque = true;
+ geomDesc = geomDescNoMotion;
+ }
+
+ MTLPrimitiveAccelerationStructureDescriptor *accelDesc =
+ [MTLPrimitiveAccelerationStructureDescriptor descriptor];
+ accelDesc.geometryDescriptors = @[ geomDesc ];
+
+ if (motion_blur) {
+ accelDesc.motionStartTime = 0.0f;
+ accelDesc.motionEndTime = 1.0f;
+ accelDesc.motionStartBorderMode = MTLMotionBorderModeVanish;
+ accelDesc.motionEndBorderMode = MTLMotionBorderModeVanish;
+ accelDesc.motionKeyframeCount = num_motion_steps;
+ }
+
+ if (!use_fast_trace_bvh) {
+ accelDesc.usage |= (MTLAccelerationStructureUsageRefit |
+ MTLAccelerationStructureUsagePreferFastBuild);
+ }
+
+ MTLAccelerationStructureSizes accelSizes = [device
+ accelerationStructureSizesWithDescriptor:accelDesc];
+ id<MTLAccelerationStructure> accel_uncompressed = [device
+ newAccelerationStructureWithSize:accelSizes.accelerationStructureSize];
+ id<MTLBuffer> scratchBuf = [device newBufferWithLength:accelSizes.buildScratchBufferSize
+ options:MTLResourceStorageModePrivate];
+ id<MTLBuffer> sizeBuf = [device newBufferWithLength:8 options:MTLResourceStorageModeShared];
+ id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
+ id<MTLAccelerationStructureCommandEncoder> accelEnc =
+ [accelCommands accelerationStructureCommandEncoder];
+ if (refit) {
+ [accelEnc refitAccelerationStructure:accel_struct
+ descriptor:accelDesc
+ destination:accel_uncompressed
+ scratchBuffer:scratchBuf
+ scratchBufferOffset:0];
+ }
+ else {
+ [accelEnc buildAccelerationStructure:accel_uncompressed
+ descriptor:accelDesc
+ scratchBuffer:scratchBuf
+ scratchBufferOffset:0];
+ }
+ if (use_fast_trace_bvh) {
+ [accelEnc writeCompactedAccelerationStructureSize:accel_uncompressed
+ toBuffer:sizeBuf
+ offset:0
+ sizeDataType:MTLDataTypeULong];
+ }
+ [accelEnc endEncoding];
+ [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
+ /* free temp resources */
+ [scratchBuf release];
+ [aabbBuf release];
+
+ if (use_fast_trace_bvh) {
+ /* Compact the accel structure */
+ uint64_t compressed_size = *(uint64_t *)sizeBuf.contents;
+
+ dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+ id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
+ id<MTLAccelerationStructureCommandEncoder> accelEnc =
+ [accelCommands accelerationStructureCommandEncoder];
+ id<MTLAccelerationStructure> accel = [device
+ newAccelerationStructureWithSize:compressed_size];
+ [accelEnc copyAndCompactAccelerationStructure:accel_uncompressed
+ toAccelerationStructure:accel];
+ [accelEnc endEncoding];
+ [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
+ uint64_t allocated_size = [accel allocatedSize];
+ stats.mem_alloc(allocated_size);
+ accel_struct = accel;
+ [accel_uncompressed release];
+ accel_struct_building = false;
+ }];
+ [accelCommands commit];
+ });
+ }
+ else {
+ /* set our acceleration structure to the uncompressed structure */
+ accel_struct = accel_uncompressed;
+
+ uint64_t allocated_size = [accel_struct allocatedSize];
+ stats.mem_alloc(allocated_size);
+ accel_struct_building = false;
+ }
+ [sizeBuf release];
+ }];
+
+ accel_struct_building = true;
+ [accelCommands commit];
+ return true;
+ }
+ return false;
+}
+
+bool BVHMetal::build_BLAS(Progress &progress,
+ id<MTLDevice> device,
+ id<MTLCommandQueue> queue,
+ bool refit)
+{
+ if (@available(macos 12.0, *)) {
+ assert(objects.size() == 1 && geometry.size() == 1);
+
+ /* Build bottom level acceleration structures (BLAS) */
+ Geometry *const geom = geometry[0];
+ switch (geom->geometry_type) {
+ case Geometry::VOLUME:
+ case Geometry::MESH:
+ return build_BLAS_mesh(progress, device, queue, geom, refit);
+ case Geometry::HAIR:
+ return build_BLAS_hair(progress, device, queue, geom, refit);
+ default:
+ return false;
+ }
+ }
+ return false;
+}
+
+bool BVHMetal::build_TLAS(Progress &progress,
+ id<MTLDevice> device,
+ id<MTLCommandQueue> queue,
+ bool refit)
+{
+ if (@available(macos 12.0, *)) {
+
+ /* we need to sync here and ensure that all BLAS have completed async generation by both GCD
+ * and Metal */
+ {
+ __block bool complete_bvh = false;
+ while (!complete_bvh) {
+ dispatch_sync(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+ complete_bvh = true;
+ for (Object *ob : objects) {
+ /* Skip non-traceable objects */
+ if (!ob->is_traceable())
+ continue;
+
+ Geometry const *geom = ob->get_geometry();
+ BVHMetal const *blas = static_cast<BVHMetal const *>(geom->bvh);
+ if (blas->accel_struct_building) {
+ complete_bvh = false;
+
+ /* We're likely waiting on a command buffer that's in flight to complete.
+ * Queue up a command buffer and wait for it to complete before checking the BLAS again
+ */
+ id<MTLCommandBuffer> command_buffer = [queue commandBuffer];
+ [command_buffer commit];
+ [command_buffer waitUntilCompleted];
+ break;
+ }
+ }
+ });
+ }
+ }
+
+ uint32_t num_instances = 0;
+ uint32_t num_motion_transforms = 0;
+ for (Object *ob : objects) {
+ /* Skip non-traceable objects */
+ if (!ob->is_traceable())
+ continue;
+ num_instances++;
+
+ if (ob->use_motion()) {
+ num_motion_transforms += max(1, ob->get_motion().size());
+ }
+ else {
+ num_motion_transforms++;
+ }
+ }
+
+ /*------------------------------------------------*/
+ BVH_status("Building TLAS | %7d instances", (int)num_instances);
+ /*------------------------------------------------*/
+
+ const bool use_fast_trace_bvh = (params.bvh_type == BVH_TYPE_STATIC);
+
+ NSMutableArray *all_blas = [NSMutableArray array];
+ unordered_map<BVHMetal const *, int> instance_mapping;
+
+ /* Lambda function to build/retrieve the BLAS index mapping */
+ auto get_blas_index = [&](BVHMetal const *blas) {
+ auto it = instance_mapping.find(blas);
+ if (it != instance_mapping.end()) {
+ return it->second;
+ }
+ else {
+ int blas_index = (int)[all_blas count];
+ instance_mapping[blas] = blas_index;
+ if (@available(macos 12.0, *)) {
+ [all_blas addObject:blas->accel_struct];
+ }
+ return blas_index;
+ }
+ };
+
+ MTLResourceOptions storage_mode;
+ if (device.hasUnifiedMemory) {
+ storage_mode = MTLResourceStorageModeShared;
+ }
+ else {
+ storage_mode = MTLResourceStorageModeManaged;
+ }
+
+ size_t instance_size;
+ if (motion_blur) {
+ instance_size = sizeof(MTLAccelerationStructureMotionInstanceDescriptor);
+ }
+ else {
+ instance_size = sizeof(MTLAccelerationStructureUserIDInstanceDescriptor);
+ }
+
+ /* Allocate a GPU buffer for the instance data and populate it */
+ id<MTLBuffer> instanceBuf = [device newBufferWithLength:num_instances * instance_size
+ options:storage_mode];
+ id<MTLBuffer> motion_transforms_buf = nil;
+ MTLPackedFloat4x3 *motion_transforms = nullptr;
+ if (motion_blur && num_motion_transforms) {
+ motion_transforms_buf = [device
+ newBufferWithLength:num_motion_transforms * sizeof(MTLPackedFloat4x3)
+ options:storage_mode];
+ motion_transforms = (MTLPackedFloat4x3 *)motion_transforms_buf.contents;
+ }
+
+ uint32_t instance_index = 0;
+ uint32_t motion_transform_index = 0;
+ for (Object *ob : objects) {
+ /* Skip non-traceable objects */
+ if (!ob->is_traceable())
+ continue;
+
+ Geometry const *geom = ob->get_geometry();
+
+ BVHMetal const *blas = static_cast<BVHMetal const *>(geom->bvh);
+ uint32_t accel_struct_index = get_blas_index(blas);
+
+ /* Add some of the object visibility bits to the mask.
+ * __prim_visibility contains the combined visibility bits of all instances, so is not
+ * reliable if they differ between instances.
+ *
+ * METAL_WIP: OptiX visibility mask can only contain 8 bits, so have to trade-off here
+ * and select just a few important ones.
+ */
+ uint32_t mask = ob->visibility_for_tracing() & 0xFF;
+
+ /* Have to have at least one bit in the mask, or else instance would always be culled. */
+ if (0 == mask) {
+ mask = 0xFF;
+ }
+
+ /* Set user instance ID to object index */
+ int object_index = ob->get_device_index();
+ uint32_t user_id = uint32_t(object_index);
+
+ /* Bake into the appropriate descriptor */
+ if (motion_blur) {
+ MTLAccelerationStructureMotionInstanceDescriptor *instances =
+ (MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents];
+ MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++];
+
+ desc.accelerationStructureIndex = accel_struct_index;
+ desc.userID = user_id;
+ desc.mask = mask;
+ desc.motionStartTime = 0.0f;
+ desc.motionEndTime = 1.0f;
+ desc.motionTransformsStartIndex = motion_transform_index;
+ desc.motionStartBorderMode = MTLMotionBorderModeVanish;
+ desc.motionEndBorderMode = MTLMotionBorderModeVanish;
+ desc.intersectionFunctionTableOffset = 0;
+
+ int key_count = ob->get_motion().size();
+ if (key_count) {
+ desc.motionTransformsCount = key_count;
+
+ Transform *keys = ob->get_motion().data();
+ for (int i = 0; i < key_count; i++) {
+ float *t = (float *)&motion_transforms[motion_transform_index++];
+ /* Transpose transform */
+ auto src = (float const *)&keys[i];
+ for (int i = 0; i < 12; i++) {
+ t[i] = src[(i / 3) + 4 * (i % 3)];
+ }
+ }
+ }
+ else {
+ desc.motionTransformsCount = 1;
+
+ float *t = (float *)&motion_transforms[motion_transform_index++];
+ if (ob->get_geometry()->is_instanced()) {
+ /* Transpose transform */
+ auto src = (float const *)&ob->get_tfm();
+ for (int i = 0; i < 12; i++) {
+ t[i] = src[(i / 3) + 4 * (i % 3)];
+ }
+ }
+ else {
+ /* Clear transform to identity matrix */
+ t[0] = t[4] = t[8] = 1.0f;
+ }
+ }
+ }
+ else {
+ MTLAccelerationStructureUserIDInstanceDescriptor *instances =
+ (MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents];
+ MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++];
+
+ desc.accelerationStructureIndex = accel_struct_index;
+ desc.userID = user_id;
+ desc.mask = mask;
+ desc.intersectionFunctionTableOffset = 0;
+
+ float *t = (float *)&desc.transformationMatrix;
+ if (ob->get_geometry()->is_instanced()) {
+ /* Transpose transform */
+ auto src = (float const *)&ob->get_tfm();
+ for (int i = 0; i < 12; i++) {
+ t[i] = src[(i / 3) + 4 * (i % 3)];
+ }
+ }
+ else {
+ /* Clear transform to identity matrix */
+ t[0] = t[4] = t[8] = 1.0f;
+ }
+ }
+ }
+
+ if (storage_mode == MTLResourceStorageModeManaged) {
+ [instanceBuf didModifyRange:NSMakeRange(0, instanceBuf.length)];
+ if (motion_transforms_buf) {
+ [motion_transforms_buf didModifyRange:NSMakeRange(0, motion_transforms_buf.length)];
+ assert(num_motion_transforms == motion_transform_index);
+ }
+ }
+
+ MTLInstanceAccelerationStructureDescriptor *accelDesc =
+ [MTLInstanceAccelerationStructureDescriptor descriptor];
+ accelDesc.instanceCount = num_instances;
+ accelDesc.instanceDescriptorType = MTLAccelerationStructureInstanceDescriptorTypeUserID;
+ accelDesc.instanceDescriptorBuffer = instanceBuf;
+ accelDesc.instanceDescriptorBufferOffset = 0;
+ accelDesc.instanceDescriptorStride = instance_size;
+ accelDesc.instancedAccelerationStructures = all_blas;
+
+ if (motion_blur) {
+ accelDesc.instanceDescriptorType = MTLAccelerationStructureInstanceDescriptorTypeMotion;
+ accelDesc.motionTransformBuffer = motion_transforms_buf;
+ accelDesc.motionTransformCount = num_motion_transforms;
+ }
+
+ if (!use_fast_trace_bvh) {
+ accelDesc.usage |= (MTLAccelerationStructureUsageRefit |
+ MTLAccelerationStructureUsagePreferFastBuild);
+ }
+
+ MTLAccelerationStructureSizes accelSizes = [device
+ accelerationStructureSizesWithDescriptor:accelDesc];
+ id<MTLAccelerationStructure> accel = [device
+ newAccelerationStructureWithSize:accelSizes.accelerationStructureSize];
+ id<MTLBuffer> scratchBuf = [device newBufferWithLength:accelSizes.buildScratchBufferSize
+ options:MTLResourceStorageModePrivate];
+ id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
+ id<MTLAccelerationStructureCommandEncoder> accelEnc =
+ [accelCommands accelerationStructureCommandEncoder];
+ if (refit) {
+ [accelEnc refitAccelerationStructure:accel_struct
+ descriptor:accelDesc
+ destination:accel
+ scratchBuffer:scratchBuf
+ scratchBufferOffset:0];
+ }
+ else {
+ [accelEnc buildAccelerationStructure:accel
+ descriptor:accelDesc
+ scratchBuffer:scratchBuf
+ scratchBufferOffset:0];
+ }
+ [accelEnc endEncoding];
+ [accelCommands commit];
+ [accelCommands waitUntilCompleted];
+
+ if (motion_transforms_buf) {
+ [motion_transforms_buf release];
+ }
+ [instanceBuf release];
+ [scratchBuf release];
+
+ uint64_t allocated_size = [accel allocatedSize];
+ stats.mem_alloc(allocated_size);
+
+ /* Cache top and bottom-level acceleration structs */
+ accel_struct = accel;
+ blas_array.clear();
+ blas_array.reserve(all_blas.count);
+ for (id<MTLAccelerationStructure> blas in all_blas) {
+ blas_array.push_back(blas);
+ }
+
+ return true;
+ }
+ return false;
+}
+
+bool BVHMetal::build(Progress &progress,
+ id<MTLDevice> device,
+ id<MTLCommandQueue> queue,
+ bool refit)
+{
+ if (@available(macos 12.0, *)) {
+ if (refit && params.bvh_type != BVH_TYPE_STATIC) {
+ assert(accel_struct);
+ }
+ else {
+ if (accel_struct) {
+ stats.mem_free(accel_struct.allocatedSize);
+ [accel_struct release];
+ accel_struct = nil;
+ }
+ }
+ }
+
+ if (!params.top_level) {
+ return build_BLAS(progress, device, queue, refit);
+ }
+ else {
+ return build_TLAS(progress, device, queue, refit);
+ }
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
diff --git a/intern/cycles/device/metal/device.h b/intern/cycles/device/metal/device.h
new file mode 100644
index 00000000000..254fbfee42b
--- /dev/null
+++ b/intern/cycles/device/metal/device.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/string.h"
+#include "util/vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_metal_init();
+
+Device *device_metal_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_metal_info(vector<DeviceInfo> &devices);
+
+string device_metal_capabilities();
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/metal/device.mm b/intern/cycles/device/metal/device.mm
new file mode 100644
index 00000000000..bc893adea17
--- /dev/null
+++ b/intern/cycles/device/metal/device.mm
@@ -0,0 +1,136 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+# include "device/metal/device.h"
+# include "device/metal/device_impl.h"
+
+#endif
+
+#include "util/debug.h"
+#include "util/set.h"
+#include "util/system.h"
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef WITH_METAL
+
+Device *device_metal_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+ return new MetalDevice(info, stats, profiler);
+}
+
+bool device_metal_init()
+{
+ return true;
+}
+
+static int device_metal_get_num_devices_safe(uint32_t *num_devices)
+{
+ *num_devices = MTLCopyAllDevices().count;
+ return 0;
+}
+
+void device_metal_info(vector<DeviceInfo> &devices)
+{
+ uint32_t num_devices = 0;
+ device_metal_get_num_devices_safe(&num_devices);
+ if (num_devices == 0) {
+ return;
+ }
+
+ vector<MetalPlatformDevice> usable_devices;
+ MetalInfo::get_usable_devices(&usable_devices);
+ /* Devices are numbered consecutively across platforms. */
+ set<string> unique_ids;
+ int device_index = 0;
+ for (MetalPlatformDevice &device : usable_devices) {
+ /* Compute unique ID for persistent user preferences. */
+ const string &device_name = device.device_name;
+ string id = string("METAL_") + device_name;
+
+ /* Hardware ID might not be unique, add device number in that case. */
+ if (unique_ids.find(id) != unique_ids.end()) {
+ id += string_printf("_ID_%d", device_index);
+ }
+ unique_ids.insert(id);
+
+ /* Create DeviceInfo. */
+ DeviceInfo info;
+ info.type = DEVICE_METAL;
+ info.description = string_remove_trademark(string(device_name));
+
+ /* Ensure unique naming on Apple Silicon / SoC devices which return the same string for CPU and
+ * GPU */
+ if (info.description == system_cpu_brand_string()) {
+ info.description += " (GPU)";
+ }
+
+ info.num = device_index;
+ /* We don't know if it's used for display, but assume it is. */
+ info.display_device = true;
+ info.denoisers = DENOISER_NONE;
+ info.id = id;
+
+ devices.push_back(info);
+ device_index++;
+ }
+}
+
+string device_metal_capabilities()
+{
+ string result = "";
+ string error_msg = "";
+ uint32_t num_devices = 0;
+ assert(!device_metal_get_num_devices_safe(&num_devices));
+ if (num_devices == 0) {
+ return "No Metal devices found\n";
+ }
+ result += string_printf("Number of devices: %u\n", num_devices);
+
+ NSArray<id<MTLDevice>> *allDevices = MTLCopyAllDevices();
+ for (id<MTLDevice> device in allDevices) {
+ result += string_printf("\t\tDevice: %s\n", [device.name UTF8String]);
+ }
+
+ return result;
+}
+
+#else
+
+Device *device_metal_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+ return nullptr;
+}
+
+bool device_metal_init()
+{
+ return false;
+}
+
+void device_metal_info(vector<DeviceInfo> &devices)
+{
+}
+
+string device_metal_capabilities()
+{
+ return "";
+}
+
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h
new file mode 100644
index 00000000000..a420a3ba704
--- /dev/null
+++ b/intern/cycles/device/metal/device_impl.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_METAL
+
+# include "bvh/bvh.h"
+# include "device/device.h"
+# include "device/metal/bvh.h"
+# include "device/metal/device.h"
+# include "device/metal/kernel.h"
+# include "device/metal/queue.h"
+# include "device/metal/util.h"
+
+# include <Metal/Metal.h>
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+class MetalDevice : public Device {
+ public:
+ id<MTLDevice> mtlDevice = nil;
+ id<MTLLibrary> mtlLibrary[PSO_NUM] = {nil};
+ id<MTLArgumentEncoder> mtlBufferKernelParamsEncoder =
+ nil; /* encoder used for fetching device pointers from MTLBuffers */
+ id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
+ id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
+ nil; /* encoder used for fetching device pointers from MTLBuffers */
+ string source_used_for_compile[PSO_NUM];
+
+ KernelParamsMetal launch_params = {0};
+
+ /* MetalRT members ----------------------------------*/
+ BVHMetal *bvhMetalRT = nullptr;
+ bool motion_blur = false;
+ id<MTLArgumentEncoder> mtlASArgEncoder =
+ nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */
+ /*---------------------------------------------------*/
+
+ string device_name;
+ MetalGPUVendor device_vendor;
+
+ uint kernel_features;
+ MTLResourceOptions default_storage_mode;
+ int max_threads_per_threadgroup;
+
+ int mtlDevId = 0;
+ bool first_error = true;
+
+ struct MetalMem {
+ device_memory *mem = nullptr;
+ int pointer_index = -1;
+ id<MTLBuffer> mtlBuffer = nil;
+ id<MTLTexture> mtlTexture = nil;
+ uint64_t offset = 0;
+ uint64_t size = 0;
+ void *hostPtr = nullptr;
+ bool use_UMA = false; /* If true, UMA memory in shared_pointer is being used. */
+ };
+ typedef map<device_memory *, unique_ptr<MetalMem>> MetalMemMap;
+ MetalMemMap metal_mem_map;
+ std::vector<id<MTLResource>> delayed_free_list;
+ std::recursive_mutex metal_mem_map_mutex;
+
+ /* Bindless Textures */
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+ id<MTLArgumentEncoder> mtlTextureArgEncoder = nil;
+ id<MTLBuffer> texture_bindings_2d = nil;
+ id<MTLBuffer> texture_bindings_3d = nil;
+ std::vector<id<MTLTexture>> texture_slot_map;
+
+ MetalDeviceKernels kernels;
+ bool use_metalrt = false;
+ bool use_function_specialisation = false;
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+ void set_error(const string &error) override;
+
+ MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+ virtual ~MetalDevice();
+
+ bool support_device(const uint /*kernel_features*/);
+
+ bool check_peer_access(Device *peer_device) override;
+
+ bool use_adaptive_compilation();
+
+ string get_source(const uint kernel_features);
+
+ string compile_kernel(const uint kernel_features, const char *name);
+
+ virtual bool load_kernels(const uint kernel_features) override;
+
+ void reserve_local_memory(const uint kernel_features);
+
+ void init_host_memory();
+
+ void load_texture_info();
+
+ virtual bool should_use_graphics_interop() override;
+
+ virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+ virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
+
+ /* ------------------------------------------------------------------ */
+ /* low-level memory management */
+
+ MetalMem *generic_alloc(device_memory &mem);
+
+ void generic_copy_to(device_memory &mem);
+
+ void generic_free(device_memory &mem);
+
+ void mem_alloc(device_memory &mem) override;
+
+ void mem_copy_to(device_memory &mem) override;
+
+ void mem_copy_from(device_memory &mem)
+ {
+ mem_copy_from(mem, -1, -1, -1, -1);
+ }
+ void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
+
+ void mem_zero(device_memory &mem) override;
+
+ void mem_free(device_memory &mem) override;
+
+ device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;
+
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void global_alloc(device_memory &mem);
+
+ void global_free(device_memory &mem);
+
+ void tex_alloc(device_texture &mem);
+
+ void tex_alloc_as_buffer(device_texture &mem);
+
+ void tex_free(device_texture &mem);
+
+ void flush_delayed_free_list();
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
new file mode 100644
index 00000000000..69a08ca07d1
--- /dev/null
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -0,0 +1,1008 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+# include "device/metal/device_impl.h"
+# include "device/metal/device.h"
+
+# include "util/debug.h"
+# include "util/md5.h"
+# include "util/path.h"
+
+CCL_NAMESPACE_BEGIN
+
+class MetalDevice;
+
+BVHLayoutMask MetalDevice::get_bvh_layout_mask() const
+{
+ return use_metalrt ? BVH_LAYOUT_METAL : BVH_LAYOUT_BVH2;
+}
+
+void MetalDevice::set_error(const string &error)
+{
+ static std::mutex s_error_mutex;
+ std::lock_guard<std::mutex> lock(s_error_mutex);
+
+ Device::set_error(error);
+
+ if (first_error) {
+ fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
+ fprintf(stderr,
+ "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
+ first_error = false;
+ }
+}
+
+MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+ mtlDevId = info.num;
+
+ /* select chosen device */
+ vector<MetalPlatformDevice> usable_devices;
+ MetalInfo::get_usable_devices(&usable_devices);
+ if (usable_devices.size() == 0) {
+ set_error("Metal: no devices found.");
+ return;
+ }
+ assert(mtlDevId < usable_devices.size());
+ MetalPlatformDevice &platform_device = usable_devices[mtlDevId];
+ mtlDevice = platform_device.device_id;
+ device_name = platform_device.device_name;
+ device_vendor = MetalInfo::get_vendor_from_device_name(device_name);
+ assert(device_vendor != METAL_GPU_UNKNOWN);
+ metal_printf("Creating new Cycles device for Metal: %s\n", device_name.c_str());
+
+ /* determine default storage mode based on whether UMA is supported */
+
+ default_storage_mode = MTLResourceStorageModeManaged;
+
+ if (@available(macos 11.0, *)) {
+ if ([mtlDevice hasUnifiedMemory]) {
+ default_storage_mode = MTLResourceStorageModeShared;
+ init_host_memory();
+ }
+ }
+
+ texture_bindings_2d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode];
+ texture_bindings_3d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode];
+
+ stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
+
+ switch (device_vendor) {
+ default:
+ break;
+ case METAL_GPU_INTEL: {
+ use_metalrt = false;
+ max_threads_per_threadgroup = 64;
+ break;
+ }
+ case METAL_GPU_AMD: {
+ use_metalrt = false;
+ max_threads_per_threadgroup = 128;
+ break;
+ }
+ case METAL_GPU_APPLE: {
+ use_metalrt = true;
+ max_threads_per_threadgroup = 512;
+ break;
+ }
+ }
+
+ if (auto metalrt = getenv("CYCLES_METALRT")) {
+ use_metalrt = (atoi(metalrt) != 0);
+ }
+
+ MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
+ arg_desc_params.dataType = MTLDataTypePointer;
+ arg_desc_params.access = MTLArgumentAccessReadOnly;
+ arg_desc_params.arrayLength = sizeof(KernelParamsMetal) / sizeof(device_ptr);
+ mtlBufferKernelParamsEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_params ]];
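+  /* KernelParamsMetal is encoded as a flat array of device pointers
+   * (sizeof(KernelParamsMetal) / sizeof(device_ptr) entries). */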
+
+ MTLArgumentDescriptor *arg_desc_texture = [[MTLArgumentDescriptor alloc] init];
+ arg_desc_texture.dataType = MTLDataTypeTexture;
+ arg_desc_texture.access = MTLArgumentAccessReadOnly;
+ mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
+
+ /* command queue for non-tracing work on the GPU */
+ mtlGeneralCommandQueue = [mtlDevice newCommandQueue];
+
+ /* Acceleration structure arg encoder, if needed */
+ if (@available(macos 12.0, *)) {
+ if (use_metalrt) {
+ MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
+ arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
+ arg_desc_as.access = MTLArgumentAccessReadOnly;
+ mtlASArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_as ]];
+ [arg_desc_as release];
+ }
+ }
+
+ /* Build the arg encoder for the ancillary bindings */
+ {
+ NSMutableArray *ancillary_desc = [[NSMutableArray alloc] init];
+
+ int index = 0;
+ MTLArgumentDescriptor *arg_desc_tex = [[MTLArgumentDescriptor alloc] init];
+ arg_desc_tex.dataType = MTLDataTypePointer;
+ arg_desc_tex.access = MTLArgumentAccessReadOnly;
+
+ arg_desc_tex.index = index++;
+ [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_2d */
+ arg_desc_tex.index = index++;
+ [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_3d */
+
+ [arg_desc_tex release];
+
+ if (@available(macos 12.0, *)) {
+ if (use_metalrt) {
+ MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
+ arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
+ arg_desc_as.access = MTLArgumentAccessReadOnly;
+
+ MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
+ arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
+ arg_desc_ift.access = MTLArgumentAccessReadOnly;
+
+ arg_desc_as.index = index++;
+ [ancillary_desc addObject:[arg_desc_as copy]]; /* accel_struct */
+ arg_desc_ift.index = index++;
+ [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_default */
+ arg_desc_ift.index = index++;
+ [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
+ arg_desc_ift.index = index++;
+ [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
+
+ [arg_desc_ift release];
+ [arg_desc_as release];
+ }
+ }
+
+ mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
+
+ for (int i = 0; i < ancillary_desc.count; i++) {
+ [ancillary_desc[i] release];
+ }
+ [ancillary_desc release];
+ }
+ [arg_desc_params release];
+ [arg_desc_texture release];
+}
+
+MetalDevice::~MetalDevice()
+{
+ for (auto &tex : texture_slot_map) {
+ if (tex) {
+ [tex release];
+ tex = nil;
+ }
+ }
+ flush_delayed_free_list();
+
+ if (texture_bindings_2d) {
+ stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
+
+ [texture_bindings_2d release];
+ [texture_bindings_3d release];
+ }
+ [mtlTextureArgEncoder release];
+ [mtlBufferKernelParamsEncoder release];
+ [mtlASArgEncoder release];
+ [mtlAncillaryArgEncoder release];
+ [mtlGeneralCommandQueue release];
+ [mtlDevice release];
+
+ texture_info.free();
+}
+
+bool MetalDevice::support_device(const uint /*kernel_features*/)
+{
+ return true;
+}
+
+bool MetalDevice::check_peer_access(Device *peer_device)
+{
+ assert(0);
+ /* does peer access make sense? */
+ return false;
+}
+
+bool MetalDevice::use_adaptive_compilation()
+{
+ return DebugFlags().metal.adaptive_compile;
+}
+
+string MetalDevice::get_source(const uint kernel_features)
+{
+ string build_options;
+
+ if (use_adaptive_compilation()) {
+ build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
+ }
+
+ if (use_metalrt) {
+ build_options += "-D__METALRT__ ";
+ if (motion_blur) {
+ build_options += "-D__METALRT_MOTION__ ";
+ }
+ }
+
+# ifdef WITH_CYCLES_DEBUG
+ build_options += "-D__KERNEL_DEBUG__ ";
+# endif
+
+ switch (device_vendor) {
+ default:
+ break;
+ case METAL_GPU_INTEL:
+ build_options += "-D__KERNEL_METAL_INTEL__ ";
+ break;
+ case METAL_GPU_AMD:
+ build_options += "-D__KERNEL_METAL_AMD__ ";
+ break;
+ case METAL_GPU_APPLE:
+ build_options += "-D__KERNEL_METAL_APPLE__ ";
+ break;
+ }
+
+ /* reformat -D defines list into compilable form */
+ vector<string> components;
+ string_replace(build_options, "-D", "");
+ string_split(components, build_options, " ");
+
+ string globalDefines;
+ for (const string &component : components) {
+ vector<string> assignments;
+ string_split(assignments, component, "=");
+ if (assignments.size() == 2)
+ globalDefines += string_printf(
+ "#define %s %s\n", assignments[0].c_str(), assignments[1].c_str());
+ else
+ globalDefines += string_printf("#define %s\n", assignments[0].c_str());
+ }
+
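+  /* For example (illustrative only): with MetalRT enabled on an Apple GPU, the
+   * resulting preamble would contain lines such as
+   *   #define __METALRT__
+   *   #define __KERNEL_METAL_APPLE__
+   * prepended to the kernel include below. */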
+ string source = globalDefines + "\n#include \"kernel/device/metal/kernel.metal\"\n";
+ source = path_source_replace_includes(source, path_get("source"));
+
+ metal_printf("Global defines:\n%s\n", globalDefines.c_str());
+
+ return source;
+}
+
+bool MetalDevice::load_kernels(const uint _kernel_features)
+{
+ kernel_features = _kernel_features;
+
+ /* check if GPU is supported */
+ if (!support_device(kernel_features))
+ return false;
+
+  /* Keep track of whether motion blur is enabled, so as to enable/disable motion in BVH builds.
+   * This is necessary since objects may be reported to have motion if the Vector pass is
+   * active, but may still need to be rendered without motion blur if motion blur itself is
+   * disabled. */
+ motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
+
+ NSError *error = NULL;
+
+ for (int i = 0; i < PSO_NUM; i++) {
+ if (mtlLibrary[i]) {
+ [mtlLibrary[i] release];
+ mtlLibrary[i] = nil;
+ }
+ }
+
+ MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
+
+ options.fastMathEnabled = YES;
+ options.languageVersion = MTLLanguageVersion2_1;
+
+ if (@available(macOS 11.0, *)) {
+ options.languageVersion = MTLLanguageVersion2_3;
+ }
+ if (@available(macOS 12.0, *)) {
+ options.languageVersion = MTLLanguageVersion2_4;
+ }
+
+ string metalsrc;
+
+ /* local helper: dump source to disk and return filepath */
+ auto dump_source = [&](int kernel_type) -> string {
+ string &source = source_used_for_compile[kernel_type];
+ string metalsrc = path_cache_get(path_join("kernels",
+ string_printf("%s.%s.metal",
+ kernel_type_as_string(kernel_type),
+ util_md5_string(source).c_str())));
+ path_write_text(metalsrc, source);
+ return metalsrc;
+ };
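+  /* e.g. "<cache_dir>/kernels/PSO_GENERIC.<md5-of-source>.metal" (illustrative path). */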
+
+ /* local helper: fetch the kernel source code, adjust it for specific PSO_.. kernel_type flavor,
+ * then compile it into a MTLLibrary */
+ auto fetch_and_compile_source = [&](int kernel_type) {
+    /* record the source used to compile this library, for hash building later */
+ string &source = source_used_for_compile[kernel_type];
+
+ switch (kernel_type) {
+ case PSO_GENERIC: {
+ source = get_source(kernel_features);
+ break;
+ }
+ case PSO_SPECIALISED: {
+ /* PSO_SPECIALISED derives from PSO_GENERIC */
+ string &generic_source = source_used_for_compile[PSO_GENERIC];
+ if (generic_source.empty()) {
+ generic_source = get_source(kernel_features);
+ }
+ source = "#define __KERNEL_METAL_USE_FUNCTION_SPECIALISATION__\n" + generic_source;
+ break;
+ }
+ default:
+ assert(0);
+ }
+
+ /* create MTLLibrary (front-end compilation) */
+ mtlLibrary[kernel_type] = [mtlDevice newLibraryWithSource:@(source.c_str())
+ options:options
+ error:&error];
+
+ bool do_source_dump = (getenv("CYCLES_METAL_DUMP_SOURCE") != nullptr);
+
+ if (!mtlLibrary[kernel_type] || do_source_dump) {
+ string metalsrc = dump_source(kernel_type);
+
+ if (!mtlLibrary[kernel_type]) {
+ NSString *err = [error localizedDescription];
+ set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
+
+ return false;
+ }
+ }
+ return true;
+ };
+
+  if (!fetch_and_compile_source(PSO_GENERIC)) {
+    [options release];
+    return false;
+  }
+
+ if (use_function_specialisation) {
+ fetch_and_compile_source(PSO_SPECIALISED);
+ }
+
+ metal_printf("Front-end compilation finished\n");
+
+ bool result = kernels.load(this, PSO_GENERIC);
+
+ [options release];
+ reserve_local_memory(kernel_features);
+
+ return result;
+}
+
+void MetalDevice::reserve_local_memory(const uint kernel_features)
+{
+ /* METAL_WIP - implement this */
+}
+
+void MetalDevice::init_host_memory()
+{
+ /* METAL_WIP - implement this */
+}
+
+void MetalDevice::load_texture_info()
+{
+ if (need_texture_info) {
+ /* Unset flag before copying. */
+ need_texture_info = false;
+ texture_info.copy_to_device();
+
+ int num_textures = texture_info.size();
+
+ for (int tex = 0; tex < num_textures; tex++) {
+ uint64_t offset = tex * sizeof(void *);
+
+ id<MTLTexture> metal_texture = texture_slot_map[tex];
+ if (!metal_texture) {
+ [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
+ [mtlTextureArgEncoder setTexture:nil atIndex:0];
+ [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
+ [mtlTextureArgEncoder setTexture:nil atIndex:0];
+ }
+ else {
+ MTLTextureType type = metal_texture.textureType;
+ [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
+ [mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
+ [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
+ [mtlTextureArgEncoder setTexture:type == MTLTextureType3D ? metal_texture : nil atIndex:0];
+ }
+ }
+ if (default_storage_mode == MTLResourceStorageModeManaged) {
+ [texture_bindings_2d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))];
+ [texture_bindings_3d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))];
+ }
+ }
+}
+
+MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem)
+{
+ size_t size = mem.memory_size();
+
+ mem.device_pointer = 0;
+
+ id<MTLBuffer> metal_buffer = nil;
+ if (size > 0) {
+ MTLResourceOptions options = default_storage_mode;
+ if (mem.type == MEM_DEVICE_ONLY) {
+ options = MTLResourceStorageModePrivate;
+ }
+
+ metal_buffer = [mtlDevice newBufferWithLength:size options:options];
+
+ if (!metal_buffer) {
+ set_error("System is out of GPU memory");
+ return nullptr;
+ }
+ }
+
+ if (mem.name) {
+ VLOG(2) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+
+ mem.device_size = metal_buffer.allocatedSize;
+ stats.mem_alloc(mem.device_size);
+
+  metal_buffer.label = [NSString stringWithFormat:@"%s", mem.name];
+
+ std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
+
+ assert(metal_mem_map.count(&mem) == 0); /* assert against double-alloc */
+ MetalMem *mmem = new MetalMem;
+ metal_mem_map[&mem] = std::unique_ptr<MetalMem>(mmem);
+
+ mmem->mem = &mem;
+ mmem->mtlBuffer = metal_buffer;
+ mmem->offset = 0;
+ mmem->size = size;
+ if (mem.type != MEM_DEVICE_ONLY) {
+ mmem->hostPtr = [metal_buffer contents];
+ }
+ else {
+ mmem->hostPtr = nullptr;
+ }
+
+ /* encode device_pointer as (MetalMem*) in order to handle resource relocation and device pointer
+ * recalculation */
+ mem.device_pointer = device_ptr(mmem);
+
+ if (metal_buffer.storageMode == MTLResourceStorageModeShared) {
+ /* Replace host pointer with our host allocation. */
+
+ if (mem.host_pointer && mem.host_pointer != mmem->hostPtr) {
+ memcpy(mmem->hostPtr, mem.host_pointer, size);
+
+ mem.host_free();
+ mem.host_pointer = mmem->hostPtr;
+ }
+ mem.shared_pointer = mmem->hostPtr;
+ mem.shared_counter++;
+ mmem->use_UMA = true;
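+    /* shared_counter reference-counts this UMA allocation so that generic_free()
+     * releases the MTLBuffer only after its last user is freed. */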
+ }
+ else {
+ mmem->use_UMA = false;
+ }
+
+ return mmem;
+}
+
+void MetalDevice::generic_copy_to(device_memory &mem)
+{
+ if (!mem.host_pointer || !mem.device_pointer) {
+ return;
+ }
+
+ std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
+ if (!metal_mem_map.at(&mem)->use_UMA || mem.host_pointer != mem.shared_pointer) {
+ MetalMem &mmem = *metal_mem_map.at(&mem);
+ memcpy(mmem.hostPtr, mem.host_pointer, mem.memory_size());
+ if (mmem.mtlBuffer.storageMode == MTLStorageModeManaged) {
+ [mmem.mtlBuffer didModifyRange:NSMakeRange(0, mem.memory_size())];
+ }
+ }
+}
+
+void MetalDevice::generic_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
+ MetalMem &mmem = *metal_mem_map.at(&mem);
+ size_t size = mmem.size;
+
+    /* If mmem.use_UMA is true, reference counting is used
+     * to safely free memory. */
+
+ bool free_mtlBuffer = false;
+
+ if (mmem.use_UMA) {
+ assert(mem.shared_pointer);
+ if (mem.shared_pointer) {
+ assert(mem.shared_counter > 0);
+ if (--mem.shared_counter == 0) {
+ free_mtlBuffer = true;
+ }
+ }
+ }
+ else {
+ free_mtlBuffer = true;
+ }
+
+ if (free_mtlBuffer) {
+ if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) {
+ /* Safely move the device-side data back to the host before it is freed. */
+ mem.host_pointer = mem.host_alloc(size);
+ memcpy(mem.host_pointer, mem.shared_pointer, size);
+ mmem.use_UMA = false;
+ }
+
+ mem.shared_pointer = 0;
+
+ /* Free device memory. */
+ delayed_free_list.push_back(mmem.mtlBuffer);
+ mmem.mtlBuffer = nil;
+ }
+
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ metal_mem_map.erase(&mem);
+ }
+}
+
+void MetalDevice::mem_alloc(device_memory &mem)
+{
+  if (mem.type == MEM_TEXTURE) {
+    assert(!"mem_alloc not supported for textures.");
+  }
+  else {
+    generic_alloc(mem);
+  }
+}
+
+void MetalDevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ generic_alloc(mem);
+ }
+ generic_copy_to(mem);
+ }
+}
+
+void MetalDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
+{
+ if (mem.host_pointer) {
+
+    /* w/h are unsigned, so the "copy everything" sentinel is (size_t)-1 rather
+     * than a negative value. */
+    bool subcopy = (w != (size_t)-1 && h != (size_t)-1);
+ const size_t size = subcopy ? (elem * w * h) : mem.memory_size();
+ const size_t offset = subcopy ? (elem * y * w) : 0;
+
+ if (mem.device_pointer) {
+ std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
+ MetalMem &mmem = *metal_mem_map.at(&mem);
+
+ if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
+
+ id<MTLCommandBuffer> cmdBuffer = [mtlGeneralCommandQueue commandBuffer];
+ id<MTLBlitCommandEncoder> blitEncoder = [cmdBuffer blitCommandEncoder];
+ [blitEncoder synchronizeResource:mmem.mtlBuffer];
+ [blitEncoder endEncoding];
+ [cmdBuffer commit];
+ [cmdBuffer waitUntilCompleted];
+ }
+
+ if (mem.host_pointer != mmem.hostPtr) {
+ memcpy((uchar *)mem.host_pointer + offset, (uchar *)mmem.hostPtr + offset, size);
+ }
+ }
+ else {
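+      /* No device-side allocation exists; zero-fill the requested host range instead. */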
+ memset((char *)mem.host_pointer + offset, 0, size);
+ }
+ }
+}
+
+void MetalDevice::mem_zero(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+ if (!mem.device_pointer) {
+ return;
+ }
+
+ size_t size = mem.memory_size();
+ std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
+ MetalMem &mmem = *metal_mem_map.at(&mem);
+ memset(mmem.hostPtr, 0, size);
+ if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
+ [mmem.mtlBuffer didModifyRange:NSMakeRange(0, size)];
+ }
+}
+
+void MetalDevice::mem_free(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ }
+ else {
+ generic_free(mem);
+ }
+}
+
+device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
+{
+ /* METAL_WIP - revive if necessary */
+ assert(0);
+ return 0;
+}
+
+void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ if (strcmp(name, "__data") == 0) {
+ assert(size == sizeof(KernelData));
+ memcpy((uint8_t *)&launch_params + offsetof(KernelParamsMetal, data), host, size);
+ return;
+ }
+
+ auto update_launch_pointers =
+ [&](size_t offset, void *data, size_t data_size, size_t pointers_size) {
+ memcpy((uint8_t *)&launch_params + offset, data, data_size);
+
+ MetalMem **mmem = (MetalMem **)data;
+ int pointer_count = pointers_size / sizeof(device_ptr);
+ int pointer_index = offset / sizeof(device_ptr);
+ for (int i = 0; i < pointer_count; i++) {
+ if (mmem[i]) {
+ mmem[i]->pointer_index = pointer_index + i;
+ }
+ }
+ };
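+  /* Recording pointer_index in each MetalMem allows the device pointer stored in
+   * launch_params to be re-patched if the underlying resource is relocated. */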
+
+ /* Update data storage pointers in launch parameters. */
+ if (strcmp(name, "__integrator_state") == 0) {
+ /* IntegratorStateGPU is contiguous pointers */
+ const size_t pointer_block_size = sizeof(IntegratorStateGPU);
+ update_launch_pointers(
+ offsetof(KernelParamsMetal, __integrator_state), host, size, pointer_block_size);
+ }
+# define KERNEL_TEX(data_type, tex_name) \
+ else if (strcmp(name, #tex_name) == 0) \
+ { \
+ update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \
+ }
+# include "kernel/textures.h"
+# undef KERNEL_TEX
+}
+
+void MetalDevice::global_alloc(device_memory &mem)
+{
+ if (mem.is_resident(this)) {
+ generic_alloc(mem);
+ generic_copy_to(mem);
+ }
+
+ const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
+}
+
+void MetalDevice::global_free(device_memory &mem)
+{
+ if (mem.is_resident(this) && mem.device_pointer) {
+ generic_free(mem);
+ }
+}
+
+void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
+{
+ generic_alloc(mem);
+ generic_copy_to(mem);
+
+ /* Resize once */
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+    /* Allocate some slots in advance, to reduce the number
+     * of re-allocations. */
+ texture_info.resize(round_up(slot + 1, 128));
+ }
+
+ mem.info.data = (uint64_t)mem.device_pointer;
+
+ /* Set Mapping and tag that we need to (re-)upload to device */
+ texture_info[slot] = mem.info;
+ need_texture_info = true;
+}
+
+void MetalDevice::tex_alloc(device_texture &mem)
+{
+ MTLStorageMode storage_mode = MTLStorageModeManaged;
+ if (@available(macos 10.15, *)) {
+    /* Intel GPUs don't support MTLStorageModeShared for MTLTextures. */
+    if ([mtlDevice hasUnifiedMemory] && device_vendor != METAL_GPU_INTEL) {
+ storage_mode = MTLStorageModeShared;
+ }
+ }
+
+ /* General variables for both architectures */
+ string bind_name = mem.name;
+ size_t dsize = datatype_size(mem.data_type);
+ size_t size = mem.memory_size();
+
+ /* sampler_index maps into the GPU's constant 'metal_samplers' array */
+ uint64_t sampler_index = mem.info.extension;
+ if (mem.info.interpolation != INTERPOLATION_CLOSEST) {
+ sampler_index += 3;
+ }
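+  /* (This assumes 'metal_samplers' holds three extension-mode variants with
+   * closest filtering, followed by the same three with linear filtering.) */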
+
+ /* Image Texture Storage */
+ MTLPixelFormat format;
+ switch (mem.data_type) {
+ case TYPE_UCHAR: {
+ MTLPixelFormat formats[] = {MTLPixelFormatR8Unorm,
+ MTLPixelFormatRG8Unorm,
+ MTLPixelFormatInvalid,
+ MTLPixelFormatRGBA8Unorm};
+ format = formats[mem.data_elements - 1];
+ } break;
+ case TYPE_UINT16: {
+ MTLPixelFormat formats[] = {MTLPixelFormatR16Unorm,
+ MTLPixelFormatRG16Unorm,
+ MTLPixelFormatInvalid,
+ MTLPixelFormatRGBA16Unorm};
+ format = formats[mem.data_elements - 1];
+ } break;
+ case TYPE_UINT: {
+ MTLPixelFormat formats[] = {MTLPixelFormatR32Uint,
+ MTLPixelFormatRG32Uint,
+ MTLPixelFormatInvalid,
+ MTLPixelFormatRGBA32Uint};
+ format = formats[mem.data_elements - 1];
+ } break;
+ case TYPE_INT: {
+ MTLPixelFormat formats[] = {MTLPixelFormatR32Sint,
+ MTLPixelFormatRG32Sint,
+ MTLPixelFormatInvalid,
+ MTLPixelFormatRGBA32Sint};
+ format = formats[mem.data_elements - 1];
+ } break;
+ case TYPE_FLOAT: {
+ MTLPixelFormat formats[] = {MTLPixelFormatR32Float,
+ MTLPixelFormatRG32Float,
+ MTLPixelFormatInvalid,
+ MTLPixelFormatRGBA32Float};
+ format = formats[mem.data_elements - 1];
+ } break;
+ case TYPE_HALF: {
+ MTLPixelFormat formats[] = {MTLPixelFormatR16Float,
+ MTLPixelFormatRG16Float,
+ MTLPixelFormatInvalid,
+ MTLPixelFormatRGBA16Float};
+ format = formats[mem.data_elements - 1];
+ } break;
+ default:
+ assert(0);
+ return;
+ }
+
+ assert(format != MTLPixelFormatInvalid);
+
+ id<MTLTexture> mtlTexture = nil;
+ size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+
+ if (mem.data_depth > 1) {
+    /* 3D texture, uploaded slice by slice. */
+ MTLTextureDescriptor *desc;
+
+ desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format
+ width:mem.data_width
+ height:mem.data_height
+ mipmapped:NO];
+
+ desc.storageMode = storage_mode;
+ desc.usage = MTLTextureUsageShaderRead;
+
+ desc.textureType = MTLTextureType3D;
+ desc.depth = mem.data_depth;
+
+ VLOG(2) << "Texture 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
+ assert(mtlTexture);
+
+ if (!mtlTexture) {
+ return;
+ }
+
+ const size_t imageBytes = src_pitch * mem.data_height;
+ for (size_t d = 0; d < mem.data_depth; d++) {
+ const size_t offset = d * imageBytes;
+ [mtlTexture replaceRegion:MTLRegionMake3D(0, 0, d, mem.data_width, mem.data_height, 1)
+ mipmapLevel:0
+ slice:0
+ withBytes:(uint8_t *)mem.host_pointer + offset
+ bytesPerRow:src_pitch
+ bytesPerImage:0];
+ }
+ }
+ else if (mem.data_height > 0) {
+ /* 2D texture */
+ MTLTextureDescriptor *desc;
+
+ desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format
+ width:mem.data_width
+ height:mem.data_height
+ mipmapped:NO];
+
+ desc.storageMode = storage_mode;
+ desc.usage = MTLTextureUsageShaderRead;
+
+ VLOG(2) << "Texture 2D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
+ assert(mtlTexture);
+
+ [mtlTexture replaceRegion:MTLRegionMake2D(0, 0, mem.data_width, mem.data_height)
+ mipmapLevel:0
+ withBytes:mem.host_pointer
+ bytesPerRow:src_pitch];
+ }
+ else {
+ assert(0);
+ /* 1D texture, using linear memory. */
+ }
+
+ mem.device_pointer = (device_ptr)mtlTexture;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
+ MetalMem *mmem = new MetalMem;
+ metal_mem_map[&mem] = std::unique_ptr<MetalMem>(mmem);
+ mmem->mem = &mem;
+ mmem->mtlTexture = mtlTexture;
+
+ /* Resize once */
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+    /* Allocate some slots in advance, to reduce the number
+     * of re-allocations. */
+ texture_info.resize(slot + 128);
+ texture_slot_map.resize(slot + 128);
+
+      size_t min_buffer_length = sizeof(void *) * texture_info.size();
+ if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
+ if (texture_bindings_2d) {
+ delayed_free_list.push_back(texture_bindings_2d);
+ delayed_free_list.push_back(texture_bindings_3d);
+
+ stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
+ }
+ texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
+ options:default_storage_mode];
+ texture_bindings_3d = [mtlDevice newBufferWithLength:min_buffer_length
+ options:default_storage_mode];
+
+ stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
+ }
+ }
+
+ /* optimise the texture for GPU access */
+ id<MTLCommandBuffer> commandBuffer = [mtlGeneralCommandQueue commandBuffer];
+ id<MTLBlitCommandEncoder> blitCommandEncoder = [commandBuffer blitCommandEncoder];
+ [blitCommandEncoder optimizeContentsForGPUAccess:mtlTexture];
+ [blitCommandEncoder endEncoding];
+ [commandBuffer commit];
+
+ /* Set Mapping and tag that we need to (re-)upload to device */
+ texture_slot_map[slot] = mtlTexture;
+ texture_info[slot] = mem.info;
+ need_texture_info = true;
+
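+  /* Pack the bindless lookup: the low 32 bits hold the texture slot, the high
+   * 32 bits hold the sampler index. */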
+ texture_info[slot].data = uint64_t(slot) | (sampler_index << 32);
+}
+
+void MetalDevice::tex_free(device_texture &mem)
+{
+  std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
+  if (metal_mem_map.count(&mem)) {
+ MetalMem &mmem = *metal_mem_map.at(&mem);
+
+ assert(texture_slot_map[mem.slot] == mmem.mtlTexture);
+ texture_slot_map[mem.slot] = nil;
+
+ if (mmem.mtlTexture) {
+ /* Free bindless texture. */
+ delayed_free_list.push_back(mmem.mtlTexture);
+ mmem.mtlTexture = nil;
+ }
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+ metal_mem_map.erase(&mem);
+ }
+}
+
+unique_ptr<DeviceQueue> MetalDevice::gpu_queue_create()
+{
+ return make_unique<MetalDeviceQueue>(this);
+}
+
+bool MetalDevice::should_use_graphics_interop()
+{
+ /* METAL_WIP - provide fast interop */
+ return false;
+}
+
+void MetalDevice::flush_delayed_free_list()
+{
+ /* free any Metal buffers that may have been freed by host while a command
+ * buffer was being generated. This function should be called after each
+ * completion of a command buffer */
+ std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
+ for (auto &it : delayed_free_list) {
+ [it release];
+ }
+ delayed_free_list.clear();
+}
+
+void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+ if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
+ Device::build_bvh(bvh, progress, refit);
+ return;
+ }
+
+ BVHMetal *bvh_metal = static_cast<BVHMetal *>(bvh);
+ bvh_metal->motion_blur = motion_blur;
+ if (bvh_metal->build(progress, mtlDevice, mtlGeneralCommandQueue, refit)) {
+
+ if (@available(macos 11.0, *)) {
+ if (bvh->params.top_level) {
+ bvhMetalRT = bvh_metal;
+ }
+ }
+ }
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h
new file mode 100644
index 00000000000..d4941627fd8
--- /dev/null
+++ b/intern/cycles/device/metal/kernel.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_METAL
+
+# include "device/kernel.h"
+# include <Metal/Metal.h>
+
+CCL_NAMESPACE_BEGIN
+
+class MetalDevice;
+
+enum {
+ METALRT_FUNC_DEFAULT_TRI,
+ METALRT_FUNC_DEFAULT_BOX,
+ METALRT_FUNC_SHADOW_TRI,
+ METALRT_FUNC_SHADOW_BOX,
+ METALRT_FUNC_LOCAL_TRI,
+ METALRT_FUNC_LOCAL_BOX,
+ METALRT_FUNC_CURVE_RIBBON,
+ METALRT_FUNC_CURVE_RIBBON_SHADOW,
+ METALRT_FUNC_CURVE_ALL,
+ METALRT_FUNC_CURVE_ALL_SHADOW,
+ METALRT_FUNC_NUM
+};
+
+enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
+
+/* Pipeline State Object types */
+enum {
+ /* A kernel that can be used with all scenes, supporting all features.
+ * It is slow to compile, but only needs to be compiled once and is then
+ * cached for future render sessions. This allows a render to get underway
+ * on the GPU quickly.
+ */
+ PSO_GENERIC,
+
+  /* A kernel that is relatively quick to compile, but is specialised for the
+   * scene being rendered. It contains only the required functionality and even
+   * bakes in constant values, which means it must be recompiled whenever a
+   * dependent setting is changed. The render performance of this kernel is
+   * significantly better though, which justifies the extra compile time.
+   */
+ /* METAL_WIP: This isn't used and will require more changes to enable. */
+ PSO_SPECIALISED,
+
+ PSO_NUM
+};
+
+const char *kernel_type_as_string(int kernel_type);
+
+struct MetalKernelPipeline {
+  void release()
+  {
+    if (pipeline) {
+      [pipeline release];
+      pipeline = nil;
+    }
+    if (function) {
+      [function release];
+      function = nil;
+    }
+    if (@available(macOS 11.0, *)) {
+      for (int i = 0; i < METALRT_TABLE_NUM; i++) {
+        if (intersection_func_table[i]) {
+          [intersection_func_table[i] release];
+          intersection_func_table[i] = nil;
+        }
+      }
+    }
+  }
+
+ bool loaded = false;
+ id<MTLFunction> function = nil;
+ id<MTLComputePipelineState> pipeline = nil;
+
+ API_AVAILABLE(macos(11.0))
+ id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
+};
+
+struct MetalKernelLoadDesc {
+ int pso_index = 0;
+ const char *function_name = nullptr;
+ int kernel_index = 0;
+ int threads_per_threadgroup = 0;
+ MTLFunctionConstantValues *constant_values = nullptr;
+ NSArray *linked_functions = nullptr;
+
+ struct IntersectorFunctions {
+ NSArray *defaults;
+ NSArray *shadow;
+ NSArray *local;
+ NSArray *operator[](int index) const
+ {
+ if (index == METALRT_TABLE_DEFAULT)
+ return defaults;
+ if (index == METALRT_TABLE_SHADOW)
+ return shadow;
+ return local;
+ }
+ } intersector_functions = {nullptr};
+};
+
+/* Metal kernel and associate occupancy information. */
+class MetalDeviceKernel {
+ public:
+ ~MetalDeviceKernel();
+
+ bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5);
+
+ void mark_loaded(int pso_index)
+ {
+ pso[pso_index].loaded = true;
+ }
+
+ int get_num_threads_per_block() const
+ {
+ return num_threads_per_block;
+ }
+ const MetalKernelPipeline &get_pso() const;
+
+ double load_duration = 0.0;
+
+ private:
+ MetalKernelPipeline pso[PSO_NUM];
+
+ int num_threads_per_block = 0;
+};
+
+/* Cache of Metal kernels for each DeviceKernel. */
+class MetalDeviceKernels {
+ public:
+ bool load(MetalDevice *device, int kernel_type);
+ bool available(DeviceKernel kernel) const;
+ const MetalDeviceKernel &get(DeviceKernel kernel) const;
+
+ MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM];
+
+ id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}};
+
+ string loaded_md5[PSO_NUM];
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
new file mode 100644
index 00000000000..9cbad10bf5a
--- /dev/null
+++ b/intern/cycles/device/metal/kernel.mm
@@ -0,0 +1,523 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+# include "device/metal/kernel.h"
+# include "device/metal/device_impl.h"
+# include "util/md5.h"
+# include "util/path.h"
+# include "util/tbb.h"
+# include "util/time.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* limit to 2 MTLCompiler instances */
+int max_mtlcompiler_threads = 2;
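+/* The parallel pipeline builds below run inside a TBB arena capped at this width. */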
+
+const char *kernel_type_as_string(int kernel_type)
+{
+ switch (kernel_type) {
+ case PSO_GENERIC:
+ return "PSO_GENERIC";
+ case PSO_SPECIALISED:
+ return "PSO_SPECIALISED";
+ default:
+ assert(0);
+ }
+ return "";
+}
+
+MetalDeviceKernel::~MetalDeviceKernel()
+{
+ for (int i = 0; i < PSO_NUM; i++) {
+ pso[i].release();
+ }
+}
+
+bool MetalDeviceKernel::load(MetalDevice *device,
+ MetalKernelLoadDesc const &desc_in,
+ MD5Hash const &md5)
+{
+ __block MetalKernelLoadDesc const desc(desc_in);
+ if (desc.kernel_index == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ /* skip megakernel */
+ return true;
+ }
+
+ bool use_binary_archive = true;
+ if (getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
+ use_binary_archive = false;
+ }
+
+ id<MTLBinaryArchive> archive = nil;
+ string metalbin_path;
+ if (use_binary_archive) {
+ NSProcessInfo *processInfo = [NSProcessInfo processInfo];
+ string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
+ MD5Hash local_md5(md5);
+ local_md5.append(osVersion);
+ string metalbin_name = string(desc.function_name) + "." + local_md5.get_hex() +
+ to_string(desc.pso_index) + ".bin";
+ metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
+ path_create_directories(metalbin_path);
+
+    if (path_exists(metalbin_path)) {
+ if (@available(macOS 11.0, *)) {
+ MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
+ archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
+ archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
+ [archiveDesc release];
+ }
+ }
+ }
+
+ NSString *entryPoint = [@(desc.function_name) copy];
+
+ NSError *error = NULL;
+ if (@available(macOS 11.0, *)) {
+ MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
+ func_desc.name = entryPoint;
+ if (desc.constant_values) {
+ func_desc.constantValues = desc.constant_values;
+ }
+ pso[desc.pso_index].function = [device->mtlLibrary[desc.pso_index]
+ newFunctionWithDescriptor:func_desc
+ error:&error];
+ }
+ [entryPoint release];
+
+ if (pso[desc.pso_index].function == nil) {
+ NSString *err = [error localizedDescription];
+ string errors = [err UTF8String];
+
+ device->set_error(
+ string_printf("Error getting function \"%s\": %s", desc.function_name, errors.c_str()));
+ return false;
+ }
+
+ pso[desc.pso_index].function.label = [@(desc.function_name) copy];
+
+ __block MTLComputePipelineDescriptor *computePipelineStateDescriptor =
+ [[MTLComputePipelineDescriptor alloc] init];
+
+ computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
+ computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable;
+ computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;
+
+ computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = desc.threads_per_threadgroup;
+ computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;
+
+ computePipelineStateDescriptor.computeFunction = pso[desc.pso_index].function;
+ if (@available(macOS 11.0, *)) {
+ /* Attach the additional functions to an MTLLinkedFunctions object */
+ if (desc.linked_functions) {
+ computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init];
+ computePipelineStateDescriptor.linkedFunctions.functions = desc.linked_functions;
+ }
+
+ computePipelineStateDescriptor.maxCallStackDepth = 1;
+ }
+
+ /* Create a new Compute pipeline state object */
+ MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
+
+ bool creating_new_archive = false;
+ if (@available(macOS 11.0, *)) {
+ if (use_binary_archive) {
+ if (!archive) {
+ MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
+ archiveDesc.url = nil;
+ archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
+ creating_new_archive = true;
+
+ double starttime = time_dt();
+
+ if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
+ error:&error]) {
+ NSString *errStr = [error localizedDescription];
+ metal_printf("Failed to add PSO to archive:\n%s\n",
+ errStr ? [errStr UTF8String] : "nil");
+ }
+ else {
+ double duration = time_dt() - starttime;
+ metal_printf("%2d | %-55s | %7.2fs\n",
+ desc.kernel_index,
+ device_kernel_as_string((DeviceKernel)desc.kernel_index),
+ duration);
+
+ if (desc.pso_index == PSO_GENERIC) {
+ this->load_duration = duration;
+ }
+ }
+ }
+ computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
+ pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
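+        /* On an archive miss the PSO comes back nil, and the completion handler
+         * below falls back to compiling without the archive. */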
+ }
+ }
+
+ double starttime = time_dt();
+
+ MTLNewComputePipelineStateWithReflectionCompletionHandler completionHandler = ^(
+ id<MTLComputePipelineState> computePipelineState,
+ MTLComputePipelineReflection *reflection,
+ NSError *error) {
+ bool recreate_archive = false;
+ if (computePipelineState == nil && archive && !creating_new_archive) {
+
+ assert(0);
+
+ NSString *errStr = [error localizedDescription];
+ metal_printf(
+ "Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
+ "(error: %s)\n",
+ device_kernel_as_string((DeviceKernel)desc.kernel_index),
+ errStr ? [errStr UTF8String] : "nil");
+ computePipelineState = [device->mtlDevice
+ newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
+ options:MTLPipelineOptionNone
+ reflection:nullptr
+ error:&error];
+ recreate_archive = true;
+ }
+
+ double duration = time_dt() - starttime;
+
+ if (computePipelineState == nil) {
+ NSString *errStr = [error localizedDescription];
+ device->set_error(string_printf("Failed to create compute pipeline state \"%s\", error: \n",
+ device_kernel_as_string((DeviceKernel)desc.kernel_index)) +
+ (errStr ? [errStr UTF8String] : "nil"));
+ metal_printf("%2d | %-55s | %7.2fs | FAILED!\n",
+ desc.kernel_index,
+ device_kernel_as_string((DeviceKernel)desc.kernel_index),
+ duration);
+ return;
+ }
+
+ pso[desc.pso_index].pipeline = computePipelineState;
+ num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
+ computePipelineState.threadExecutionWidth);
+ num_threads_per_block = std::max(num_threads_per_block,
+ (int)computePipelineState.threadExecutionWidth);
+
+ if (!use_binary_archive) {
+ metal_printf("%2d | %-55s | %7.2fs\n",
+ desc.kernel_index,
+ device_kernel_as_string((DeviceKernel)desc.kernel_index),
+ duration);
+
+ if (desc.pso_index == PSO_GENERIC) {
+ this->load_duration = duration;
+ }
+ }
+
+ if (@available(macOS 11.0, *)) {
+ if (creating_new_archive || recreate_archive) {
+ if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())]
+ error:&error]) {
+ metal_printf("Failed to save binary archive, error:\n%s\n",
+ [[error localizedDescription] UTF8String]);
+ }
+ }
+ }
+
+ [computePipelineStateDescriptor release];
+ computePipelineStateDescriptor = nil;
+
+ if (device->use_metalrt && desc.linked_functions) {
+ for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+ if (@available(macOS 11.0, *)) {
+          MTLIntersectionFunctionTableDescriptor *ift_desc =
+              [[MTLIntersectionFunctionTableDescriptor alloc] init];
+          ift_desc.functionCount = desc.intersector_functions[table].count;
+
+          pso[desc.pso_index].intersection_func_table[table] = [pso[desc.pso_index].pipeline
+              newIntersectionFunctionTableWithDescriptor:ift_desc];
+          [ift_desc release];
+
+ /* Finally write the function handles into this pipeline's table */
+ for (int i = 0; i < 2; i++) {
+ id<MTLFunctionHandle> handle = [pso[desc.pso_index].pipeline
+ functionHandleWithFunction:desc.intersector_functions[table][i]];
+ [pso[desc.pso_index].intersection_func_table[table] setFunction:handle atIndex:i];
+ }
+ }
+ }
+ }
+
+ mark_loaded(desc.pso_index);
+ };
+
+ if (desc.pso_index == PSO_SPECIALISED) {
+ /* Asynchronous load */
+ dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+ NSError *error;
+ id<MTLComputePipelineState> pipeline = [device->mtlDevice
+ newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
+ options:pipelineOptions
+ reflection:nullptr
+ error:&error];
+ completionHandler(pipeline, nullptr, error);
+ });
+ }
+ else {
+ /* Block on load to ensure we continue with a valid kernel function */
+ id<MTLComputePipelineState> pipeline = [device->mtlDevice
+ newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
+ options:pipelineOptions
+ reflection:nullptr
+ error:&error];
+ completionHandler(pipeline, nullptr, error);
+ }
+
+ return true;
+}
+
+const MetalKernelPipeline &MetalDeviceKernel::get_pso() const
+{
+ if (pso[PSO_SPECIALISED].loaded) {
+ return pso[PSO_SPECIALISED];
+ }
+
+ assert(pso[PSO_GENERIC].loaded);
+ return pso[PSO_GENERIC];
+}
+
+bool MetalDeviceKernels::load(MetalDevice *device, int kernel_type)
+{
+ bool any_error = false;
+
+ MD5Hash md5;
+
+ /* Build the function constant table */
+ MTLFunctionConstantValues *constant_values = nullptr;
+ if (kernel_type == PSO_SPECIALISED) {
+ constant_values = [MTLFunctionConstantValues new];
+
+# define KERNEL_FILM(_type, name) \
+ [constant_values setConstantValue:&data.film.name \
+ type:get_MTLDataType_##_type() \
+ atIndex:KernelData_film_##name]; \
+ md5.append((uint8_t *)&data.film.name, sizeof(data.film.name));
+
+# define KERNEL_BACKGROUND(_type, name) \
+ [constant_values setConstantValue:&data.background.name \
+ type:get_MTLDataType_##_type() \
+ atIndex:KernelData_background_##name]; \
+ md5.append((uint8_t *)&data.background.name, sizeof(data.background.name));
+
+# define KERNEL_INTEGRATOR(_type, name) \
+ [constant_values setConstantValue:&data.integrator.name \
+ type:get_MTLDataType_##_type() \
+ atIndex:KernelData_integrator_##name]; \
+ md5.append((uint8_t *)&data.integrator.name, sizeof(data.integrator.name));
+
+# define KERNEL_BVH(_type, name) \
+ [constant_values setConstantValue:&data.bvh.name \
+ type:get_MTLDataType_##_type() \
+ atIndex:KernelData_bvh_##name]; \
+ md5.append((uint8_t *)&data.bvh.name, sizeof(data.bvh.name));
+
+ /* METAL_WIP: populate constant_values based on KernelData */
+ assert(0);
+ /*
+ const KernelData &data = device->launch_params.data;
+ # include "kernel/types/background.h"
+ # include "kernel/types/bvh.h"
+ # include "kernel/types/film.h"
+ # include "kernel/types/integrator.h"
+ */
+ }
+
+ if (device->use_metalrt) {
+ if (@available(macOS 11.0, *)) {
+ /* create the id<MTLFunction> for each intersection function */
+ const char *function_names[] = {
+ "__anyhit__cycles_metalrt_visibility_test_tri",
+ "__anyhit__cycles_metalrt_visibility_test_box",
+ "__anyhit__cycles_metalrt_shadow_all_hit_tri",
+ "__anyhit__cycles_metalrt_shadow_all_hit_box",
+ "__anyhit__cycles_metalrt_local_hit_tri",
+ "__anyhit__cycles_metalrt_local_hit_box",
+ "__intersection__curve_ribbon",
+ "__intersection__curve_ribbon_shadow",
+ "__intersection__curve_all",
+ "__intersection__curve_all_shadow",
+ };
+ assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM);
+
+ MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
+ if (kernel_type == PSO_SPECIALISED) {
+ desc.constantValues = constant_values;
+ }
+ for (int i = 0; i < METALRT_FUNC_NUM; i++) {
+ const char *function_name = function_names[i];
+ desc.name = [@(function_name) copy];
+
+ NSError *error = NULL;
+ rt_intersection_funcs[kernel_type][i] = [device->mtlLibrary[kernel_type]
+ newFunctionWithDescriptor:desc
+ error:&error];
+
+ if (rt_intersection_funcs[kernel_type][i] == nil) {
+ NSString *err = [error localizedDescription];
+ string errors = [err UTF8String];
+
+ device->set_error(string_printf(
+ "Error getting intersection function \"%s\": %s", function_name, errors.c_str()));
+ any_error = true;
+ break;
+ }
+
+ rt_intersection_funcs[kernel_type][i].label = [@(function_name) copy];
+ }
+ }
+ }
+ md5.append(device->source_used_for_compile[kernel_type]);
+
+ string hash = md5.get_hex();
+ if (loaded_md5[kernel_type] == hash) {
+ return true;
+ }
+
+ if (!any_error) {
+ NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
+ NSArray *function_list = nil;
+
+ if (device->use_metalrt) {
+ id<MTLFunction> box_intersect_default = nil;
+ id<MTLFunction> box_intersect_shadow = nil;
+ if (device->kernel_features & KERNEL_FEATURE_HAIR) {
+ /* Add curve intersection programs. */
+ if (device->kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+ /* Slower programs for thick hair since that also slows down ribbons.
+ * Ideally this should not be needed. */
+ box_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL];
+ box_intersect_shadow = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL_SHADOW];
+ }
+ else {
+ box_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON];
+ box_intersect_shadow =
+ rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON_SHADOW];
+ }
+ }
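+      /* (Curves are presumably declared to MetalRT as custom bounding-box
+       * primitives, hence the curve programs stand in for the plain box
+       * functions in the tables below.) */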
+ table_functions[METALRT_TABLE_DEFAULT] = [NSArray
+ arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_TRI],
+ box_intersect_default ?
+ box_intersect_default :
+ rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX],
+ nil];
+ table_functions[METALRT_TABLE_SHADOW] = [NSArray
+ arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_TRI],
+ box_intersect_shadow ?
+ box_intersect_shadow :
+ rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX],
+ nil];
+ table_functions[METALRT_TABLE_LOCAL] = [NSArray
+ arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_TRI],
+ rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX],
+ nil];
+
+ NSMutableSet *unique_functions = [NSMutableSet
+ setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
+ [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
+ [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
+
+ function_list = [[NSArray arrayWithArray:[unique_functions allObjects]]
+ sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
+ return [f1.label compare:f2.label];
+ }];
+
+ unique_functions = nil;
+ }
+
+ metal_printf("Starting %s \"cycles_metal_...\" pipeline builds\n",
+ kernel_type_as_string(kernel_type));
+
+ tbb::task_arena local_arena(max_mtlcompiler_threads);
+ local_arena.execute([&]() {
+ tbb::parallel_for(int(0), int(DEVICE_KERNEL_NUM), [&](int i) {
+ /* skip megakernel */
+ if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ return;
+ }
+
+ /* Only specialise kernels where it can make an impact */
+ if (kernel_type == PSO_SPECIALISED) {
+ if (i < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+ i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ return;
+ }
+ }
+
+ MetalDeviceKernel &kernel = kernels_[i];
+
+ const std::string function_name = std::string("cycles_metal_") +
+ device_kernel_as_string((DeviceKernel)i);
+ int threads_per_threadgroup = device->max_threads_per_threadgroup;
+ if (i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && i < DEVICE_KERNEL_INTEGRATOR_RESET) {
+ /* Always use 512 for the sorting kernels */
+ threads_per_threadgroup = 512;
+ }
+
+ NSArray *kernel_function_list = nil;
+
+ if (i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+ i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+ i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
+ i == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ kernel_function_list = function_list;
+ }
+
+ MetalKernelLoadDesc desc;
+ desc.pso_index = kernel_type;
+ desc.kernel_index = i;
+ desc.linked_functions = kernel_function_list;
+ desc.intersector_functions.defaults = table_functions[METALRT_TABLE_DEFAULT];
+ desc.intersector_functions.shadow = table_functions[METALRT_TABLE_SHADOW];
+ desc.intersector_functions.local = table_functions[METALRT_TABLE_LOCAL];
+ desc.constant_values = constant_values;
+ desc.threads_per_threadgroup = threads_per_threadgroup;
+ desc.function_name = function_name.c_str();
+
+ bool success = kernel.load(device, desc, md5);
+
+ any_error |= !success;
+ });
+ });
+ }
+
+ bool loaded = !any_error;
+ if (loaded) {
+ loaded_md5[kernel_type] = hash;
+ }
+ return loaded;
+}
+
+const MetalDeviceKernel &MetalDeviceKernels::get(DeviceKernel kernel) const
+{
+ return kernels_[(int)kernel];
+}
+
+bool MetalDeviceKernels::available(DeviceKernel kernel) const
+{
+ return kernels_[(int)kernel].get_pso().function != nil;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL*/
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
new file mode 100644
index 00000000000..7aafcb2efe4
--- /dev/null
+++ b/intern/cycles/device/metal/queue.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_METAL
+
+# include "device/kernel.h"
+# include "device/memory.h"
+# include "device/queue.h"
+
+# include "device/metal/util.h"
+# include "kernel/device/metal/globals.h"
+
+# define metal_printf VLOG(4) << string_printf
+
+CCL_NAMESPACE_BEGIN
+
+class MetalDevice;
+
+/* Metal implementation of DeviceQueue. */
+class MetalDeviceQueue : public DeviceQueue {
+ public:
+ MetalDeviceQueue(MetalDevice *device);
+ ~MetalDeviceQueue();
+
+ virtual int num_concurrent_states(const size_t) const override;
+ virtual int num_concurrent_busy_states() const override;
+
+ virtual void init_execution() override;
+
+ virtual bool enqueue(DeviceKernel kernel,
+ const int work_size,
+ DeviceKernelArguments const &args) override;
+
+ virtual bool synchronize() override;
+
+ virtual void zero_to_device(device_memory &mem) override;
+ virtual void copy_to_device(device_memory &mem) override;
+ virtual void copy_from_device(device_memory &mem) override;
+
+ virtual bool kernel_available(DeviceKernel kernel) const override;
+
+ protected:
+ void prepare_resources(DeviceKernel kernel);
+
+ id<MTLComputeCommandEncoder> get_compute_encoder(DeviceKernel kernel);
+ id<MTLBlitCommandEncoder> get_blit_encoder();
+
+ MetalDevice *metal_device;
+ MetalBufferPool temp_buffer_pool;
+
+ API_AVAILABLE(macos(11.0), ios(14.0))
+ MTLCommandBufferDescriptor *command_buffer_desc = nullptr;
+ id<MTLDevice> mtlDevice = nil;
+ id<MTLCommandQueue> mtlCommandQueue = nil;
+ id<MTLCommandBuffer> mtlCommandBuffer = nil;
+ id<MTLComputeCommandEncoder> mtlComputeEncoder = nil;
+ id<MTLBlitCommandEncoder> mtlBlitEncoder = nil;
+ id<MTLSharedEvent> shared_event = nil;
+ MTLSharedEventListener *shared_event_listener = nil;
+
+ dispatch_queue_t event_queue;
+ dispatch_semaphore_t wait_semaphore;
+
+ struct CopyBack {
+ void *host_pointer;
+ void *gpu_mem;
+ uint64_t size;
+ };
+ std::vector<CopyBack> copy_back_mem;
+
+ uint64_t shared_event_id;
+ uint64_t command_buffers_submitted = 0;
+ uint64_t command_buffers_completed = 0;
+ Stats &stats;
+
+ void close_compute_encoder();
+ void close_blit_encoder();
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
new file mode 100644
index 00000000000..2545fa0f3b0
--- /dev/null
+++ b/intern/cycles/device/metal/queue.mm
@@ -0,0 +1,602 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+# include "device/metal/queue.h"
+
+# include "device/metal/device_impl.h"
+# include "device/metal/kernel.h"
+
+# include "util/path.h"
+# include "util/string.h"
+# include "util/time.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* MetalDeviceQueue */
+
+MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device)
+ : DeviceQueue(device), metal_device(device), stats(device->stats)
+{
+ if (@available(macos 11.0, *)) {
+ command_buffer_desc = [[MTLCommandBufferDescriptor alloc] init];
+ command_buffer_desc.errorOptions = MTLCommandBufferErrorOptionEncoderExecutionStatus;
+ }
+
+ mtlDevice = device->mtlDevice;
+ mtlCommandQueue = [mtlDevice newCommandQueue];
+
+ shared_event = [mtlDevice newSharedEvent];
+ shared_event_id = 1;
+
+ /* Shareable event listener */
+ event_queue = dispatch_queue_create("com.cycles.metal.event_queue", NULL);
+ shared_event_listener = [[MTLSharedEventListener alloc] initWithDispatchQueue:event_queue];
+
+ wait_semaphore = dispatch_semaphore_create(0);
+}
+
+MetalDeviceQueue::~MetalDeviceQueue()
+{
+ /* Tidying up here isn't really practical - we should expect and require the work
+ * queue to be empty here. */
+ assert(mtlCommandBuffer == nil);
+ assert(command_buffers_submitted == command_buffers_completed);
+
+ [shared_event_listener release];
+ [shared_event release];
+
+ if (@available(macos 11.0, *)) {
+ [command_buffer_desc release];
+ }
+ if (mtlCommandQueue) {
+ [mtlCommandQueue release];
+ mtlCommandQueue = nil;
+ }
+}
+
+int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+{
+ /* METAL_WIP */
+ /* TODO: compute automatically. */
+ /* TODO: must have at least num_threads_per_block. */
+ int result = 1048576;
+ if (metal_device->device_vendor == METAL_GPU_AMD) {
+ result *= 2;
+ }
+ else if (metal_device->device_vendor == METAL_GPU_APPLE) {
+ result *= 4;
+ }
+ return result;
+}
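+
+/* Example: on an Apple GPU the above requests 1048576 * 4 = 4194304
+ * concurrent integrator states; callers multiply this count by the
+ * per-state size when allocating state buffers. */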
+
+int MetalDeviceQueue::num_concurrent_busy_states() const
+{
+ /* METAL_WIP */
+ /* TODO: compute automatically. */
+ int result = 65536;
+ if (metal_device->device_vendor == METAL_GPU_AMD) {
+ result *= 2;
+ }
+ else if (metal_device->device_vendor == METAL_GPU_APPLE) {
+ result *= 4;
+ }
+ return result;
+}
+
+void MetalDeviceQueue::init_execution()
+{
+ /* Synchronize all textures and memory copies before executing task. */
+ metal_device->load_texture_info();
+
+ synchronize();
+}
+
+bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
+ const int work_size,
+ DeviceKernelArguments const &args)
+{
+ if (metal_device->have_error()) {
+ return false;
+ }
+
+ VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
+ << work_size;
+
+ const MetalDeviceKernel &metal_kernel = metal_device->kernels.get(kernel);
+ const MetalKernelPipeline &metal_kernel_pso = metal_kernel.get_pso();
+
+ id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
+
+ /* Determine size requirement for argument buffer. */
+ size_t arg_buffer_length = 0;
+ for (size_t i = 0; i < args.count; i++) {
+ size_t size_in_bytes = args.sizes[i];
+ arg_buffer_length = round_up(arg_buffer_length, size_in_bytes) + size_in_bytes;
+ }
+ /* 256 is the Metal offset alignment for constant address space bindings */
+ arg_buffer_length = round_up(arg_buffer_length, 256);
+
+ /* Globals placed after "vanilla" arguments. */
+ size_t globals_offsets = arg_buffer_length;
+ arg_buffer_length += sizeof(KernelParamsMetal);
+ arg_buffer_length = round_up(arg_buffer_length, 256);
+
+  /* Metal ancillary bindless pointers. */
+ size_t metal_offsets = arg_buffer_length;
+ arg_buffer_length += metal_device->mtlAncillaryArgEncoder.encodedLength;
+ arg_buffer_length = round_up(arg_buffer_length, metal_device->mtlAncillaryArgEncoder.alignment);
+
+ /* Temporary buffer used to prepare arg_buffer */
+ uint8_t *init_arg_buffer = (uint8_t *)alloca(arg_buffer_length);
+ memset(init_arg_buffer, 0, arg_buffer_length);
+
+ /* Prepare the non-pointer "enqueue" arguments */
+ size_t bytes_written = 0;
+ for (size_t i = 0; i < args.count; i++) {
+ size_t size_in_bytes = args.sizes[i];
+ bytes_written = round_up(bytes_written, size_in_bytes);
+ if (args.types[i] != DeviceKernelArguments::POINTER) {
+ memcpy(init_arg_buffer + bytes_written, args.values[i], size_in_bytes);
+ }
+ bytes_written += size_in_bytes;
+ }
+
+ /* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */
+ /* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */
+ size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, __integrator_state) +
+ sizeof(IntegratorStateGPU);
+ size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
+ memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
+ (uint8_t *)&metal_device->launch_params + plain_old_launch_data_offset,
+ plain_old_launch_data_size);
+
+ /* Allocate an argument buffer. */
+ MTLResourceOptions arg_buffer_options = MTLResourceStorageModeManaged;
+ if (@available(macOS 11.0, *)) {
+ if ([mtlDevice hasUnifiedMemory]) {
+ arg_buffer_options = MTLResourceStorageModeShared;
+ }
+ }
+
+ id<MTLBuffer> arg_buffer = temp_buffer_pool.get_buffer(
+ mtlDevice, mtlCommandBuffer, arg_buffer_length, arg_buffer_options, init_arg_buffer, stats);
+
+ /* Encode the pointer "enqueue" arguments */
+ bytes_written = 0;
+ for (size_t i = 0; i < args.count; i++) {
+ size_t size_in_bytes = args.sizes[i];
+ bytes_written = round_up(bytes_written, size_in_bytes);
+ if (args.types[i] == DeviceKernelArguments::POINTER) {
+ [metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
+ offset:bytes_written];
+ if (MetalDevice::MetalMem *mmem = *(MetalDevice::MetalMem **)args.values[i]) {
+ [mtlComputeCommandEncoder useResource:mmem->mtlBuffer
+ usage:MTLResourceUsageRead | MTLResourceUsageWrite];
+ [metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
+ }
+ else {
+ if (@available(macos 12.0, *)) {
+ [metal_device->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:0];
+ }
+ }
+ }
+ bytes_written += size_in_bytes;
+ }
+
+ /* Encode KernelParamsMetal buffers */
+ [metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer offset:globals_offsets];
+
+  /* This relies on the IntegratorStateGPU layout being a contiguous block of device_ptr fields. */
+ const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) +
+ sizeof(IntegratorStateGPU);
+ for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
+ int pointer_index = offset / sizeof(device_ptr);
+ MetalDevice::MetalMem *mmem = *(
+ MetalDevice::MetalMem **)((uint8_t *)&metal_device->launch_params + offset);
+ if (mmem && (mmem->mtlBuffer || mmem->mtlTexture)) {
+ [metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
+ offset:0
+ atIndex:pointer_index];
+ }
+ else {
+ if (@available(macos 12.0, *)) {
+ [metal_device->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:pointer_index];
+ }
+ }
+ }
+ bytes_written = globals_offsets + sizeof(KernelParamsMetal);
+
+ /* Encode ancillaries */
+ [metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
+ [metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d
+ offset:0
+ atIndex:0];
+ [metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_3d
+ offset:0
+ atIndex:1];
+ if (@available(macos 12.0, *)) {
+ if (metal_device->use_metalrt) {
+ if (metal_device->bvhMetalRT) {
+ id<MTLAccelerationStructure> accel_struct = metal_device->bvhMetalRT->accel_struct;
+ [metal_device->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
+ }
+
+ for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+ if (metal_kernel_pso.intersection_func_table[table]) {
+ [metal_kernel_pso.intersection_func_table[table] setBuffer:arg_buffer
+ offset:globals_offsets
+ atIndex:1];
+ [metal_device->mtlAncillaryArgEncoder
+ setIntersectionFunctionTable:metal_kernel_pso.intersection_func_table[table]
+ atIndex:3 + table];
+ [mtlComputeCommandEncoder useResource:metal_kernel_pso.intersection_func_table[table]
+ usage:MTLResourceUsageRead];
+ }
+ else {
+ [metal_device->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
+ atIndex:3 + table];
+ }
+ }
+ }
+ bytes_written = metal_offsets + metal_device->mtlAncillaryArgEncoder.encodedLength;
+ }
+
+ if (arg_buffer.storageMode == MTLStorageModeManaged) {
+ [arg_buffer didModifyRange:NSMakeRange(0, bytes_written)];
+ }
+
+ [mtlComputeCommandEncoder setBuffer:arg_buffer offset:0 atIndex:0];
+ [mtlComputeCommandEncoder setBuffer:arg_buffer offset:globals_offsets atIndex:1];
+ [mtlComputeCommandEncoder setBuffer:arg_buffer offset:metal_offsets atIndex:2];
+
+ if (metal_device->use_metalrt) {
+ if (@available(macos 12.0, *)) {
+
+ auto bvhMetalRT = metal_device->bvhMetalRT;
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ break;
+ default:
+ bvhMetalRT = nil;
+ break;
+ }
+
+ if (bvhMetalRT) {
+        /* Mark the acceleration structure and all BLAS resources as used. */
+ [mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
+ [mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
+ count:bvhMetalRT->blas_array.size()
+ usage:MTLResourceUsageRead];
+ }
+ }
+ }
+
+ [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline];
+
+ /* Compute kernel launch parameters. */
+ const int num_threads_per_block = metal_kernel.get_num_threads_per_block();
+
+ int shared_mem_bytes = 0;
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
+ /* See parallel_active_index.h for why this amount of shared memory is needed.
+ * Rounded up to 16 bytes for Metal */
+ shared_mem_bytes = round_up((num_threads_per_block + 1) * sizeof(int), 16);
+ [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
+ break;
+
+ default:
+ break;
+ }
+
+ MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
+ divide_up(work_size, num_threads_per_block), 1, 1);
+ MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
+ [mtlComputeCommandEncoder dispatchThreadgroups:size_threadgroups_per_dispatch
+ threadsPerThreadgroup:size_threads_per_threadgroup];
+
+ [mtlCommandBuffer addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
+ NSString *kernel_name = metal_kernel_pso.function.label;
+
+ /* Enhanced command buffer errors are only available in 11.0+ */
+ if (@available(macos 11.0, *)) {
+ if (command_buffer.status == MTLCommandBufferStatusError && command_buffer.error != nil) {
+ printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]);
+ NSArray<id<MTLCommandBufferEncoderInfo>> *encoderInfos = [command_buffer.error.userInfo
+ valueForKey:MTLCommandBufferEncoderInfoErrorKey];
+ if (encoderInfos != nil) {
+ for (id<MTLCommandBufferEncoderInfo> encoderInfo : encoderInfos) {
+ NSLog(@"%@", encoderInfo);
+ }
+ }
+ id<MTLLogContainer> logs = command_buffer.logs;
+ for (id<MTLFunctionLog> log in logs) {
+ NSLog(@"%@", log);
+ }
+ }
+ else if (command_buffer.error) {
+ printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]);
+ }
+ }
+ }];
+
+ return !(metal_device->have_error());
+}
+
+bool MetalDeviceQueue::synchronize()
+{
+ if (metal_device->have_error()) {
+ return false;
+ }
+
+ if (mtlComputeEncoder) {
+ close_compute_encoder();
+ }
+ close_blit_encoder();
+
+ if (mtlCommandBuffer) {
+ uint64_t shared_event_id = this->shared_event_id++;
+
+ __block dispatch_semaphore_t block_sema = wait_semaphore;
+ [shared_event notifyListener:shared_event_listener
+ atValue:shared_event_id
+ block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) {
+ dispatch_semaphore_signal(block_sema);
+ }];
+
+ [mtlCommandBuffer encodeSignalEvent:shared_event value:shared_event_id];
+ [mtlCommandBuffer commit];
+ dispatch_semaphore_wait(wait_semaphore, DISPATCH_TIME_FOREVER);
+
+
+    for (const CopyBack &mmem : copy_back_mem) {
+      memcpy((uchar *)mmem.host_pointer, (uchar *)mmem.gpu_mem, mmem.size);
+    }
+    copy_back_mem.clear();
+
+    temp_buffer_pool.process_command_buffer_completion(mtlCommandBuffer);
+    metal_device->flush_delayed_free_list();
+
+    /* Release the command buffer only after the buffer pool has recycled the
+     * temporary buffers keyed on it. */
+    [mtlCommandBuffer release];
+    mtlCommandBuffer = nil;
+ }
+
+ return !(metal_device->have_error());
+}
+
+void MetalDeviceQueue::zero_to_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ /* Allocate on demand. */
+ if (mem.device_pointer == 0) {
+ metal_device->mem_alloc(mem);
+ }
+
+ /* Zero memory on device. */
+ assert(mem.device_pointer != 0);
+
+ std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
+ MetalDevice::MetalMem &mmem = *metal_device->metal_mem_map.at(&mem);
+ if (mmem.mtlBuffer) {
+ id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
+ [blitEncoder fillBuffer:mmem.mtlBuffer range:NSMakeRange(mmem.offset, mmem.size) value:0];
+ }
+ else {
+ metal_device->mem_zero(mem);
+ }
+}
+
+void MetalDeviceQueue::copy_to_device(device_memory &mem)
+{
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ /* Allocate on demand. */
+ if (mem.device_pointer == 0) {
+ metal_device->mem_alloc(mem);
+ }
+
+ assert(mem.device_pointer != 0);
+ assert(mem.host_pointer != nullptr);
+
+ std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
+ auto result = metal_device->metal_mem_map.find(&mem);
+ if (result != metal_device->metal_mem_map.end()) {
+ if (mem.host_pointer == mem.shared_pointer) {
+ return;
+ }
+
+ MetalDevice::MetalMem &mmem = *result->second;
+ id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
+
+ id<MTLBuffer> buffer = temp_buffer_pool.get_buffer(mtlDevice,
+ mtlCommandBuffer,
+ mmem.size,
+ MTLResourceStorageModeShared,
+ mem.host_pointer,
+ stats);
+
+ [blitEncoder copyFromBuffer:buffer
+ sourceOffset:0
+ toBuffer:mmem.mtlBuffer
+ destinationOffset:mmem.offset
+ size:mmem.size];
+ }
+ else {
+ metal_device->mem_copy_to(mem);
+ }
+}
+
+void MetalDeviceQueue::copy_from_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ assert(mem.device_pointer != 0);
+ assert(mem.host_pointer != nullptr);
+
+ std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
+ MetalDevice::MetalMem &mmem = *metal_device->metal_mem_map.at(&mem);
+ if (mmem.mtlBuffer) {
+ const size_t size = mem.memory_size();
+
+ if (mem.device_pointer) {
+ if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
+ id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
+ [blitEncoder synchronizeResource:mmem.mtlBuffer];
+ }
+ if (mem.host_pointer != mmem.hostPtr) {
+ if (mtlCommandBuffer) {
+ copy_back_mem.push_back({mem.host_pointer, mmem.hostPtr, size});
+ }
+ else {
+ memcpy((uchar *)mem.host_pointer, (uchar *)mmem.hostPtr, size);
+ }
+ }
+ }
+ else {
+ memset((char *)mem.host_pointer, 0, size);
+ }
+ }
+ else {
+ metal_device->mem_copy_from(mem);
+ }
+}
+
+bool MetalDeviceQueue::kernel_available(DeviceKernel kernel) const
+{
+ return metal_device->kernels.available(kernel);
+}
+
+void MetalDeviceQueue::prepare_resources(DeviceKernel kernel)
+{
+ std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
+
+ /* declare resource usage */
+ for (auto &it : metal_device->metal_mem_map) {
+ device_memory *mem = it.first;
+
+ MTLResourceUsage usage = MTLResourceUsageRead;
+ if (mem->type != MEM_GLOBAL && mem->type != MEM_READ_ONLY && mem->type != MEM_TEXTURE) {
+ usage |= MTLResourceUsageWrite;
+ }
+
+ if (it.second->mtlBuffer) {
+ /* METAL_WIP - use array version (i.e. useResources) */
+ [mtlComputeEncoder useResource:it.second->mtlBuffer usage:usage];
+ }
+ else if (it.second->mtlTexture) {
+ /* METAL_WIP - use array version (i.e. useResources) */
+ [mtlComputeEncoder useResource:it.second->mtlTexture usage:usage | MTLResourceUsageSample];
+ }
+ }
+
+ /* ancillaries */
+ [mtlComputeEncoder useResource:metal_device->texture_bindings_2d usage:MTLResourceUsageRead];
+ [mtlComputeEncoder useResource:metal_device->texture_bindings_3d usage:MTLResourceUsageRead];
+}
+
+id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)
+{
+ bool concurrent = (kernel < DEVICE_KERNEL_INTEGRATOR_NUM);
+
+ if (mtlComputeEncoder) {
+    if (mtlComputeEncoder.dispatchType ==
+        (concurrent ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial)) {
+ /* declare usage of MTLBuffers etc */
+ prepare_resources(kernel);
+
+ return mtlComputeEncoder;
+ }
+ close_compute_encoder();
+ }
+
+ close_blit_encoder();
+
+ if (!mtlCommandBuffer) {
+ mtlCommandBuffer = [mtlCommandQueue commandBuffer];
+ [mtlCommandBuffer retain];
+ }
+
+ mtlComputeEncoder = [mtlCommandBuffer
+ computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
+ MTLDispatchTypeSerial];
+
+ /* declare usage of MTLBuffers etc */
+ prepare_resources(kernel);
+
+ return mtlComputeEncoder;
+}
+
+id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
+{
+ if (mtlBlitEncoder) {
+ return mtlBlitEncoder;
+ }
+
+ if (mtlComputeEncoder) {
+ close_compute_encoder();
+ }
+
+ if (!mtlCommandBuffer) {
+ mtlCommandBuffer = [mtlCommandQueue commandBuffer];
+ [mtlCommandBuffer retain];
+ }
+
+ mtlBlitEncoder = [mtlCommandBuffer blitCommandEncoder];
+ return mtlBlitEncoder;
+}
+
+void MetalDeviceQueue::close_compute_encoder()
+{
+ [mtlComputeEncoder endEncoding];
+ mtlComputeEncoder = nil;
+}
+
+void MetalDeviceQueue::close_blit_encoder()
+{
+ if (mtlBlitEncoder) {
+ [mtlBlitEncoder endEncoding];
+ mtlBlitEncoder = nil;
+ }
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
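
The offset arithmetic in enqueue() above is easiest to verify with concrete numbers. A worked sketch, assuming three plain arguments of 8, 4 and 8 bytes (sizes illustrative) and the round_up helper from util/math:

    size_t round_up(size_t x, size_t multiple)
    {
      return ((x + multiple - 1) / multiple) * multiple;
    }

    /* arg0 (8 bytes): placed at round_up(0, 8) = 0,   length becomes 8.
     * arg1 (4 bytes): placed at round_up(8, 4) = 8,   length becomes 12.
     * arg2 (8 bytes): placed at round_up(12, 8) = 16, length becomes 24.
     * globals_offsets = round_up(24, 256) = 256, since constant address space
     * bindings require 256-byte alignment; KernelParamsMetal and the ancillary
     * argument-encoder region follow at further aligned offsets. */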
diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h
new file mode 100644
index 00000000000..dbeb3a5d064
--- /dev/null
+++ b/intern/cycles/device/metal/util.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_METAL
+
+# include <Metal/Metal.h>
+# include <string>
+
+# include "device/metal/device.h"
+# include "device/metal/kernel.h"
+# include "device/queue.h"
+
+# include "util/thread.h"
+
+CCL_NAMESPACE_BEGIN
+
+enum MetalGPUVendor {
+ METAL_GPU_UNKNOWN = 0,
+ METAL_GPU_APPLE = 1,
+ METAL_GPU_AMD = 2,
+ METAL_GPU_INTEL = 3,
+};
+
+/* Retains a named MTLDevice for device enumeration. */
+struct MetalPlatformDevice {
+ MetalPlatformDevice(id<MTLDevice> device, const string &device_name)
+ : device_id(device), device_name(device_name)
+ {
+ [device_id retain];
+ }
+ ~MetalPlatformDevice()
+ {
+ [device_id release];
+ }
+ id<MTLDevice> device_id;
+ string device_name;
+};
+
+/* Contains static Metal helper functions. */
+struct MetalInfo {
+ static bool device_version_check(id<MTLDevice> device);
+ static void get_usable_devices(vector<MetalPlatformDevice> *usable_devices);
+ static MetalGPUVendor get_vendor_from_device_name(string const &device_name);
+
+  /* Device information. */
+  static bool get_num_devices(uint32_t *num_devices);
+ static uint32_t get_num_devices();
+
+ static bool get_device_name(id<MTLDevice> device_id, string *device_name);
+ static string get_device_name(id<MTLDevice> device_id);
+};
+
+/* Pool of MTLBuffers whose lifetime is linked to a single MTLCommandBuffer */
+class MetalBufferPool {
+ struct MetalBufferListEntry {
+ MetalBufferListEntry(id<MTLBuffer> buffer, id<MTLCommandBuffer> command_buffer)
+ : buffer(buffer), command_buffer(command_buffer)
+ {
+ }
+
+ MetalBufferListEntry() = delete;
+
+ id<MTLBuffer> buffer;
+ id<MTLCommandBuffer> command_buffer;
+ };
+ std::vector<MetalBufferListEntry> buffer_free_list;
+ std::vector<MetalBufferListEntry> buffer_in_use_list;
+ thread_mutex buffer_mutex;
+ size_t total_temp_mem_size = 0;
+
+ public:
+ MetalBufferPool() = default;
+ ~MetalBufferPool();
+
+ id<MTLBuffer> get_buffer(id<MTLDevice> device,
+ id<MTLCommandBuffer> command_buffer,
+ NSUInteger length,
+ MTLResourceOptions options,
+ const void *pointer,
+ Stats &stats);
+ void process_command_buffer_completion(id<MTLCommandBuffer> command_buffer);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
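
The pool declared above has a simple contract: a buffer checked out against a command buffer stays on the in-use list until that command buffer is reported complete. A hedged usage sketch (variable names illustrative):

    MetalBufferPool pool;
    id<MTLBuffer> buf = pool.get_buffer(
        device, cmd_buf, 1024, MTLResourceStorageModeShared, host_data, stats);
    /* ... encode GPU work into cmd_buf that reads buf ... */
    [cmd_buf commit];
    [cmd_buf waitUntilCompleted];
    pool.process_command_buffer_completion(cmd_buf); /* buf is now reusable. */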
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
new file mode 100644
index 00000000000..d2143c389dc
--- /dev/null
+++ b/intern/cycles/device/metal/util.mm
@@ -0,0 +1,241 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+# include "device/metal/util.h"
+# include "device/metal/device_impl.h"
+# include "util/md5.h"
+# include "util/path.h"
+# include "util/string.h"
+# include "util/time.h"
+
+# include <pwd.h>
+# include <sys/shm.h>
+# include <time.h>
+
+CCL_NAMESPACE_BEGIN
+
+MetalGPUVendor MetalInfo::get_vendor_from_device_name(string const &device_name)
+{
+ if (device_name.find("Intel") != string::npos) {
+ return METAL_GPU_INTEL;
+ }
+ else if (device_name.find("AMD") != string::npos) {
+ return METAL_GPU_AMD;
+ }
+ else if (device_name.find("Apple") != string::npos) {
+ return METAL_GPU_APPLE;
+ }
+ return METAL_GPU_UNKNOWN;
+}
+
+bool MetalInfo::device_version_check(id<MTLDevice> device)
+{
+ if (@available(macos 12.0, *)) {
+ MetalGPUVendor vendor = get_vendor_from_device_name([[device name] UTF8String]);
+
+ static const char *forceIntelStr = getenv("CYCLES_METAL_FORCE_INTEL");
+ bool forceIntel = forceIntelStr ? (atoi(forceIntelStr) != 0) : false;
+
+ if (forceIntel) {
+ /* return false for non-Intel GPUs to force selection of Intel */
+ if (vendor == METAL_GPU_INTEL) {
+ return true;
+ }
+ }
+ else {
+ switch (vendor) {
+ case METAL_GPU_INTEL:
+          /* isLowPower returns true only on machines that also have an AMD
+           * GPU; on Intel-only machines it returns false. */
+ if (getenv("CYCLES_METAL_ALLOW_LOW_POWER_GPUS") || !device.isLowPower) {
+ return true;
+ }
+ return false;
+ case METAL_GPU_APPLE:
+ case METAL_GPU_AMD:
+ return true;
+ default:
+ return false;
+ }
+ }
+ }
+
+ return false;
+}
+
+void MetalInfo::get_usable_devices(vector<MetalPlatformDevice> *usable_devices)
+{
+ static bool first_time = true;
+# define FIRST_VLOG(severity) \
+ if (first_time) \
+ VLOG(severity)
+
+ usable_devices->clear();
+
+ NSArray<id<MTLDevice>> *allDevices = MTLCopyAllDevices();
+ for (id<MTLDevice> device in allDevices) {
+ string device_name;
+ if (!get_device_name(device, &device_name)) {
+ FIRST_VLOG(2) << "Failed to get device name, ignoring.";
+ continue;
+ }
+
+ static const char *forceIntelStr = getenv("CYCLES_METAL_FORCE_INTEL");
+ bool forceIntel = forceIntelStr ? (atoi(forceIntelStr) != 0) : false;
+ if (forceIntel && device_name.find("Intel") == string::npos) {
+ FIRST_VLOG(2) << "CYCLES_METAL_FORCE_INTEL causing non-Intel device " << device_name
+ << " to be ignored.";
+ continue;
+ }
+
+ if (!device_version_check(device)) {
+ FIRST_VLOG(2) << "Ignoring device " << device_name << " due to too old compiler version.";
+ continue;
+ }
+ FIRST_VLOG(2) << "Adding new device " << device_name << ".";
+    usable_devices->push_back(MetalPlatformDevice(device, device_name));
+  }
+  [allDevices release];
+  first_time = false;
+}
+
+bool MetalInfo::get_num_devices(uint32_t *num_devices)
+{
+  /* MTLCopyAllDevices() follows the Copy rule, so release the array. */
+  NSArray<id<MTLDevice>> *all_devices = MTLCopyAllDevices();
+  *num_devices = all_devices.count;
+  [all_devices release];
+ return true;
+}
+
+uint32_t MetalInfo::get_num_devices()
+{
+ uint32_t num_devices;
+ if (!get_num_devices(&num_devices)) {
+ return 0;
+ }
+ return num_devices;
+}
+
+bool MetalInfo::get_device_name(id<MTLDevice> device, string *platform_name)
+{
+ *platform_name = [device.name UTF8String];
+ return true;
+}
+
+string MetalInfo::get_device_name(id<MTLDevice> device)
+{
+ string platform_name;
+ if (!get_device_name(device, &platform_name)) {
+ return "";
+ }
+ return platform_name;
+}
+
+id<MTLBuffer> MetalBufferPool::get_buffer(id<MTLDevice> device,
+ id<MTLCommandBuffer> command_buffer,
+ NSUInteger length,
+ MTLResourceOptions options,
+ const void *pointer,
+ Stats &stats)
+{
+ id<MTLBuffer> buffer;
+
+ MTLStorageMode storageMode = MTLStorageMode((options & MTLResourceStorageModeMask) >>
+ MTLResourceStorageModeShift);
+ MTLCPUCacheMode cpuCacheMode = MTLCPUCacheMode((options & MTLResourceCPUCacheModeMask) >>
+ MTLResourceCPUCacheModeShift);
+
+ buffer_mutex.lock();
+ for (auto entry = buffer_free_list.begin(); entry != buffer_free_list.end(); entry++) {
+ MetalBufferListEntry bufferEntry = *entry;
+
+    /* Check if the buffer matches the requested length, storage mode and
+     * CPU cache mode, so it can be reused. */
+ if (bufferEntry.buffer.length == length && storageMode == bufferEntry.buffer.storageMode &&
+ cpuCacheMode == bufferEntry.buffer.cpuCacheMode) {
+ buffer = bufferEntry.buffer;
+ buffer_free_list.erase(entry);
+ bufferEntry.command_buffer = command_buffer;
+ buffer_in_use_list.push_back(bufferEntry);
+ buffer_mutex.unlock();
+
+ /* Copy over data */
+ if (pointer) {
+ memcpy(buffer.contents, pointer, length);
+ if (bufferEntry.buffer.storageMode == MTLStorageModeManaged) {
+ [buffer didModifyRange:NSMakeRange(0, length)];
+ }
+ }
+
+ return buffer;
+ }
+ }
+ // NSLog(@"Creating buffer of length %lu (%lu)", length, frameCount);
+ if (pointer) {
+ buffer = [device newBufferWithBytes:pointer length:length options:options];
+ }
+ else {
+ buffer = [device newBufferWithLength:length options:options];
+ }
+
+ MetalBufferListEntry buffer_entry(buffer, command_buffer);
+
+ stats.mem_alloc(buffer.allocatedSize);
+
+ total_temp_mem_size += buffer.allocatedSize;
+ buffer_in_use_list.push_back(buffer_entry);
+ buffer_mutex.unlock();
+
+ return buffer;
+}
+
+void MetalBufferPool::process_command_buffer_completion(id<MTLCommandBuffer> command_buffer)
+{
+ assert(command_buffer);
+ thread_scoped_lock lock(buffer_mutex);
+  /* Move all buffers that were in flight on this command buffer back into
+   * the free pool. */
+ for (auto entry = buffer_in_use_list.begin(); entry != buffer_in_use_list.end();) {
+ MetalBufferListEntry buffer_entry = *entry;
+ if (buffer_entry.command_buffer == command_buffer) {
+ entry = buffer_in_use_list.erase(entry);
+ buffer_entry.command_buffer = nil;
+ buffer_free_list.push_back(buffer_entry);
+ }
+ else {
+ entry++;
+ }
+ }
+}
+
+MetalBufferPool::~MetalBufferPool()
+{
+ thread_scoped_lock lock(buffer_mutex);
+  /* Release all buffers remaining in the free pool. */
+ for (auto entry = buffer_free_list.begin(); entry != buffer_free_list.end();) {
+ MetalBufferListEntry buffer_entry = *entry;
+
+ id<MTLBuffer> buffer = buffer_entry.buffer;
+ // NSLog(@"Releasing buffer of length %lu (%lu) (%lu outstanding)", buffer.length, frameCount,
+ // bufferFreeList.size());
+ total_temp_mem_size -= buffer.allocatedSize;
+ [buffer release];
+ entry = buffer_free_list.erase(entry);
+ }
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
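
A sketch of how the enumeration helpers above are expected to be consumed by the device registration code (the loop body is illustrative):

    vector<MetalPlatformDevice> usable_devices;
    MetalInfo::get_usable_devices(&usable_devices);
    for (const MetalPlatformDevice &dev : usable_devices) {
      MetalGPUVendor vendor = MetalInfo::get_vendor_from_device_name(dev.device_name);
      printf("Metal device: %s (vendor %d)\n", dev.device_name.c_str(), (int)vendor);
    }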
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
index 2513df63489..baab84f4035 100644
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -182,6 +182,9 @@ class MultiDevice : public Device {
else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
BVH_LAYOUT_EMBREE;
+ else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE)
+ params.bvh_layout = sub.device->info.type == DEVICE_METAL ? BVH_LAYOUT_METAL :
+ BVH_LAYOUT_EMBREE;
/* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
* (since they are put into the top level directly, see bvh_embree.cpp) */
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 0f88063e3b7..b50f492e8c7 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -19,7 +19,6 @@
#include "kernel/device/gpu/parallel_active_index.h"
#include "kernel/device/gpu/parallel_prefix_sum.h"
#include "kernel/device/gpu/parallel_sorted_index.h"
-#include "kernel/device/gpu/work_stealing.h"
#include "kernel/sample/lcg.h"
@@ -30,6 +29,8 @@
# include "kernel/device/metal/context_begin.h"
#endif
+#include "kernel/device/gpu/work_stealing.h"
+
#include "kernel/integrator/state.h"
#include "kernel/integrator/state_flow.h"
#include "kernel/integrator/state_util.h"
@@ -96,7 +97,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int state = tile->path_index_offset + tile_work_index;
uint x, y, sample;
- get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+ ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));
ccl_gpu_kernel_call(
integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample));
@@ -127,7 +128,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int state = tile->path_index_offset + tile_work_index;
uint x, y, sample;
- get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+ ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));
ccl_gpu_kernel_call(
integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample));
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index a80965ba267..a51afc37fc0 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -117,7 +117,7 @@ struct kernel_gpu_##name \
uint simd_group_index, \
uint num_simd_groups) ccl_global const; \
}; \
-kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
+kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
constant MetalAncillaries *_metal_ancillaries, \
threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal
index ba80238bb84..27dc1f44c6f 100644
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -126,7 +126,7 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
[[intersection(triangle, triangle_data, METALRT_TAGS)]]
TriangleIntersectionResult
-__anyhit__kernel_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+__anyhit__cycles_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
uint instance_id [[user_instance_id]],
uint primitive_id [[primitive_id]],
@@ -139,7 +139,7 @@ __anyhit__kernel_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params
[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
BoundingBoxIntersectionResult
-__anyhit__kernel_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
+__anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
{
/* unused function */
BoundingBoxIntersectionResult result;
@@ -274,7 +274,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
[[intersection(triangle, triangle_data, METALRT_TAGS)]]
TriangleIntersectionResult
-__anyhit__kernel_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+__anyhit__cycles_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
unsigned int object [[user_instance_id]],
unsigned int primitive_id [[primitive_id]],
@@ -292,7 +292,7 @@ __anyhit__kernel_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_p
[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
BoundingBoxIntersectionResult
-__anyhit__kernel_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]])
+__anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]])
{
/* unused function */
BoundingBoxIntersectionResult result;
@@ -345,7 +345,7 @@ inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_pa
[[intersection(triangle, triangle_data, METALRT_TAGS)]]
TriangleIntersectionResult
-__anyhit__kernel_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+__anyhit__cycles_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
unsigned int object [[user_instance_id]],
unsigned int primitive_id [[primitive_id]])
@@ -362,7 +362,7 @@ __anyhit__kernel_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_
[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
BoundingBoxIntersectionResult
-__anyhit__kernel_metalrt_visibility_test_box(const float ray_tmax [[max_distance]])
+__anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance]])
{
/* Unused function */
BoundingBoxIntersectionResult result;
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index 6cfeb1aa917..18b60b70a4b 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -719,6 +719,20 @@ ccl_device_inline float pow22(float a)
return sqr(a * sqr(sqr(sqr(a)) * a));
}
+#ifdef __KERNEL_METAL__
+ccl_device_inline float lgammaf(float x)
+{
+ /* Nemes, Gergő (2010), "New asymptotic expansion for the Gamma function", Archiv der Mathematik
+ */
+ const float _1_180 = 1.0f / 180.0f;
+ const float log2pi = 1.83787706641f;
+ const float logx = log(x);
+ return (log2pi - logx +
+ x * (logx * 2.0f + log(x * sinh(1.0f / x) + (_1_180 / pow(x, 6.0f))) - 2.0f)) *
+ 0.5f;
+}
+#endif
+
ccl_device_inline float beta(float x, float y)
{
return expf(lgammaf(x) + lgammaf(y) - lgammaf(x + y));
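
The lgammaf approximation can be sanity-checked on the host against libm; a minimal sketch (the helper name lgammaf_nemes is hypothetical, and as an asymptotic expansion its accuracy improves as x grows):

    #include <cmath>
    #include <cstdio>

    static float lgammaf_nemes(float x) /* Same formula as the kernel version. */
    {
      const float _1_180 = 1.0f / 180.0f;
      const float log2pi = 1.83787706641f;
      const float logx = logf(x);
      return (log2pi - logx +
              x * (logx * 2.0f + logf(x * sinhf(1.0f / x) + (_1_180 / powf(x, 6.0f))) - 2.0f)) *
             0.5f;
    }

    int main()
    {
      for (float x : {1.0f, 2.5f, 10.0f}) {
        printf("x=%-5g nemes=%.6f libm=%.6f\n", x, lgammaf_nemes(x), std::lgamma(x));
      }
      return 0;
    }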
diff --git a/intern/cycles/util/path.cpp b/intern/cycles/util/path.cpp
index aad790482d5..243b3187f2e 100644
--- a/intern/cycles/util/path.cpp
+++ b/intern/cycles/util/path.cpp
@@ -750,6 +750,170 @@ bool path_remove(const string &path)
return remove(path.c_str()) == 0;
}
+struct SourceReplaceState {
+ typedef map<string, string> ProcessedMapping;
+  /* Base directory for all relative include headers. */
+  string base;
+  /* Processed files, mapped from file path to processed contents. */
+  ProcessedMapping processed_files;
+ /* Set of files containing #pragma once which have been included. */
+ set<string> pragma_onced;
+};
+
+static string path_source_replace_includes_recursive(const string &source,
+ const string &source_filepath,
+ SourceReplaceState *state);
+
+static string path_source_handle_preprocessor(const string &preprocessor_line,
+ const string &source_filepath,
+ const size_t line_number,
+ SourceReplaceState *state)
+{
+ string result = preprocessor_line;
+
+ string rest_of_line = string_strip(preprocessor_line.substr(1));
+
+ if (0 == strncmp(rest_of_line.c_str(), "include", 7)) {
+    /* Skip past "include"; string_strip drops the separating whitespace. */
+    rest_of_line = string_strip(rest_of_line.substr(7));
+ if (rest_of_line[0] == '"') {
+ const size_t n_start = 1;
+ const size_t n_end = rest_of_line.find("\"", n_start);
+ const string filename = rest_of_line.substr(n_start, n_end - n_start);
+
+ string filepath = path_join(state->base, filename);
+ if (!path_exists(filepath)) {
+ filepath = path_join(path_dirname(source_filepath), filename);
+ }
+ string text;
+ if (path_read_text(filepath, text)) {
+ text = path_source_replace_includes_recursive(text, filepath, state);
+        /* Replace the include directive with the processed file contents. */
+ return "\n" + text + "\n";
+ }
+ }
+ }
+
+ return result;
+}
+
+/* Our own little C preprocessor that replaces #include directives with the
+ * included file contents. This works around compilers that cannot handle
+ * include paths with spaces in them (historically OpenCL drivers; revived
+ * here for MSL compilation).
+ */
+static string path_source_replace_includes_recursive(const string &_source,
+ const string &source_filepath,
+ SourceReplaceState *state)
+{
+ const string *psource = &_source;
+ string source_new;
+
+ auto pragma_once = _source.find("#pragma once");
+ if (pragma_once != string::npos) {
+ if (state->pragma_onced.find(source_filepath) != state->pragma_onced.end()) {
+ return "";
+ }
+ state->pragma_onced.insert(source_filepath);
+
+ // "#pragma once"
+ // "//prgma once"
+ source_new = _source;
+ memcpy(source_new.data() + pragma_once, "//pr", 4);
+ psource = &source_new;
+ }
+
+ /* Try to re-use processed file without spending time on replacing all
+ * include directives again.
+ */
+ SourceReplaceState::ProcessedMapping::iterator replaced_file = state->processed_files.find(
+ source_filepath);
+ if (replaced_file != state->processed_files.end()) {
+ return replaced_file->second;
+ }
+
+ const string &source = *psource;
+
+ /* Perform full file processing. */
+ string result = "";
+ const size_t source_length = source.length();
+ size_t index = 0;
+ /* Information about where we are in the source. */
+ size_t line_number = 0, column_number = 1;
+ /* Currently gathered non-preprocessor token.
+ * Store as start/length rather than token itself to avoid overhead of
+ * memory re-allocations on each character concatenation.
+ */
+ size_t token_start = 0, token_length = 0;
+ /* Denotes whether we're inside of preprocessor line, together with
+ * preprocessor line itself.
+ *
+ * TODO(sergey): Investigate whether using token start/end position
+ * gives measurable speedup.
+ */
+ bool inside_preprocessor = false;
+ string preprocessor_line = "";
+ /* Actual loop over the whole source. */
+ while (index < source_length) {
+ char ch = source[index];
+
+ if (ch == '\n') {
+ if (inside_preprocessor) {
+ string block = path_source_handle_preprocessor(
+ preprocessor_line, source_filepath, line_number, state);
+
+ if (!block.empty()) {
+ result += block;
+ }
+
+        /* Start gathering the next token. */
+ token_start = index;
+ token_length = 0;
+ inside_preprocessor = false;
+ preprocessor_line = "";
+ }
+ column_number = 0;
+ ++line_number;
+ }
+ else if (ch == '#' && column_number == 1 && !inside_preprocessor) {
+ /* Append all possible non-preprocessor token to the result. */
+ if (token_length != 0) {
+ result.append(source, token_start, token_length);
+ token_start = index;
+ token_length = 0;
+ }
+ inside_preprocessor = true;
+ }
+
+ if (inside_preprocessor) {
+ preprocessor_line += ch;
+ }
+ else {
+ ++token_length;
+ }
+ ++index;
+ ++column_number;
+ }
+  /* Append any token that was still being gathered when the end of the
+   * source was reached. */
+ if (token_length != 0) {
+ result.append(source, token_start, token_length);
+ }
+ if (inside_preprocessor) {
+ result += path_source_handle_preprocessor(
+ preprocessor_line, source_filepath, line_number, state);
+ }
+ /* Store result for further reuse. */
+ state->processed_files[source_filepath] = result;
+ return result;
+}
+
+string path_source_replace_includes(const string &source, const string &path)
+{
+ SourceReplaceState state;
+ state.base = path;
+ return path_source_replace_includes_recursive(source, path, &state);
+}
+
FILE *path_fopen(const string &path, const string &mode)
{
#ifdef _WIN32
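
For context, a usage sketch of the revived helper (the input path is illustrative, and path_get is assumed from elsewhere in util/path): the Metal backend flattens kernel source like this before handing it to the MSL compiler.

    string source;
    path_read_text(path_get("source/kernel/device/metal/kernel.metal"), source);
    string flattened = path_source_replace_includes(source, path_get("source"));
    /* 'flattened' now contains every transitively-included header inline,
     * with each '#pragma once' file expanded at most once. */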
diff --git a/intern/cycles/util/path.h b/intern/cycles/util/path.h
index a1394555302..7ec5ed60d7f 100644
--- a/intern/cycles/util/path.h
+++ b/intern/cycles/util/path.h
@@ -66,6 +66,9 @@ bool path_read_text(const string &path, string &text);
/* File manipulation. */
bool path_remove(const string &path);
+/* source code utility */
+string path_source_replace_includes(const string &source, const string &path);
+
/* cache utility */
void path_cache_clear_except(const string &name, const set<string> &except);