From 9558fa5196033390111a2348caa66ab18b8a4f89 Mon Sep 17 00:00:00 2001
From: Michael Jones
Date: Tue, 7 Dec 2021 15:11:35 +0000
Subject: Cycles: Metal host-side code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds the Metal host-side code:

- Add all core host-side Metal backend files (device_impl, queue, etc.)
- Add MetalRT BVH setup files
- Integrate with Cycles device enumeration code
- Revive `path_source_replace_includes` in util/path (required for MSL compilation)

This patch also includes a couple of small kernel-side fixes:

- Add an implementation of `lgammaf` for Metal [Nemes, Gergő (2010), "New asymptotic expansion for the Gamma function", Archiv der Mathematik](https://users.renyi.hu/~gergonemes/)
- Include "work_stealing.h" inside the Metal context class, because it now accesses state

Ref T92212

Reviewed By: brecht

Maniphest Tasks: T92212

Differential Revision: https://developer.blender.org/D13423
---
 intern/cycles/blender/CMakeLists.txt           |    5 +
 intern/cycles/blender/addon/engine.py          |    2 +-
 intern/cycles/blender/addon/properties.py      |   12 +-
 intern/cycles/blender/addon/ui.py              |    5 +
 intern/cycles/blender/device.cpp               |    4 +
 intern/cycles/blender/python.cpp               |    9 +-
 intern/cycles/bvh/CMakeLists.txt               |    2 +
 intern/cycles/bvh/bvh.cpp                      |   10 +-
 intern/cycles/bvh/metal.h                      |   35 +
 intern/cycles/bvh/metal.mm                     |   33 +
 intern/cycles/cmake/external_libs.cmake        |   14 +
 intern/cycles/device/CMakeLists.txt            |   49 +-
 intern/cycles/device/device.cpp                |   40 +-
 intern/cycles/device/device.h                  |    3 +
 intern/cycles/device/memory.h                  |    1 +
 intern/cycles/device/metal/bvh.h               |   66 ++
 intern/cycles/device/metal/bvh.mm              |  813 +++++++++++++++++++
 intern/cycles/device/metal/device.h            |   37 +
 intern/cycles/device/metal/device.mm           |  136 ++++
 intern/cycles/device/metal/device_impl.h       |  166 ++++
 intern/cycles/device/metal/device_impl.mm      | 1008 ++++++++++++++++++++++++
 intern/cycles/device/metal/kernel.h            |  168 ++++
 intern/cycles/device/metal/kernel.mm           |  523 ++++++++++++
 intern/cycles/device/metal/queue.h             |   97 +++
 intern/cycles/device/metal/queue.mm            |  602 ++++++++++++++
 intern/cycles/device/metal/util.h              |  101 +++
 intern/cycles/device/metal/util.mm             |  241 ++++++
 intern/cycles/device/multi/device.cpp          |    3 +
 intern/cycles/kernel/device/gpu/kernel.h       |    7 +-
 intern/cycles/kernel/device/metal/compat.h     |    2 +-
 intern/cycles/kernel/device/metal/kernel.metal |   12 +-
 intern/cycles/util/math.h                      |   14 +
 intern/cycles/util/path.cpp                    |  164 ++++
 intern/cycles/util/path.h                      |    3 +
 34 files changed, 4355 insertions(+), 32 deletions(-)
 create mode 100644 intern/cycles/bvh/metal.h
 create mode 100644 intern/cycles/bvh/metal.mm
 create mode 100644 intern/cycles/device/metal/bvh.h
 create mode 100644 intern/cycles/device/metal/bvh.mm
 create mode 100644 intern/cycles/device/metal/device.h
 create mode 100644 intern/cycles/device/metal/device.mm
 create mode 100644 intern/cycles/device/metal/device_impl.h
 create mode 100644 intern/cycles/device/metal/device_impl.mm
 create mode 100644 intern/cycles/device/metal/kernel.h
 create mode 100644 intern/cycles/device/metal/kernel.mm
 create mode 100644 intern/cycles/device/metal/queue.h
 create mode 100644 intern/cycles/device/metal/queue.mm
 create mode 100644 intern/cycles/device/metal/util.h
 create mode 100644 intern/cycles/device/metal/util.mm

diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index f0540486656..b4a4d487355 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -101,6 +101,11 @@
add_definitions(${GL_DEFINITIONS}) if(WITH_CYCLES_DEVICE_HIP) add_definitions(-DWITH_HIP) endif() + +if(WITH_CYCLES_DEVICE_METAL) + add_definitions(-DWITH_METAL) +endif() + if(WITH_MOD_FLUID) add_definitions(-DWITH_FLUID) endif() diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index e5bb77a834a..910ac4a373e 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -28,7 +28,7 @@ def _configure_argument_parser(): action='store_true') parser.add_argument("--cycles-device", help="Set the device to use for Cycles, overriding user preferences and the scene setting." - "Valid options are 'CPU', 'CUDA', 'OPTIX', or 'HIP'" + "Valid options are 'CPU', 'CUDA', 'OPTIX', 'HIP' or 'METAL'." "Additionally, you can append '+CPU' to any GPU type for hybrid rendering.", default=None) return parser diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 0de936ddb11..8569cb7d946 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -111,7 +111,8 @@ enum_device_type = ( ('CPU', "CPU", "CPU", 0), ('CUDA', "CUDA", "CUDA", 1), ('OPTIX', "OptiX", "OptiX", 3), - ("HIP", "HIP", "HIP", 4) + ('HIP', "HIP", "HIP", 4), + ('METAL', "Metal", "Metal", 5) ) enum_texture_limit = ( @@ -1312,8 +1313,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): def get_device_types(self, context): import _cycles - has_cuda, has_optix, has_hip = _cycles.get_device_types() - + has_cuda, has_optix, has_hip, has_metal = _cycles.get_device_types() list = [('NONE', "None", "Don't use compute device", 0)] if has_cuda: list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1)) @@ -1321,6 +1321,8 @@ class CyclesPreferences(bpy.types.AddonPreferences): list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3)) if has_hip: list.append(('HIP', "HIP", "Use HIP for GPU acceleration", 4)) + if has_metal: + list.append(('METAL', "Metal", "Use Metal for GPU acceleration", 5)) return list @@ -1346,7 +1348,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): def update_device_entries(self, device_list): for device in device_list: - if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP'}: + if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP', 'METAL'}: continue # Try to find existing Device entry entry = self.find_existing_device_entry(device) @@ -1390,7 +1392,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): import _cycles # Ensure `self.devices` is not re-allocated when the second call to # get_devices_for_type is made, freeing items from the first list. - for device_type in ('CUDA', 'OPTIX', 'HIP'): + for device_type in ('CUDA', 'OPTIX', 'HIP', 'METAL'): self.update_device_entries(_cycles.available_devices(device_type)) # Deprecated: use refresh_devices instead. 
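A brief usage sketch for the enumeration changes above, assuming a build configured with WITH_CYCLES_DEVICE_METAL: `_cycles.get_device_types()` now returns a 4-tuple, and `_cycles.available_devices()` accepts 'METAL', matching what `update_device_entries()` in properties.py consumes (the type string is the entry's second field):

    import _cycles

    has_cuda, has_optix, has_hip, has_metal = _cycles.get_device_types()
    if has_metal:
        for device in _cycles.available_devices('METAL'):
            # Each entry's second field is the type string checked against
            # {'CUDA', 'OPTIX', 'CPU', 'HIP', 'METAL'} in update_device_entries().
            print(device)

The same override is reachable from the command line via the argument parser in engine.py, e.g. `blender -b scene.blend -- --cycles-device METAL`, which routes through `set_device_override_func` in python.cpp below.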
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 6fd21db38ae..fd86d75a301 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -97,6 +97,11 @@ def use_cpu(context):
 
     return (get_device_type(context) == 'NONE' or cscene.device == 'CPU')
 
+def use_metal(context):
+    cscene = context.scene.cycles
+
+    return (get_device_type(context) == 'METAL' and cscene.device == 'GPU')
+
 
 def use_cuda(context):
     cscene = context.scene.cycles
diff --git a/intern/cycles/blender/device.cpp b/intern/cycles/blender/device.cpp
index 9fabc33a96b..d39381ac6f1 100644
--- a/intern/cycles/blender/device.cpp
+++ b/intern/cycles/blender/device.cpp
@@ -27,6 +27,7 @@ enum ComputeDevice {
   COMPUTE_DEVICE_CUDA = 1,
   COMPUTE_DEVICE_OPTIX = 3,
   COMPUTE_DEVICE_HIP = 4,
+  COMPUTE_DEVICE_METAL = 5,
   COMPUTE_DEVICE_NUM
 };
 
@@ -85,6 +86,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
   else if (compute_device == COMPUTE_DEVICE_HIP) {
     mask |= DEVICE_MASK_HIP;
   }
+  else if (compute_device == COMPUTE_DEVICE_METAL) {
+    mask |= DEVICE_MASK_METAL;
+  }
   vector<DeviceInfo> devices = Device::available_devices(mask);
 
   /* Match device preferences and available devices. */
diff --git a/intern/cycles/blender/python.cpp b/intern/cycles/blender/python.cpp
index 012122cf9e3..024dae306b0 100644
--- a/intern/cycles/blender/python.cpp
+++ b/intern/cycles/blender/python.cpp
@@ -906,16 +906,18 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
 static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
 {
   vector<DeviceType> device_types = Device::available_types();
-  bool has_cuda = false, has_optix = false, has_hip = false;
+  bool has_cuda = false, has_optix = false, has_hip = false, has_metal = false;
   foreach (DeviceType device_type, device_types) {
     has_cuda |= (device_type == DEVICE_CUDA);
     has_optix |= (device_type == DEVICE_OPTIX);
     has_hip |= (device_type == DEVICE_HIP);
+    has_metal |= (device_type == DEVICE_METAL);
   }
-  PyObject *list = PyTuple_New(3);
+  PyObject *list = PyTuple_New(4);
   PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
   PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
   PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip));
+  PyTuple_SET_ITEM(list, 3, PyBool_FromLong(has_metal));
 
   return list;
 }
@@ -944,6 +946,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg)
   else if (override == "HIP") {
     BlenderSession::device_override = DEVICE_MASK_HIP;
   }
+  else if (override == "METAL") {
+    BlenderSession::device_override = DEVICE_MASK_METAL;
+  }
   else {
     printf("\nError: %s is not a valid Cycles device.\n", override.c_str());
     Py_RETURN_FALSE;
diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt
index 9edc30cf9c4..cdaa6628be2 100644
--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -31,6 +31,7 @@ set(SRC
   sort.cpp
   split.cpp
   unaligned.cpp
+  metal.mm
 )
 
 set(SRC_HEADERS
@@ -46,6 +47,7 @@ set(SRC_HEADERS
   sort.h
   split.h
   unaligned.h
+  metal.h
 )
 
 set(LIB
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index d3c8e4db6d0..540bf52f7ac 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -19,6 +19,7 @@
 #include "bvh/bvh2.h"
 #include "bvh/embree.h"
+#include "bvh/metal.h"
 #include "bvh/multi.h"
 #include "bvh/optix.h"
 
@@ -105,13 +106,18 @@ BVH *BVH::create(const BVHParams &params,
 #else
       (void)device;
       break;
+#endif
+    case BVH_LAYOUT_METAL:
+#ifdef WITH_METAL
+      return bvh_metal_create(params, geometry, objects, device);
+#else
+      (void)device;
+      break;
 #endif
     case BVH_LAYOUT_MULTI_OPTIX:
     case BVH_LAYOUT_MULTI_OPTIX_EMBREE:
     case BVH_LAYOUT_MULTI_METAL_EMBREE:
       return new BVHMulti(params, geometry, objects);
-    case BVH_LAYOUT_METAL:
-      /* host-side changes for BVH_LAYOUT_METAL are imminent */
     case BVH_LAYOUT_NONE:
     case BVH_LAYOUT_ALL:
       break;
diff --git a/intern/cycles/bvh/metal.h b/intern/cycles/bvh/metal.h
new file mode 100644
index 00000000000..8de07927e61
--- /dev/null
+++ b/intern/cycles/bvh/metal.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BVH_METAL_H__
+#define __BVH_METAL_H__
+
+#ifdef WITH_METAL
+
+# include "bvh/bvh.h"
+
+CCL_NAMESPACE_BEGIN
+
+BVH *bvh_metal_create(const BVHParams &params,
+                      const vector<Geometry *> &geometry,
+                      const vector<Object *> &objects,
+                      Device *device);
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
+
+#endif /* __BVH_METAL_H__ */
diff --git a/intern/cycles/bvh/metal.mm b/intern/cycles/bvh/metal.mm
new file mode 100644
index 00000000000..90a52012f12
--- /dev/null
+++ b/intern/cycles/bvh/metal.mm
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+# include "device/metal/bvh.h"
+
+CCL_NAMESPACE_BEGIN
+
+BVH *bvh_metal_create(const BVHParams &params,
+                      const vector<Geometry *> &geometry,
+                      const vector<Object *> &objects,
+                      Device *device)
+{
+  return new BVHMetal(params, geometry, objects, device);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index c1244ab740b..9967a775184 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -551,4 +551,18 @@ if(NOT WITH_HIP_DYNLOAD)
   set(WITH_HIP_DYNLOAD ON)
 endif()
 
+###########################################################################
+# Metal
+###########################################################################
+
+if(WITH_CYCLES_DEVICE_METAL)
+  FIND_LIBRARY(METAL_LIBRARY Metal)
+  if (METAL_LIBRARY)
+    message(STATUS "Found Metal: ${METAL_LIBRARY}")
+  else()
+    message(STATUS "Metal not found, disabling WITH_CYCLES_DEVICE_METAL")
+    set(WITH_CYCLES_DEVICE_METAL OFF)
+  endif()
+endif()
+
 unset(_cycles_lib_dir)
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 99b1fc8135d..7f1e9ff3d0f 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -43,7 +43,7 @@ if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
   add_definitions(-DWITH_HIP_DYNLOAD)
 endif()
 
-set(SRC
+set(SRC_BASE
   device.cpp
   denoise.cpp
   graphics_interop.cpp
@@ -104,6 +104,21 @@ set(SRC_MULTI
   multi/device.h
 )
 
+set(SRC_METAL
+  metal/bvh.mm
+  metal/bvh.h
+  metal/device.mm
+  metal/device.h
+  metal/device_impl.mm
+  metal/device_impl.h
+  metal/kernel.mm
+  metal/kernel.h
+  metal/queue.mm
+  metal/queue.h
+  metal/util.mm
+  metal/util.h
+)
+
 set(SRC_OPTIX
   optix/device.cpp
   optix/device.h
@@ -123,6 +138,17 @@ set(SRC_HEADERS
   queue.h
 )
 
+set(SRC
+  ${SRC_BASE}
+  ${SRC_CPU}
+  ${SRC_CUDA}
+  ${SRC_HIP}
+  ${SRC_DUMMY}
+  ${SRC_MULTI}
+  ${SRC_OPTIX}
+  ${SRC_HEADERS}
+)
+
 set(LIB
   cycles_kernel
   cycles_util
@@ -158,6 +184,15 @@ endif()
 if(WITH_CYCLES_DEVICE_OPTIX)
   add_definitions(-DWITH_OPTIX)
 endif()
+if(WITH_CYCLES_DEVICE_METAL)
+  list(APPEND LIB
+    ${METAL_LIBRARY}
+  )
+  add_definitions(-DWITH_METAL)
+  list(APPEND SRC
+    ${SRC_METAL}
+  )
+endif()
 
 if(WITH_OPENIMAGEDENOISE)
   list(APPEND LIB
@@ -168,20 +203,12 @@ endif()
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
-cycles_add_library(cycles_device "${LIB}"
-  ${SRC}
-  ${SRC_CPU}
-  ${SRC_CUDA}
-  ${SRC_HIP}
-  ${SRC_DUMMY}
-  ${SRC_MULTI}
-  ${SRC_OPTIX}
-  ${SRC_HEADERS}
-)
+cycles_add_library(cycles_device "${LIB}" ${SRC})
 
 source_group("cpu" FILES ${SRC_CPU})
 source_group("cuda" FILES ${SRC_CUDA})
 source_group("dummy" FILES ${SRC_DUMMY})
 source_group("multi" FILES ${SRC_MULTI})
+source_group("metal" FILES ${SRC_METAL})
 source_group("optix" FILES ${SRC_OPTIX})
 source_group("common" FILES ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index bfbcdb20d5e..2b067d57158 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -27,6 +27,7 @@
 #include "device/cuda/device.h"
 #include "device/dummy/device.h"
 #include "device/hip/device.h"
+#include "device/metal/device.h"
 #include "device/multi/device.h"
 #include "device/optix/device.h"
 
@@ -49,6 +50,7 @@
 vector<DeviceInfo> Device::cuda_devices;
 vector<DeviceInfo> Device::optix_devices;
 vector<DeviceInfo> Device::cpu_devices;
 vector<DeviceInfo> Device::hip_devices;
+vector<DeviceInfo> Device::metal_devices;
 uint Device::devices_initialized_mask = 0;
 
 /* Device */
 
@@ -105,6 +107,12 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
       break;
 #endif
 
+#ifdef WITH_METAL
+    case DEVICE_METAL:
+      if (device_metal_init())
+        device = device_metal_create(info, stats, profiler);
+      break;
+#endif
     default:
       break;
   }
 
@@ -128,6 +136,8 @@ DeviceType Device::type_from_string(const char *name)
     return DEVICE_MULTI;
   else if (strcmp(name, "HIP") == 0)
     return DEVICE_HIP;
+  else if (strcmp(name, "METAL") == 0)
+    return DEVICE_METAL;
 
   return DEVICE_NONE;
 }
 
@@ -144,6 +154,8 @@ string Device::string_from_type(DeviceType type)
     return "MULTI";
   else if (type == DEVICE_HIP)
     return "HIP";
+  else if (type == DEVICE_METAL)
+    return "METAL";
 
   return "";
 }
 
@@ -161,7 +173,9 @@ vector<DeviceType> Device::available_types()
 #ifdef WITH_HIP
   types.push_back(DEVICE_HIP);
 #endif
-
+#ifdef WITH_METAL
+  types.push_back(DEVICE_METAL);
+#endif
   return types;
 }
 
@@ -227,6 +241,20 @@ vector<DeviceInfo> Device::available_devices(uint mask)
     }
   }
 
+#ifdef WITH_METAL
+  if (mask & DEVICE_MASK_METAL) {
+    if (!(devices_initialized_mask & DEVICE_MASK_METAL)) {
+      if (device_metal_init()) {
+        device_metal_info(metal_devices);
+      }
+      devices_initialized_mask |= DEVICE_MASK_METAL;
+    }
+    foreach (DeviceInfo &info, metal_devices) {
+      devices.push_back(info);
+    }
+  }
+#endif
+
   return devices;
 }
 
@@ -266,6 +294,15 @@ string Device::device_capabilities(uint mask)
   }
 #endif
 
+#ifdef WITH_METAL
+  if (mask & DEVICE_MASK_METAL) {
+    if (device_metal_init()) {
+      capabilities += "\nMetal device capabilities:\n";
+      capabilities += device_metal_capabilities();
+    }
+  }
+#endif
+
   return capabilities;
 }
 
@@ -354,6 +391,7 @@ void Device::free_memory()
   optix_devices.free_memory();
   hip_devices.free_memory();
   cpu_devices.free_memory();
+  metal_devices.free_memory();
 }
 
 unique_ptr<DeviceQueue> Device::gpu_queue_create()
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 346632de314..c032773ddd0 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -52,6 +52,7 @@ enum DeviceType {
   DEVICE_MULTI,
   DEVICE_OPTIX,
   DEVICE_HIP,
+  DEVICE_METAL,
   DEVICE_DUMMY,
 };
 
@@ -60,6 +61,7 @@ enum DeviceTypeMask {
   DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
   DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
   DEVICE_MASK_HIP = (1 << DEVICE_HIP),
+  DEVICE_MASK_METAL = (1 << DEVICE_METAL),
   DEVICE_MASK_ALL = ~0
 };
 
@@ -281,6 +283,7 @@ class Device {
   static vector<DeviceInfo> optix_devices;
   static vector<DeviceInfo> cpu_devices;
   static vector<DeviceInfo> hip_devices;
+  static vector<DeviceInfo> metal_devices;
   static uint devices_initialized_mask;
 };
diff --git a/intern/cycles/device/memory.h b/intern/cycles/device/memory.h
index d3ebebb34b4..2db3ac9a440 100644
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@@ -263,6 +263,7 @@ class device_memory {
   friend class CUDADevice;
   friend class OptiXDevice;
   friend class HIPDevice;
+  friend class MetalDevice;
 
   /* Only create through subclasses. */
   device_memory(Device *device, const char *name, MemoryType type);
diff --git a/intern/cycles/device/metal/bvh.h b/intern/cycles/device/metal/bvh.h
new file mode 100644
index 00000000000..cbc5ca7d2c3
--- /dev/null
+++ b/intern/cycles/device/metal/bvh.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_METAL
+
+# include "bvh/bvh.h"
+# include "bvh/params.h"
+# include "device/memory.h"
+
+# include <Metal/Metal.h>
+
+CCL_NAMESPACE_BEGIN
+
+class BVHMetal : public BVH {
+ public:
+  API_AVAILABLE(macos(11.0))
+  id<MTLAccelerationStructure> accel_struct = nil;
+  bool accel_struct_building = false;
+
+  API_AVAILABLE(macos(11.0))
+  vector<id<MTLAccelerationStructure>> blas_array;
+
+  bool motion_blur = false;
+
+  Stats &stats;
+
+  bool build(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
+
+  BVHMetal(const BVHParams &params,
+           const vector<Geometry *> &geometry,
+           const vector<Object *> &objects,
+           Device *device);
+  virtual ~BVHMetal();
+
+  bool build_BLAS(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
+  bool build_BLAS_mesh(Progress &progress,
+                       id<MTLDevice> device,
+                       id<MTLCommandQueue> queue,
+                       Geometry *const geom,
+                       bool refit);
+  bool build_BLAS_hair(Progress &progress,
+                       id<MTLDevice> device,
+                       id<MTLCommandQueue> queue,
+                       Geometry *const geom,
+                       bool refit);
+  bool build_TLAS(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm
new file mode 100644
index 00000000000..7c79196845a
--- /dev/null
+++ b/intern/cycles/device/metal/bvh.mm
@@ -0,0 +1,813 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+# include "scene/hair.h"
+# include "scene/mesh.h"
+# include "scene/object.h"
+
+# include "util/progress.h"
+
+# include "device/metal/bvh.h"
+
+CCL_NAMESPACE_BEGIN
+
+# define BVH_status(...) \
+  { \
+    string str = string_printf(__VA_ARGS__); \
+    progress.set_substatus(str); \
+  }
+
+BVHMetal::BVHMetal(const BVHParams &params_,
+                   const vector<Geometry *> &geometry_,
+                   const vector<Object *> &objects_,
+                   Device *device)
+    : BVH(params_, geometry_, objects_), stats(device->stats)
+{
+}
+
+BVHMetal::~BVHMetal()
+{
+  if (@available(macos 12.0, *)) {
+    if (accel_struct) {
+      stats.mem_free(accel_struct.allocatedSize);
+      [accel_struct release];
+    }
+  }
+}
+
+bool BVHMetal::build_BLAS_mesh(Progress &progress,
+                               id<MTLDevice> device,
+                               id<MTLCommandQueue> queue,
+                               Geometry *const geom,
+                               bool refit)
+{
+  if (@available(macos 12.0, *)) {
+    /* Build BLAS for triangle primitives */
+    Mesh *const mesh = static_cast<Mesh *const>(geom);
+    if (mesh->num_triangles() == 0) {
+      return false;
+    }
+
+    /*------------------------------------------------*/
+    BVH_status(
+        "Building mesh BLAS | %7d tris | %s", (int)mesh->num_triangles(), geom->name.c_str());
+    /*------------------------------------------------*/
+
+    const bool use_fast_trace_bvh = (params.bvh_type == BVH_TYPE_STATIC);
+
+    const array<float3> &verts = mesh->get_verts();
+    const array<int> &tris = mesh->get_triangles();
+    const size_t num_verts = verts.size();
+    const size_t num_indices = tris.size();
+
+    size_t num_motion_steps = 1;
+    Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+    if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
+      num_motion_steps = mesh->get_motion_steps();
+    }
+
+    MTLResourceOptions storage_mode;
+    if (device.hasUnifiedMemory) {
+      storage_mode = MTLResourceStorageModeShared;
+    }
+    else {
+      storage_mode = MTLResourceStorageModeManaged;
+    }
+
+    /* Upload the mesh data to the GPU */
+    id<MTLBuffer> posBuf = nil;
+    id<MTLBuffer> indexBuf = [device newBufferWithBytes:tris.data()
+                                                 length:num_indices * sizeof(tris.data()[0])
+                                                options:storage_mode];
+
+    if (num_motion_steps == 1) {
+      posBuf = [device newBufferWithBytes:verts.data()
+                                   length:num_verts * sizeof(verts.data()[0])
+                                  options:storage_mode];
+    }
+    else {
+      posBuf = [device newBufferWithLength:num_verts * num_motion_steps * sizeof(verts.data()[0])
+                                   options:storage_mode];
+      float3 *dest_data = (float3 *)[posBuf contents];
+      size_t center_step = (num_motion_steps - 1) / 2;
+      for (size_t step = 0; step < num_motion_steps; ++step) {
+        const float3 *verts = mesh->get_verts().data();
+
+        /* The center step for motion vertices is not stored in the attribute. */
+        if (step != center_step) {
+          verts = motion_keys->data_float3() + (step > center_step ? step - 1 : step) * num_verts;
+        }
+        memcpy(dest_data + num_verts * step, verts, num_verts * sizeof(float3));
+      }
+      if (storage_mode == MTLResourceStorageModeManaged) {
+        [posBuf didModifyRange:NSMakeRange(0, posBuf.length)];
+      }
+    }
+
+    /* Create an acceleration structure.
*/ + MTLAccelerationStructureGeometryDescriptor *geomDesc; + if (num_motion_steps > 1) { + std::vector vertex_ptrs; + vertex_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + MTLMotionKeyframeData *k = [MTLMotionKeyframeData data]; + k.buffer = posBuf; + k.offset = num_verts * step * sizeof(float3); + vertex_ptrs.push_back(k); + } + + MTLAccelerationStructureMotionTriangleGeometryDescriptor *geomDescMotion = + [MTLAccelerationStructureMotionTriangleGeometryDescriptor descriptor]; + geomDescMotion.vertexBuffers = [NSArray arrayWithObjects:vertex_ptrs.data() + count:vertex_ptrs.size()]; + geomDescMotion.vertexStride = sizeof(verts.data()[0]); + geomDescMotion.indexBuffer = indexBuf; + geomDescMotion.indexBufferOffset = 0; + geomDescMotion.indexType = MTLIndexTypeUInt32; + geomDescMotion.triangleCount = num_indices / 3; + geomDescMotion.intersectionFunctionTableOffset = 0; + + geomDesc = geomDescMotion; + } + else { + MTLAccelerationStructureTriangleGeometryDescriptor *geomDescNoMotion = + [MTLAccelerationStructureTriangleGeometryDescriptor descriptor]; + geomDescNoMotion.vertexBuffer = posBuf; + geomDescNoMotion.vertexBufferOffset = 0; + geomDescNoMotion.vertexStride = sizeof(verts.data()[0]); + geomDescNoMotion.indexBuffer = indexBuf; + geomDescNoMotion.indexBufferOffset = 0; + geomDescNoMotion.indexType = MTLIndexTypeUInt32; + geomDescNoMotion.triangleCount = num_indices / 3; + geomDescNoMotion.intersectionFunctionTableOffset = 0; + + geomDesc = geomDescNoMotion; + } + + /* Force a single any-hit call, so shadow record-all behavior works correctly */ + /* (Match optix behaviour: unsigned int build_flags = + * OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;) */ + geomDesc.allowDuplicateIntersectionFunctionInvocation = false; + + MTLPrimitiveAccelerationStructureDescriptor *accelDesc = + [MTLPrimitiveAccelerationStructureDescriptor descriptor]; + accelDesc.geometryDescriptors = @[ geomDesc ]; + if (num_motion_steps > 1) { + accelDesc.motionStartTime = 0.0f; + accelDesc.motionEndTime = 1.0f; + accelDesc.motionStartBorderMode = MTLMotionBorderModeClamp; + accelDesc.motionEndBorderMode = MTLMotionBorderModeClamp; + accelDesc.motionKeyframeCount = num_motion_steps; + } + + if (!use_fast_trace_bvh) { + accelDesc.usage |= (MTLAccelerationStructureUsageRefit | + MTLAccelerationStructureUsagePreferFastBuild); + } + + MTLAccelerationStructureSizes accelSizes = [device + accelerationStructureSizesWithDescriptor:accelDesc]; + id accel_uncompressed = [device + newAccelerationStructureWithSize:accelSizes.accelerationStructureSize]; + id scratchBuf = [device newBufferWithLength:accelSizes.buildScratchBufferSize + options:MTLResourceStorageModePrivate]; + id sizeBuf = [device newBufferWithLength:8 options:MTLResourceStorageModeShared]; + id accelCommands = [queue commandBuffer]; + id accelEnc = + [accelCommands accelerationStructureCommandEncoder]; + if (refit) { + [accelEnc refitAccelerationStructure:accel_struct + descriptor:accelDesc + destination:accel_uncompressed + scratchBuffer:scratchBuf + scratchBufferOffset:0]; + } + else { + [accelEnc buildAccelerationStructure:accel_uncompressed + descriptor:accelDesc + scratchBuffer:scratchBuf + scratchBufferOffset:0]; + } + if (use_fast_trace_bvh) { + [accelEnc writeCompactedAccelerationStructureSize:accel_uncompressed + toBuffer:sizeBuf + offset:0 + sizeDataType:MTLDataTypeULong]; + } + [accelEnc endEncoding]; + [accelCommands addCompletedHandler:^(id command_buffer) { + /* free temp resources */ + [scratchBuf 
release]; + [indexBuf release]; + [posBuf release]; + + if (use_fast_trace_bvh) { + /* Compact the accel structure */ + uint64_t compressed_size = *(uint64_t *)sizeBuf.contents; + + dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{ + id accelCommands = [queue commandBuffer]; + id accelEnc = + [accelCommands accelerationStructureCommandEncoder]; + id accel = [device + newAccelerationStructureWithSize:compressed_size]; + [accelEnc copyAndCompactAccelerationStructure:accel_uncompressed + toAccelerationStructure:accel]; + [accelEnc endEncoding]; + [accelCommands addCompletedHandler:^(id command_buffer) { + uint64_t allocated_size = [accel allocatedSize]; + stats.mem_alloc(allocated_size); + accel_struct = accel; + [accel_uncompressed release]; + accel_struct_building = false; + }]; + [accelCommands commit]; + }); + } + else { + /* set our acceleration structure to the uncompressed structure */ + accel_struct = accel_uncompressed; + + uint64_t allocated_size = [accel_struct allocatedSize]; + stats.mem_alloc(allocated_size); + accel_struct_building = false; + } + [sizeBuf release]; + }]; + + accel_struct_building = true; + [accelCommands commit]; + + return true; + } + return false; +} + +bool BVHMetal::build_BLAS_hair(Progress &progress, + id device, + id queue, + Geometry *const geom, + bool refit) +{ + if (@available(macos 12.0, *)) { + /* Build BLAS for hair curves */ + Hair *hair = static_cast(geom); + if (hair->num_curves() == 0) { + return false; + } + + /*------------------------------------------------*/ + BVH_status( + "Building hair BLAS | %7d curves | %s", (int)hair->num_curves(), geom->name.c_str()); + /*------------------------------------------------*/ + + const bool use_fast_trace_bvh = (params.bvh_type == BVH_TYPE_STATIC); + const size_t num_segments = hair->num_segments(); + + size_t num_motion_steps = 1; + Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if (motion_blur && hair->get_use_motion_blur() && motion_keys) { + num_motion_steps = hair->get_motion_steps(); + } + + const size_t num_aabbs = num_segments * num_motion_steps; + + MTLResourceOptions storage_mode; + if (device.hasUnifiedMemory) { + storage_mode = MTLResourceStorageModeShared; + } + else { + storage_mode = MTLResourceStorageModeManaged; + } + + /* Allocate a GPU buffer for the AABB data and populate it */ + id aabbBuf = [device + newBufferWithLength:num_aabbs * sizeof(MTLAxisAlignedBoundingBox) + options:storage_mode]; + MTLAxisAlignedBoundingBox *aabb_data = (MTLAxisAlignedBoundingBox *)[aabbBuf contents]; + + /* Get AABBs for each motion step */ + size_t center_step = (num_motion_steps - 1) / 2; + for (size_t step = 0; step < num_motion_steps; ++step) { + /* The center step for motion vertices is not stored in the attribute */ + const float3 *keys = hair->get_curve_keys().data(); + if (step != center_step) { + size_t attr_offset = (step > center_step) ? 
step - 1 : step; + /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4) */ + keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size(); + } + + for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) { + const Hair::Curve curve = hair->get_curve(j); + + for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) { + { + BoundBox bounds = BoundBox::empty; + curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds); + + const size_t index = step * num_segments + i; + aabb_data[index].min = (MTLPackedFloat3 &)bounds.min; + aabb_data[index].max = (MTLPackedFloat3 &)bounds.max; + } + } + } + } + + if (storage_mode == MTLResourceStorageModeManaged) { + [aabbBuf didModifyRange:NSMakeRange(0, aabbBuf.length)]; + } + +# if 0 + for (size_t i=0; i aabb_ptrs; + aabb_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + MTLMotionKeyframeData *k = [MTLMotionKeyframeData data]; + k.buffer = aabbBuf; + k.offset = step * num_segments * sizeof(MTLAxisAlignedBoundingBox); + aabb_ptrs.push_back(k); + } + + MTLAccelerationStructureMotionBoundingBoxGeometryDescriptor *geomDescMotion = + [MTLAccelerationStructureMotionBoundingBoxGeometryDescriptor descriptor]; + geomDescMotion.boundingBoxBuffers = [NSArray arrayWithObjects:aabb_ptrs.data() + count:aabb_ptrs.size()]; + geomDescMotion.boundingBoxCount = num_segments; + geomDescMotion.boundingBoxStride = sizeof(aabb_data[0]); + geomDescMotion.intersectionFunctionTableOffset = 1; + + /* Force a single any-hit call, so shadow record-all behavior works correctly */ + /* (Match optix behaviour: unsigned int build_flags = + * OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;) */ + geomDescMotion.allowDuplicateIntersectionFunctionInvocation = false; + geomDescMotion.opaque = true; + geomDesc = geomDescMotion; + } + else { + MTLAccelerationStructureBoundingBoxGeometryDescriptor *geomDescNoMotion = + [MTLAccelerationStructureBoundingBoxGeometryDescriptor descriptor]; + geomDescNoMotion.boundingBoxBuffer = aabbBuf; + geomDescNoMotion.boundingBoxBufferOffset = 0; + geomDescNoMotion.boundingBoxCount = int(num_aabbs); + geomDescNoMotion.boundingBoxStride = sizeof(aabb_data[0]); + geomDescNoMotion.intersectionFunctionTableOffset = 1; + + /* Force a single any-hit call, so shadow record-all behavior works correctly */ + /* (Match optix behaviour: unsigned int build_flags = + * OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;) */ + geomDescNoMotion.allowDuplicateIntersectionFunctionInvocation = false; + geomDescNoMotion.opaque = true; + geomDesc = geomDescNoMotion; + } + + MTLPrimitiveAccelerationStructureDescriptor *accelDesc = + [MTLPrimitiveAccelerationStructureDescriptor descriptor]; + accelDesc.geometryDescriptors = @[ geomDesc ]; + + if (motion_blur) { + accelDesc.motionStartTime = 0.0f; + accelDesc.motionEndTime = 1.0f; + accelDesc.motionStartBorderMode = MTLMotionBorderModeVanish; + accelDesc.motionEndBorderMode = MTLMotionBorderModeVanish; + accelDesc.motionKeyframeCount = num_motion_steps; + } + + if (!use_fast_trace_bvh) { + accelDesc.usage |= (MTLAccelerationStructureUsageRefit | + MTLAccelerationStructureUsagePreferFastBuild); + } + + MTLAccelerationStructureSizes accelSizes = [device + accelerationStructureSizesWithDescriptor:accelDesc]; + id accel_uncompressed = [device + newAccelerationStructureWithSize:accelSizes.accelerationStructureSize]; + id scratchBuf = [device newBufferWithLength:accelSizes.buildScratchBufferSize + 
options:MTLResourceStorageModePrivate]; + id sizeBuf = [device newBufferWithLength:8 options:MTLResourceStorageModeShared]; + id accelCommands = [queue commandBuffer]; + id accelEnc = + [accelCommands accelerationStructureCommandEncoder]; + if (refit) { + [accelEnc refitAccelerationStructure:accel_struct + descriptor:accelDesc + destination:accel_uncompressed + scratchBuffer:scratchBuf + scratchBufferOffset:0]; + } + else { + [accelEnc buildAccelerationStructure:accel_uncompressed + descriptor:accelDesc + scratchBuffer:scratchBuf + scratchBufferOffset:0]; + } + if (use_fast_trace_bvh) { + [accelEnc writeCompactedAccelerationStructureSize:accel_uncompressed + toBuffer:sizeBuf + offset:0 + sizeDataType:MTLDataTypeULong]; + } + [accelEnc endEncoding]; + [accelCommands addCompletedHandler:^(id command_buffer) { + /* free temp resources */ + [scratchBuf release]; + [aabbBuf release]; + + if (use_fast_trace_bvh) { + /* Compact the accel structure */ + uint64_t compressed_size = *(uint64_t *)sizeBuf.contents; + + dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{ + id accelCommands = [queue commandBuffer]; + id accelEnc = + [accelCommands accelerationStructureCommandEncoder]; + id accel = [device + newAccelerationStructureWithSize:compressed_size]; + [accelEnc copyAndCompactAccelerationStructure:accel_uncompressed + toAccelerationStructure:accel]; + [accelEnc endEncoding]; + [accelCommands addCompletedHandler:^(id command_buffer) { + uint64_t allocated_size = [accel allocatedSize]; + stats.mem_alloc(allocated_size); + accel_struct = accel; + [accel_uncompressed release]; + accel_struct_building = false; + }]; + [accelCommands commit]; + }); + } + else { + /* set our acceleration structure to the uncompressed structure */ + accel_struct = accel_uncompressed; + + uint64_t allocated_size = [accel_struct allocatedSize]; + stats.mem_alloc(allocated_size); + accel_struct_building = false; + } + [sizeBuf release]; + }]; + + accel_struct_building = true; + [accelCommands commit]; + return true; + } + return false; +} + +bool BVHMetal::build_BLAS(Progress &progress, + id device, + id queue, + bool refit) +{ + if (@available(macos 12.0, *)) { + assert(objects.size() == 1 && geometry.size() == 1); + + /* Build bottom level acceleration structures (BLAS) */ + Geometry *const geom = geometry[0]; + switch (geom->geometry_type) { + case Geometry::VOLUME: + case Geometry::MESH: + return build_BLAS_mesh(progress, device, queue, geom, refit); + case Geometry::HAIR: + return build_BLAS_hair(progress, device, queue, geom, refit); + default: + return false; + } + } + return false; +} + +bool BVHMetal::build_TLAS(Progress &progress, + id device, + id queue, + bool refit) +{ + if (@available(macos 12.0, *)) { + + /* we need to sync here and ensure that all BLAS have completed async generation by both GCD + * and Metal */ + { + __block bool complete_bvh = false; + while (!complete_bvh) { + dispatch_sync(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{ + complete_bvh = true; + for (Object *ob : objects) { + /* Skip non-traceable objects */ + if (!ob->is_traceable()) + continue; + + Geometry const *geom = ob->get_geometry(); + BVHMetal const *blas = static_cast(geom->bvh); + if (blas->accel_struct_building) { + complete_bvh = false; + + /* We're likely waiting on a command buffer that's in flight to complete. 
+ * Queue up a command buffer and wait for it complete before checking the BLAS again + */ + id command_buffer = [queue commandBuffer]; + [command_buffer commit]; + [command_buffer waitUntilCompleted]; + break; + } + } + }); + } + } + + uint32_t num_instances = 0; + uint32_t num_motion_transforms = 0; + for (Object *ob : objects) { + /* Skip non-traceable objects */ + if (!ob->is_traceable()) + continue; + num_instances++; + + if (ob->use_motion()) { + num_motion_transforms += max(1, ob->get_motion().size()); + } + else { + num_motion_transforms++; + } + } + + /*------------------------------------------------*/ + BVH_status("Building TLAS | %7d instances", (int)num_instances); + /*------------------------------------------------*/ + + const bool use_fast_trace_bvh = (params.bvh_type == BVH_TYPE_STATIC); + + NSMutableArray *all_blas = [NSMutableArray array]; + unordered_map instance_mapping; + + /* Lambda function to build/retrieve the BLAS index mapping */ + auto get_blas_index = [&](BVHMetal const *blas) { + auto it = instance_mapping.find(blas); + if (it != instance_mapping.end()) { + return it->second; + } + else { + int blas_index = (int)[all_blas count]; + instance_mapping[blas] = blas_index; + if (@available(macos 12.0, *)) { + [all_blas addObject:blas->accel_struct]; + } + return blas_index; + } + }; + + MTLResourceOptions storage_mode; + if (device.hasUnifiedMemory) { + storage_mode = MTLResourceStorageModeShared; + } + else { + storage_mode = MTLResourceStorageModeManaged; + } + + size_t instance_size; + if (motion_blur) { + instance_size = sizeof(MTLAccelerationStructureMotionInstanceDescriptor); + } + else { + instance_size = sizeof(MTLAccelerationStructureUserIDInstanceDescriptor); + } + + /* Allocate a GPU buffer for the instance data and populate it */ + id instanceBuf = [device newBufferWithLength:num_instances * instance_size + options:storage_mode]; + id motion_transforms_buf = nil; + MTLPackedFloat4x3 *motion_transforms = nullptr; + if (motion_blur && num_motion_transforms) { + motion_transforms_buf = [device + newBufferWithLength:num_motion_transforms * sizeof(MTLPackedFloat4x3) + options:storage_mode]; + motion_transforms = (MTLPackedFloat4x3 *)motion_transforms_buf.contents; + } + + uint32_t instance_index = 0; + uint32_t motion_transform_index = 0; + for (Object *ob : objects) { + /* Skip non-traceable objects */ + if (!ob->is_traceable()) + continue; + + Geometry const *geom = ob->get_geometry(); + + BVHMetal const *blas = static_cast(geom->bvh); + uint32_t accel_struct_index = get_blas_index(blas); + + /* Add some of the object visibility bits to the mask. + * __prim_visibility contains the combined visibility bits of all instances, so is not + * reliable if they differ between instances. + * + * METAL_WIP: OptiX visibility mask can only contain 8 bits, so have to trade-off here + * and select just a few important ones. + */ + uint32_t mask = ob->visibility_for_tracing() & 0xFF; + + /* Have to have at least one bit in the mask, or else instance would always be culled. 
*/ + if (0 == mask) { + mask = 0xFF; + } + + /* Set user instance ID to object index */ + int object_index = ob->get_device_index(); + uint32_t user_id = uint32_t(object_index); + + /* Bake into the appropriate descriptor */ + if (motion_blur) { + MTLAccelerationStructureMotionInstanceDescriptor *instances = + (MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents]; + MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++]; + + desc.accelerationStructureIndex = accel_struct_index; + desc.userID = user_id; + desc.mask = mask; + desc.motionStartTime = 0.0f; + desc.motionEndTime = 1.0f; + desc.motionTransformsStartIndex = motion_transform_index; + desc.motionStartBorderMode = MTLMotionBorderModeVanish; + desc.motionEndBorderMode = MTLMotionBorderModeVanish; + desc.intersectionFunctionTableOffset = 0; + + int key_count = ob->get_motion().size(); + if (key_count) { + desc.motionTransformsCount = key_count; + + Transform *keys = ob->get_motion().data(); + for (int i = 0; i < key_count; i++) { + float *t = (float *)&motion_transforms[motion_transform_index++]; + /* Transpose transform */ + auto src = (float const *)&keys[i]; + for (int i = 0; i < 12; i++) { + t[i] = src[(i / 3) + 4 * (i % 3)]; + } + } + } + else { + desc.motionTransformsCount = 1; + + float *t = (float *)&motion_transforms[motion_transform_index++]; + if (ob->get_geometry()->is_instanced()) { + /* Transpose transform */ + auto src = (float const *)&ob->get_tfm(); + for (int i = 0; i < 12; i++) { + t[i] = src[(i / 3) + 4 * (i % 3)]; + } + } + else { + /* Clear transform to identity matrix */ + t[0] = t[4] = t[8] = 1.0f; + } + } + } + else { + MTLAccelerationStructureUserIDInstanceDescriptor *instances = + (MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents]; + MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++]; + + desc.accelerationStructureIndex = accel_struct_index; + desc.userID = user_id; + desc.mask = mask; + desc.intersectionFunctionTableOffset = 0; + + float *t = (float *)&desc.transformationMatrix; + if (ob->get_geometry()->is_instanced()) { + /* Transpose transform */ + auto src = (float const *)&ob->get_tfm(); + for (int i = 0; i < 12; i++) { + t[i] = src[(i / 3) + 4 * (i % 3)]; + } + } + else { + /* Clear transform to identity matrix */ + t[0] = t[4] = t[8] = 1.0f; + } + } + } + + if (storage_mode == MTLResourceStorageModeManaged) { + [instanceBuf didModifyRange:NSMakeRange(0, instanceBuf.length)]; + if (motion_transforms_buf) { + [motion_transforms_buf didModifyRange:NSMakeRange(0, motion_transforms_buf.length)]; + assert(num_motion_transforms == motion_transform_index); + } + } + + MTLInstanceAccelerationStructureDescriptor *accelDesc = + [MTLInstanceAccelerationStructureDescriptor descriptor]; + accelDesc.instanceCount = num_instances; + accelDesc.instanceDescriptorType = MTLAccelerationStructureInstanceDescriptorTypeUserID; + accelDesc.instanceDescriptorBuffer = instanceBuf; + accelDesc.instanceDescriptorBufferOffset = 0; + accelDesc.instanceDescriptorStride = instance_size; + accelDesc.instancedAccelerationStructures = all_blas; + + if (motion_blur) { + accelDesc.instanceDescriptorType = MTLAccelerationStructureInstanceDescriptorTypeMotion; + accelDesc.motionTransformBuffer = motion_transforms_buf; + accelDesc.motionTransformCount = num_motion_transforms; + } + + if (!use_fast_trace_bvh) { + accelDesc.usage |= (MTLAccelerationStructureUsageRefit | + MTLAccelerationStructureUsagePreferFastBuild); + } + + 
MTLAccelerationStructureSizes accelSizes = [device + accelerationStructureSizesWithDescriptor:accelDesc]; + id accel = [device + newAccelerationStructureWithSize:accelSizes.accelerationStructureSize]; + id scratchBuf = [device newBufferWithLength:accelSizes.buildScratchBufferSize + options:MTLResourceStorageModePrivate]; + id accelCommands = [queue commandBuffer]; + id accelEnc = + [accelCommands accelerationStructureCommandEncoder]; + if (refit) { + [accelEnc refitAccelerationStructure:accel_struct + descriptor:accelDesc + destination:accel + scratchBuffer:scratchBuf + scratchBufferOffset:0]; + } + else { + [accelEnc buildAccelerationStructure:accel + descriptor:accelDesc + scratchBuffer:scratchBuf + scratchBufferOffset:0]; + } + [accelEnc endEncoding]; + [accelCommands commit]; + [accelCommands waitUntilCompleted]; + + if (motion_transforms_buf) { + [motion_transforms_buf release]; + } + [instanceBuf release]; + [scratchBuf release]; + + uint64_t allocated_size = [accel allocatedSize]; + stats.mem_alloc(allocated_size); + + /* Cache top and bottom-level acceleration structs */ + accel_struct = accel; + blas_array.clear(); + blas_array.reserve(all_blas.count); + for (id blas in all_blas) { + blas_array.push_back(blas); + } + + return true; + } + return false; +} + +bool BVHMetal::build(Progress &progress, + id device, + id queue, + bool refit) +{ + if (@available(macos 12.0, *)) { + if (refit && params.bvh_type != BVH_TYPE_STATIC) { + assert(accel_struct); + } + else { + if (accel_struct) { + stats.mem_free(accel_struct.allocatedSize); + [accel_struct release]; + accel_struct = nil; + } + } + } + + if (!params.top_level) { + return build_BLAS(progress, device, queue, refit); + } + else { + return build_TLAS(progress, device, queue, refit); + } +} + +CCL_NAMESPACE_END + +#endif /* WITH_METAL */ diff --git a/intern/cycles/device/metal/device.h b/intern/cycles/device/metal/device.h new file mode 100644 index 00000000000..254fbfee42b --- /dev/null +++ b/intern/cycles/device/metal/device.h @@ -0,0 +1,37 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/string.h" +#include "util/vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_metal_init(); + +Device *device_metal_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_metal_info(vector &devices); + +string device_metal_capabilities(); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/metal/device.mm b/intern/cycles/device/metal/device.mm new file mode 100644 index 00000000000..bc893adea17 --- /dev/null +++ b/intern/cycles/device/metal/device.mm @@ -0,0 +1,136 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_METAL + +# include "device/metal/device.h" +# include "device/metal/device_impl.h" + +#endif + +#include "util/debug.h" +#include "util/set.h" +#include "util/system.h" + +CCL_NAMESPACE_BEGIN + +#ifdef WITH_METAL + +Device *device_metal_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ + return new MetalDevice(info, stats, profiler); +} + +bool device_metal_init() +{ + return true; +} + +static int device_metal_get_num_devices_safe(uint32_t *num_devices) +{ + *num_devices = MTLCopyAllDevices().count; + return 0; +} + +void device_metal_info(vector &devices) +{ + uint32_t num_devices = 0; + device_metal_get_num_devices_safe(&num_devices); + if (num_devices == 0) { + return; + } + + vector usable_devices; + MetalInfo::get_usable_devices(&usable_devices); + /* Devices are numbered consecutively across platforms. */ + set unique_ids; + int device_index = 0; + for (MetalPlatformDevice &device : usable_devices) { + /* Compute unique ID for persistent user preferences. */ + const string &device_name = device.device_name; + string id = string("METAL_") + device_name; + + /* Hardware ID might not be unique, add device number in that case. */ + if (unique_ids.find(id) != unique_ids.end()) { + id += string_printf("_ID_%d", num_devices); + } + unique_ids.insert(id); + + /* Create DeviceInfo. */ + DeviceInfo info; + info.type = DEVICE_METAL; + info.description = string_remove_trademark(string(device_name)); + + /* Ensure unique naming on Apple Silicon / SoC devices which return the same string for CPU and + * GPU */ + if (info.description == system_cpu_brand_string()) { + info.description += " (GPU)"; + } + + info.num = device_index; + /* We don't know if it's used for display, but assume it is. */ + info.display_device = true; + info.denoisers = DENOISER_NONE; + info.id = id; + + devices.push_back(info); + device_index++; + } +} + +string device_metal_capabilities() +{ + string result = ""; + string error_msg = ""; + uint32_t num_devices = 0; + assert(device_metal_get_num_devices_safe(&num_devices)); + if (num_devices == 0) { + return "No Metal devices found\n"; + } + result += string_printf("Number of devices: %u\n", num_devices); + + NSArray> *allDevices = MTLCopyAllDevices(); + for (id device in allDevices) { + result += string_printf("\t\tDevice: %s\n", [device.name UTF8String]); + } + + return result; +} + +#else + +Device *device_metal_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ + return nullptr; +} + +bool device_metal_init() +{ + return false; +} + +void device_metal_info(vector &devices) +{ +} + +string device_metal_capabilities() +{ + return ""; +} + +#endif + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h new file mode 100644 index 00000000000..a420a3ba704 --- /dev/null +++ b/intern/cycles/device/metal/device_impl.h @@ -0,0 +1,166 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_METAL + +# include "bvh/bvh.h" +# include "device/device.h" +# include "device/metal/bvh.h" +# include "device/metal/device.h" +# include "device/metal/kernel.h" +# include "device/metal/queue.h" +# include "device/metal/util.h" + +# include + +CCL_NAMESPACE_BEGIN + +class DeviceQueue; + +class MetalDevice : public Device { + public: + id mtlDevice = nil; + id mtlLibrary[PSO_NUM] = {nil}; + id mtlBufferKernelParamsEncoder = + nil; /* encoder used for fetching device pointers from MTLBuffers */ + id mtlGeneralCommandQueue = nil; + id mtlAncillaryArgEncoder = + nil; /* encoder used for fetching device pointers from MTLBuffers */ + string source_used_for_compile[PSO_NUM]; + + KernelParamsMetal launch_params = {0}; + + /* MetalRT members ----------------------------------*/ + BVHMetal *bvhMetalRT = nullptr; + bool motion_blur = false; + id mtlASArgEncoder = + nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */ + /*---------------------------------------------------*/ + + string device_name; + MetalGPUVendor device_vendor; + + uint kernel_features; + MTLResourceOptions default_storage_mode; + int max_threads_per_threadgroup; + + int mtlDevId = 0; + bool first_error = true; + + struct MetalMem { + device_memory *mem = nullptr; + int pointer_index = -1; + id mtlBuffer = nil; + id mtlTexture = nil; + uint64_t offset = 0; + uint64_t size = 0; + void *hostPtr = nullptr; + bool use_UMA = false; /* If true, UMA memory in shared_pointer is being used. 
*/ + }; + typedef map> MetalMemMap; + MetalMemMap metal_mem_map; + std::vector> delayed_free_list; + std::recursive_mutex metal_mem_map_mutex; + + /* Bindless Textures */ + device_vector texture_info; + bool need_texture_info; + id mtlTextureArgEncoder = nil; + id texture_bindings_2d = nil; + id texture_bindings_3d = nil; + std::vector> texture_slot_map; + + MetalDeviceKernels kernels; + bool use_metalrt = false; + bool use_function_specialisation = false; + + virtual BVHLayoutMask get_bvh_layout_mask() const override; + + void set_error(const string &error) override; + + MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); + + virtual ~MetalDevice(); + + bool support_device(const uint /*kernel_features*/); + + bool check_peer_access(Device *peer_device) override; + + bool use_adaptive_compilation(); + + string get_source(const uint kernel_features); + + string compile_kernel(const uint kernel_features, const char *name); + + virtual bool load_kernels(const uint kernel_features) override; + + void reserve_local_memory(const uint kernel_features); + + void init_host_memory(); + + void load_texture_info(); + + virtual bool should_use_graphics_interop() override; + + virtual unique_ptr gpu_queue_create() override; + + virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override; + + /* ------------------------------------------------------------------ */ + /* low-level memory management */ + + MetalMem *generic_alloc(device_memory &mem); + + void generic_copy_to(device_memory &mem); + + void generic_free(device_memory &mem); + + void mem_alloc(device_memory &mem) override; + + void mem_copy_to(device_memory &mem) override; + + void mem_copy_from(device_memory &mem) + { + mem_copy_from(mem, -1, -1, -1, -1); + } + void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override; + + void mem_zero(device_memory &mem) override; + + void mem_free(device_memory &mem) override; + + device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + void global_alloc(device_memory &mem); + + void global_free(device_memory &mem); + + void tex_alloc(device_texture &mem); + + void tex_alloc_as_buffer(device_texture &mem); + + void tex_free(device_texture &mem); + + void flush_delayed_free_list(); +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm new file mode 100644 index 00000000000..69a08ca07d1 --- /dev/null +++ b/intern/cycles/device/metal/device_impl.mm @@ -0,0 +1,1008 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_METAL + +# include "device/metal/device_impl.h" +# include "device/metal/device.h" + +# include "util/debug.h" +# include "util/md5.h" +# include "util/path.h" + +CCL_NAMESPACE_BEGIN + +class MetalDevice; + +BVHLayoutMask MetalDevice::get_bvh_layout_mask() const +{ + return use_metalrt ? BVH_LAYOUT_METAL : BVH_LAYOUT_BVH2; +} + +void MetalDevice::set_error(const string &error) +{ + static std::mutex s_error_mutex; + std::lock_guard lock(s_error_mutex); + + Device::set_error(error); + + if (first_error) { + fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); + fprintf(stderr, + "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); + first_error = false; + } +} + +MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + mtlDevId = info.num; + + /* select chosen device */ + vector usable_devices; + MetalInfo::get_usable_devices(&usable_devices); + if (usable_devices.size() == 0) { + set_error("Metal: no devices found."); + return; + } + assert(mtlDevId < usable_devices.size()); + MetalPlatformDevice &platform_device = usable_devices[mtlDevId]; + mtlDevice = platform_device.device_id; + device_name = platform_device.device_name; + device_vendor = MetalInfo::get_vendor_from_device_name(device_name); + assert(device_vendor != METAL_GPU_UNKNOWN); + metal_printf("Creating new Cycles device for Metal: %s\n", device_name.c_str()); + + /* determine default storage mode based on whether UMA is supported */ + + default_storage_mode = MTLResourceStorageModeManaged; + + if (@available(macos 11.0, *)) { + if ([mtlDevice hasUnifiedMemory]) { + default_storage_mode = MTLResourceStorageModeShared; + init_host_memory(); + } + } + + texture_bindings_2d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode]; + texture_bindings_3d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode]; + + stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize); + + switch (device_vendor) { + default: + break; + case METAL_GPU_INTEL: { + use_metalrt = false; + max_threads_per_threadgroup = 64; + break; + } + case METAL_GPU_AMD: { + use_metalrt = false; + max_threads_per_threadgroup = 128; + break; + } + case METAL_GPU_APPLE: { + use_metalrt = true; + max_threads_per_threadgroup = 512; + break; + } + } + + if (auto metalrt = getenv("CYCLES_METALRT")) { + use_metalrt = (atoi(metalrt) != 0); + } + + MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init]; + arg_desc_params.dataType = MTLDataTypePointer; + arg_desc_params.access = MTLArgumentAccessReadOnly; + arg_desc_params.arrayLength = sizeof(KernelParamsMetal) / sizeof(device_ptr); + mtlBufferKernelParamsEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_params ]]; + + MTLArgumentDescriptor *arg_desc_texture = [[MTLArgumentDescriptor alloc] init]; + arg_desc_texture.dataType = MTLDataTypeTexture; + arg_desc_texture.access = MTLArgumentAccessReadOnly; + mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]]; + + /* command queue for non-tracing work on the GPU */ + mtlGeneralCommandQueue = [mtlDevice newCommandQueue]; + + /* Acceleration structure arg encoder, if needed */ + if (@available(macos 12.0, *)) { + if (use_metalrt) { + MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init]; + arg_desc_as.dataType = 
MTLDataTypeInstanceAccelerationStructure; + arg_desc_as.access = MTLArgumentAccessReadOnly; + mtlASArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_as ]]; + [arg_desc_as release]; + } + } + + /* Build the arg encoder for the ancillary bindings */ + { + NSMutableArray *ancillary_desc = [[NSMutableArray alloc] init]; + + int index = 0; + MTLArgumentDescriptor *arg_desc_tex = [[MTLArgumentDescriptor alloc] init]; + arg_desc_tex.dataType = MTLDataTypePointer; + arg_desc_tex.access = MTLArgumentAccessReadOnly; + + arg_desc_tex.index = index++; + [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_2d */ + arg_desc_tex.index = index++; + [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_3d */ + + [arg_desc_tex release]; + + if (@available(macos 12.0, *)) { + if (use_metalrt) { + MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init]; + arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure; + arg_desc_as.access = MTLArgumentAccessReadOnly; + + MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init]; + arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable; + arg_desc_ift.access = MTLArgumentAccessReadOnly; + + arg_desc_as.index = index++; + [ancillary_desc addObject:[arg_desc_as copy]]; /* accel_struct */ + arg_desc_ift.index = index++; + [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_default */ + arg_desc_ift.index = index++; + [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */ + arg_desc_ift.index = index++; + [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */ + + [arg_desc_ift release]; + [arg_desc_as release]; + } + } + + mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc]; + + for (int i = 0; i < ancillary_desc.count; i++) { + [ancillary_desc[i] release]; + } + [ancillary_desc release]; + } + [arg_desc_params release]; + [arg_desc_texture release]; +} + +MetalDevice::~MetalDevice() +{ + for (auto &tex : texture_slot_map) { + if (tex) { + [tex release]; + tex = nil; + } + } + flush_delayed_free_list(); + + if (texture_bindings_2d) { + stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize); + + [texture_bindings_2d release]; + [texture_bindings_3d release]; + } + [mtlTextureArgEncoder release]; + [mtlBufferKernelParamsEncoder release]; + [mtlASArgEncoder release]; + [mtlAncillaryArgEncoder release]; + [mtlGeneralCommandQueue release]; + [mtlDevice release]; + + texture_info.free(); +} + +bool MetalDevice::support_device(const uint kernel_features /*requested_features*/) +{ + return true; +} + +bool MetalDevice::check_peer_access(Device *peer_device) +{ + assert(0); + /* does peer access make sense? 
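+   * Peer-to-peer access (one device directly reading another device's
+   * memory) is not implemented for the Metal backend, so this stub
+   * always answers no.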
*/ + return false; +} + +bool MetalDevice::use_adaptive_compilation() +{ + return DebugFlags().metal.adaptive_compile; +} + +string MetalDevice::get_source(const uint kernel_features) +{ + string build_options; + + if (use_adaptive_compilation()) { + build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + } + + if (use_metalrt) { + build_options += "-D__METALRT__ "; + if (motion_blur) { + build_options += "-D__METALRT_MOTION__ "; + } + } + +# ifdef WITH_CYCLES_DEBUG + build_options += "-D__KERNEL_DEBUG__ "; +# endif + + switch (device_vendor) { + default: + break; + case METAL_GPU_INTEL: + build_options += "-D__KERNEL_METAL_INTEL__ "; + break; + case METAL_GPU_AMD: + build_options += "-D__KERNEL_METAL_AMD__ "; + break; + case METAL_GPU_APPLE: + build_options += "-D__KERNEL_METAL_APPLE__ "; + break; + } + + /* reformat -D defines list into compilable form */ + vector components; + string_replace(build_options, "-D", ""); + string_split(components, build_options, " "); + + string globalDefines; + for (const string &component : components) { + vector assignments; + string_split(assignments, component, "="); + if (assignments.size() == 2) + globalDefines += string_printf( + "#define %s %s\n", assignments[0].c_str(), assignments[1].c_str()); + else + globalDefines += string_printf("#define %s\n", assignments[0].c_str()); + } + + string source = globalDefines + "\n#include \"kernel/device/metal/kernel.metal\"\n"; + source = path_source_replace_includes(source, path_get("source")); + + metal_printf("Global defines:\n%s\n", globalDefines.c_str()); + + return source; +} + +bool MetalDevice::load_kernels(const uint _kernel_features) +{ + kernel_features = _kernel_features; + + /* check if GPU is supported */ + if (!support_device(kernel_features)) + return false; + + /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds + * This is necessary since objects may be reported to have motion if the Vector pass is + * active, but may still need to be rendered without motion blur if that isn't active as well. */ + motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION; + + NSError *error = NULL; + + for (int i = 0; i < PSO_NUM; i++) { + if (mtlLibrary[i]) { + [mtlLibrary[i] release]; + mtlLibrary[i] = nil; + } + } + + MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; + + options.fastMathEnabled = YES; + options.languageVersion = MTLLanguageVersion2_1; + + if (@available(macOS 11.0, *)) { + options.languageVersion = MTLLanguageVersion2_3; + } + if (@available(macOS 12.0, *)) { + options.languageVersion = MTLLanguageVersion2_4; + } + + string metalsrc; + + /* local helper: dump source to disk and return filepath */ + auto dump_source = [&](int kernel_type) -> string { + string &source = source_used_for_compile[kernel_type]; + string metalsrc = path_cache_get(path_join("kernels", + string_printf("%s.%s.metal", + kernel_type_as_string(kernel_type), + util_md5_string(source).c_str()))); + path_write_text(metalsrc, source); + return metalsrc; + }; + + /* local helper: fetch the kernel source code, adjust it for specific PSO_.. 
kernel_type flavor,
+   * then compile it into a MTLLibrary */
+  auto fetch_and_compile_source = [&](int kernel_type) {
+    /* record the source used to compile this library, for hash building later */
+    string &source = source_used_for_compile[kernel_type];
+
+    switch (kernel_type) {
+      case PSO_GENERIC: {
+        source = get_source(kernel_features);
+        break;
+      }
+      case PSO_SPECIALISED: {
+        /* PSO_SPECIALISED derives from PSO_GENERIC */
+        string &generic_source = source_used_for_compile[PSO_GENERIC];
+        if (generic_source.empty()) {
+          generic_source = get_source(kernel_features);
+        }
+        source = "#define __KERNEL_METAL_USE_FUNCTION_SPECIALISATION__\n" + generic_source;
+        break;
+      }
+      default:
+        assert(0);
+    }
+
+    /* create MTLLibrary (front-end compilation) */
+    mtlLibrary[kernel_type] = [mtlDevice newLibraryWithSource:@(source.c_str())
+                                                      options:options
+                                                        error:&error];
+
+    bool do_source_dump = (getenv("CYCLES_METAL_DUMP_SOURCE") != nullptr);
+
+    if (!mtlLibrary[kernel_type] || do_source_dump) {
+      string metalsrc = dump_source(kernel_type);
+
+      if (!mtlLibrary[kernel_type]) {
+        NSString *err = [error localizedDescription];
+        set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
+
+        return false;
+      }
+    }
+    return true;
+  };
+
+  fetch_and_compile_source(PSO_GENERIC);
+
+  if (use_function_specialisation) {
+    fetch_and_compile_source(PSO_SPECIALISED);
+  }
+
+  metal_printf("Front-end compilation finished\n");
+
+  bool result = kernels.load(this, PSO_GENERIC);
+
+  [options release];
+  reserve_local_memory(kernel_features);
+
+  return result;
+}
+
+void MetalDevice::reserve_local_memory(const uint kernel_features)
+{
+  /* METAL_WIP - implement this */
+}
+
+void MetalDevice::init_host_memory()
+{
+  /* METAL_WIP - implement this */
+}
+
+void MetalDevice::load_texture_info()
+{
+  if (need_texture_info) {
+    /* Unset flag before copying. */
+    need_texture_info = false;
+    texture_info.copy_to_device();
+
+    int num_textures = texture_info.size();
+
+    for (int tex = 0; tex < num_textures; tex++) {
+      uint64_t offset = tex * sizeof(void *);
+
+      id<MTLTexture> metal_texture = texture_slot_map[tex];
+      if (!metal_texture) {
+        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
+        [mtlTextureArgEncoder setTexture:nil atIndex:0];
+        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
+        [mtlTextureArgEncoder setTexture:nil atIndex:0];
+      }
+      else {
+        MTLTextureType type = metal_texture.textureType;
+        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
+        [mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
+        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
+        [mtlTextureArgEncoder setTexture:type == MTLTextureType3D ?
metal_texture : nil atIndex:0]; + } + } + if (default_storage_mode == MTLResourceStorageModeManaged) { + [texture_bindings_2d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))]; + [texture_bindings_3d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))]; + } + } +} + +MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem) +{ + size_t size = mem.memory_size(); + + mem.device_pointer = 0; + + id metal_buffer = nil; + if (size > 0) { + MTLResourceOptions options = default_storage_mode; + if (mem.type == MEM_DEVICE_ONLY) { + options = MTLResourceStorageModePrivate; + } + + metal_buffer = [mtlDevice newBufferWithLength:size options:options]; + + if (!metal_buffer) { + set_error("System is out of GPU memory"); + return nullptr; + } + } + + if (mem.name) { + VLOG(2) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + mem.device_size = metal_buffer.allocatedSize; + stats.mem_alloc(mem.device_size); + + metal_buffer.label = [[NSString alloc] initWithFormat:@"%s", mem.name]; + + std::lock_guard lock(metal_mem_map_mutex); + + assert(metal_mem_map.count(&mem) == 0); /* assert against double-alloc */ + MetalMem *mmem = new MetalMem; + metal_mem_map[&mem] = std::unique_ptr(mmem); + + mmem->mem = &mem; + mmem->mtlBuffer = metal_buffer; + mmem->offset = 0; + mmem->size = size; + if (mem.type != MEM_DEVICE_ONLY) { + mmem->hostPtr = [metal_buffer contents]; + } + else { + mmem->hostPtr = nullptr; + } + + /* encode device_pointer as (MetalMem*) in order to handle resource relocation and device pointer + * recalculation */ + mem.device_pointer = device_ptr(mmem); + + if (metal_buffer.storageMode == MTLResourceStorageModeShared) { + /* Replace host pointer with our host allocation. */ + + if (mem.host_pointer && mem.host_pointer != mmem->hostPtr) { + memcpy(mmem->hostPtr, mem.host_pointer, size); + + mem.host_free(); + mem.host_pointer = mmem->hostPtr; + } + mem.shared_pointer = mmem->hostPtr; + mem.shared_counter++; + mmem->use_UMA = true; + } + else { + mmem->use_UMA = false; + } + + return mmem; +} + +void MetalDevice::generic_copy_to(device_memory &mem) +{ + if (!mem.host_pointer || !mem.device_pointer) { + return; + } + + std::lock_guard lock(metal_mem_map_mutex); + if (!metal_mem_map.at(&mem)->use_UMA || mem.host_pointer != mem.shared_pointer) { + MetalMem &mmem = *metal_mem_map.at(&mem); + memcpy(mmem.hostPtr, mem.host_pointer, mem.memory_size()); + if (mmem.mtlBuffer.storageMode == MTLStorageModeManaged) { + [mmem.mtlBuffer didModifyRange:NSMakeRange(0, mem.memory_size())]; + } + } +} + +void MetalDevice::generic_free(device_memory &mem) +{ + if (mem.device_pointer) { + std::lock_guard lock(metal_mem_map_mutex); + MetalMem &mmem = *metal_mem_map.at(&mem); + size_t size = mmem.size; + + /* If mmem.use_uma is true, reference counting is used + * to safely free memory. */ + + bool free_mtlBuffer = false; + + if (mmem.use_UMA) { + assert(mem.shared_pointer); + if (mem.shared_pointer) { + assert(mem.shared_counter > 0); + if (--mem.shared_counter == 0) { + free_mtlBuffer = true; + } + } + } + else { + free_mtlBuffer = true; + } + + if (free_mtlBuffer) { + if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) { + /* Safely move the device-side data back to the host before it is freed. 
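+       * With unified memory the host pointer aliases the MTLBuffer contents,
+       * so a private host allocation is made and the data copied out before
+       * the buffer is handed to the delayed-free list.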
*/ + mem.host_pointer = mem.host_alloc(size); + memcpy(mem.host_pointer, mem.shared_pointer, size); + mmem.use_UMA = false; + } + + mem.shared_pointer = 0; + + /* Free device memory. */ + delayed_free_list.push_back(mmem.mtlBuffer); + mmem.mtlBuffer = nil; + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + metal_mem_map.erase(&mem); + } +} + +void MetalDevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + generic_alloc(mem); + } + else { + generic_alloc(mem); + } +} + +void MetalDevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) { + generic_alloc(mem); + } + generic_copy_to(mem); + } +} + +void MetalDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) +{ + if (mem.host_pointer) { + + bool subcopy = (w >= 0 && h >= 0); + const size_t size = subcopy ? (elem * w * h) : mem.memory_size(); + const size_t offset = subcopy ? (elem * y * w) : 0; + + if (mem.device_pointer) { + std::lock_guard lock(metal_mem_map_mutex); + MetalMem &mmem = *metal_mem_map.at(&mem); + + if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) { + + id cmdBuffer = [mtlGeneralCommandQueue commandBuffer]; + id blitEncoder = [cmdBuffer blitCommandEncoder]; + [blitEncoder synchronizeResource:mmem.mtlBuffer]; + [blitEncoder endEncoding]; + [cmdBuffer commit]; + [cmdBuffer waitUntilCompleted]; + } + + if (mem.host_pointer != mmem.hostPtr) { + memcpy((uchar *)mem.host_pointer + offset, (uchar *)mmem.hostPtr + offset, size); + } + } + else { + memset((char *)mem.host_pointer + offset, 0, size); + } + } +} + +void MetalDevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + if (!mem.device_pointer) { + return; + } + + size_t size = mem.memory_size(); + std::lock_guard lock(metal_mem_map_mutex); + MetalMem &mmem = *metal_mem_map.at(&mem); + memset(mmem.hostPtr, 0, size); + if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) { + [mmem.mtlBuffer didModifyRange:NSMakeRange(0, size)]; + } +} + +void MetalDevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else { + generic_free(mem); + } +} + +device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) +{ + /* METAL_WIP - revive if necessary */ + assert(0); + return 0; +} + +void MetalDevice::const_copy_to(const char *name, void *host, size_t size) +{ + if (strcmp(name, "__data") == 0) { + assert(size == sizeof(KernelData)); + memcpy((uint8_t *)&launch_params + offsetof(KernelParamsMetal, data), host, size); + return; + } + + auto update_launch_pointers = + [&](size_t offset, void *data, size_t data_size, size_t pointers_size) { + memcpy((uint8_t *)&launch_params + offset, data, data_size); + + MetalMem **mmem = (MetalMem **)data; + int pointer_count = pointers_size / sizeof(device_ptr); + int pointer_index = offset / sizeof(device_ptr); + for (int i = 0; i < pointer_count; i++) { + if (mmem[i]) { + mmem[i]->pointer_index = pointer_index + i; + } + } + }; + + /* Update data storage pointers in launch parameters. 
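+   * KernelParamsMetal stores one device_ptr-sized entry per named buffer, so
+   * each KERNEL_TEX name below maps to a fixed offset that is patched
+   * individually.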
*/ + if (strcmp(name, "__integrator_state") == 0) { + /* IntegratorStateGPU is contiguous pointers */ + const size_t pointer_block_size = sizeof(IntegratorStateGPU); + update_launch_pointers( + offsetof(KernelParamsMetal, __integrator_state), host, size, pointer_block_size); + } +# define KERNEL_TEX(data_type, tex_name) \ + else if (strcmp(name, #tex_name) == 0) \ + { \ + update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \ + } +# include "kernel/textures.h" +# undef KERNEL_TEX +} + +void MetalDevice::global_alloc(device_memory &mem) +{ + if (mem.is_resident(this)) { + generic_alloc(mem); + generic_copy_to(mem); + } + + const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer)); +} + +void MetalDevice::global_free(device_memory &mem) +{ + if (mem.is_resident(this) && mem.device_pointer) { + generic_free(mem); + } +} + +void MetalDevice::tex_alloc_as_buffer(device_texture &mem) +{ + generic_alloc(mem); + generic_copy_to(mem); + + /* Resize once */ + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(round_up(slot + 1, 128)); + } + + mem.info.data = (uint64_t)mem.device_pointer; + + /* Set Mapping and tag that we need to (re-)upload to device */ + texture_info[slot] = mem.info; + need_texture_info = true; +} + +void MetalDevice::tex_alloc(device_texture &mem) +{ + MTLStorageMode storage_mode = MTLStorageModeManaged; + if (@available(macos 10.15, *)) { + if ([mtlDevice hasUnifiedMemory] && + device_vendor != + METAL_GPU_INTEL) { /* Intel GPUs don't support MTLStorageModeShared for MTLTextures */ + storage_mode = MTLStorageModeShared; + } + } + + /* General variables for both architectures */ + string bind_name = mem.name; + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + /* sampler_index maps into the GPU's constant 'metal_samplers' array */ + uint64_t sampler_index = mem.info.extension; + if (mem.info.interpolation != INTERPOLATION_CLOSEST) { + sampler_index += 3; + } + + /* Image Texture Storage */ + MTLPixelFormat format; + switch (mem.data_type) { + case TYPE_UCHAR: { + MTLPixelFormat formats[] = {MTLPixelFormatR8Unorm, + MTLPixelFormatRG8Unorm, + MTLPixelFormatInvalid, + MTLPixelFormatRGBA8Unorm}; + format = formats[mem.data_elements - 1]; + } break; + case TYPE_UINT16: { + MTLPixelFormat formats[] = {MTLPixelFormatR16Unorm, + MTLPixelFormatRG16Unorm, + MTLPixelFormatInvalid, + MTLPixelFormatRGBA16Unorm}; + format = formats[mem.data_elements - 1]; + } break; + case TYPE_UINT: { + MTLPixelFormat formats[] = {MTLPixelFormatR32Uint, + MTLPixelFormatRG32Uint, + MTLPixelFormatInvalid, + MTLPixelFormatRGBA32Uint}; + format = formats[mem.data_elements - 1]; + } break; + case TYPE_INT: { + MTLPixelFormat formats[] = {MTLPixelFormatR32Sint, + MTLPixelFormatRG32Sint, + MTLPixelFormatInvalid, + MTLPixelFormatRGBA32Sint}; + format = formats[mem.data_elements - 1]; + } break; + case TYPE_FLOAT: { + MTLPixelFormat formats[] = {MTLPixelFormatR32Float, + MTLPixelFormatRG32Float, + MTLPixelFormatInvalid, + MTLPixelFormatRGBA32Float}; + format = formats[mem.data_elements - 1]; + } break; + case TYPE_HALF: { + MTLPixelFormat formats[] = {MTLPixelFormatR16Float, + MTLPixelFormatRG16Float, + MTLPixelFormatInvalid, + MTLPixelFormatRGBA16Float}; + format = formats[mem.data_elements - 1]; + } break; + default: + assert(0); + return; + } + + assert(format != MTLPixelFormatInvalid); + + id mtlTexture = nil; + size_t 
src_pitch = mem.data_width * dsize * mem.data_elements; + + if (mem.data_depth > 1) { + /* 3D texture using array */ + MTLTextureDescriptor *desc; + + desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format + width:mem.data_width + height:mem.data_height + mipmapped:NO]; + + desc.storageMode = storage_mode; + desc.usage = MTLTextureUsageShaderRead; + + desc.textureType = MTLTextureType3D; + desc.depth = mem.data_depth; + + VLOG(2) << "Texture 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + mtlTexture = [mtlDevice newTextureWithDescriptor:desc]; + assert(mtlTexture); + + if (!mtlTexture) { + return; + } + + const size_t imageBytes = src_pitch * mem.data_height; + for (size_t d = 0; d < mem.data_depth; d++) { + const size_t offset = d * imageBytes; + [mtlTexture replaceRegion:MTLRegionMake3D(0, 0, d, mem.data_width, mem.data_height, 1) + mipmapLevel:0 + slice:0 + withBytes:(uint8_t *)mem.host_pointer + offset + bytesPerRow:src_pitch + bytesPerImage:0]; + } + } + else if (mem.data_height > 0) { + /* 2D texture */ + MTLTextureDescriptor *desc; + + desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format + width:mem.data_width + height:mem.data_height + mipmapped:NO]; + + desc.storageMode = storage_mode; + desc.usage = MTLTextureUsageShaderRead; + + VLOG(2) << "Texture 2D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + mtlTexture = [mtlDevice newTextureWithDescriptor:desc]; + assert(mtlTexture); + + [mtlTexture replaceRegion:MTLRegionMake2D(0, 0, mem.data_width, mem.data_height) + mipmapLevel:0 + withBytes:mem.host_pointer + bytesPerRow:src_pitch]; + } + else { + assert(0); + /* 1D texture, using linear memory. */ + } + + mem.device_pointer = (device_ptr)mtlTexture; + mem.device_size = size; + stats.mem_alloc(size); + + std::lock_guard lock(metal_mem_map_mutex); + MetalMem *mmem = new MetalMem; + metal_mem_map[&mem] = std::unique_ptr(mmem); + mmem->mem = &mem; + mmem->mtlTexture = mtlTexture; + + /* Resize once */ + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. 
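+     * Growth happens in steps of 128 slots; the 2D/3D texture argument
+     * buffers are re-created below whenever they become too small for the
+     * new slot count.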
*/ + texture_info.resize(slot + 128); + texture_slot_map.resize(slot + 128); + + ssize_t min_buffer_length = sizeof(void *) * texture_info.size(); + if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) { + if (texture_bindings_2d) { + delayed_free_list.push_back(texture_bindings_2d); + delayed_free_list.push_back(texture_bindings_3d); + + stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize); + } + texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length + options:default_storage_mode]; + texture_bindings_3d = [mtlDevice newBufferWithLength:min_buffer_length + options:default_storage_mode]; + + stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize); + } + } + + /* optimise the texture for GPU access */ + id commandBuffer = [mtlGeneralCommandQueue commandBuffer]; + id blitCommandEncoder = [commandBuffer blitCommandEncoder]; + [blitCommandEncoder optimizeContentsForGPUAccess:mtlTexture]; + [blitCommandEncoder endEncoding]; + [commandBuffer commit]; + + /* Set Mapping and tag that we need to (re-)upload to device */ + texture_slot_map[slot] = mtlTexture; + texture_info[slot] = mem.info; + need_texture_info = true; + + texture_info[slot].data = uint64_t(slot) | (sampler_index << 32); +} + +void MetalDevice::tex_free(device_texture &mem) +{ + if (metal_mem_map.count(&mem)) { + std::lock_guard lock(metal_mem_map_mutex); + MetalMem &mmem = *metal_mem_map.at(&mem); + + assert(texture_slot_map[mem.slot] == mmem.mtlTexture); + texture_slot_map[mem.slot] = nil; + + if (mmem.mtlTexture) { + /* Free bindless texture. */ + delayed_free_list.push_back(mmem.mtlTexture); + mmem.mtlTexture = nil; + } + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + metal_mem_map.erase(&mem); + } +} + +unique_ptr MetalDevice::gpu_queue_create() +{ + return make_unique(this); +} + +bool MetalDevice::should_use_graphics_interop() +{ + /* METAL_WIP - provide fast interop */ + return false; +} + +void MetalDevice::flush_delayed_free_list() +{ + /* free any Metal buffers that may have been freed by host while a command + * buffer was being generated. This function should be called after each + * completion of a command buffer */ + std::lock_guard lock(metal_mem_map_mutex); + for (auto &it : delayed_free_list) { + [it release]; + } + delayed_free_list.clear(); +} + +void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) +{ + if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) { + Device::build_bvh(bvh, progress, refit); + return; + } + + BVHMetal *bvh_metal = static_cast(bvh); + bvh_metal->motion_blur = motion_blur; + if (bvh_metal->build(progress, mtlDevice, mtlGeneralCommandQueue, refit)) { + + if (@available(macos 11.0, *)) { + if (bvh->params.top_level) { + bvhMetalRT = bvh_metal; + } + } + } +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h new file mode 100644 index 00000000000..d4941627fd8 --- /dev/null +++ b/intern/cycles/device/metal/kernel.h @@ -0,0 +1,168 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_METAL
+
+# include "device/kernel.h"
+# include <Metal/Metal.h>
+
+CCL_NAMESPACE_BEGIN
+
+class MetalDevice;
+
+enum {
+  METALRT_FUNC_DEFAULT_TRI,
+  METALRT_FUNC_DEFAULT_BOX,
+  METALRT_FUNC_SHADOW_TRI,
+  METALRT_FUNC_SHADOW_BOX,
+  METALRT_FUNC_LOCAL_TRI,
+  METALRT_FUNC_LOCAL_BOX,
+  METALRT_FUNC_CURVE_RIBBON,
+  METALRT_FUNC_CURVE_RIBBON_SHADOW,
+  METALRT_FUNC_CURVE_ALL,
+  METALRT_FUNC_CURVE_ALL_SHADOW,
+  METALRT_FUNC_NUM
+};
+
+enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
+
+/* Pipeline State Object types */
+enum {
+  /* A kernel that can be used with all scenes, supporting all features.
+   * It is slow to compile, but only needs to be compiled once and is then
+   * cached for future render sessions. This allows a render to get underway
+   * on the GPU quickly.
+   */
+  PSO_GENERIC,
+
+  /* A kernel that is relatively quick to compile, but is specialised for the
+   * scene being rendered. It contains only the functionality needed for the
+   * current scene, and even bakes constant values in, which means it must be
+   * recompiled whenever a dependent setting is changed. In return, its render
+   * performance is significantly faster, justifying the extra compile time.
+   */
+  /* METAL_WIP: This isn't used and will require more changes to enable. */
+  PSO_SPECIALISED,
+
+  PSO_NUM
+};
+
+const char *kernel_type_as_string(int kernel_type);
+
+struct MetalKernelPipeline {
+  void release()
+  {
+    if (pipeline) {
+      [pipeline release];
+      pipeline = nil;
+      if (@available(macOS 11.0, *)) {
+        for (int i = 0; i < METALRT_TABLE_NUM; i++) {
+          if (intersection_func_table[i]) {
+            [intersection_func_table[i] release];
+            intersection_func_table[i] = nil;
+          }
+        }
+      }
+    }
+    if (function) {
+      [function release];
+      function = nil;
+    }
+    if (@available(macOS 11.0, *)) {
+      for (int i = 0; i < METALRT_TABLE_NUM; i++) {
+        if (intersection_func_table[i]) {
+          [intersection_func_table[i] release];
+        }
+      }
+    }
+  }
+
+  bool loaded = false;
+  id<MTLFunction> function = nil;
+  id<MTLComputePipelineState> pipeline = nil;
+
+  API_AVAILABLE(macos(11.0))
+  id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
+};
+
+struct MetalKernelLoadDesc {
+  int pso_index = 0;
+  const char *function_name = nullptr;
+  int kernel_index = 0;
+  int threads_per_threadgroup = 0;
+  MTLFunctionConstantValues *constant_values = nullptr;
+  NSArray *linked_functions = nullptr;
+
+  struct IntersectorFunctions {
+    NSArray *defaults;
+    NSArray *shadow;
+    NSArray *local;
+    NSArray *operator[](int index) const
+    {
+      if (index == METALRT_TABLE_DEFAULT)
+        return defaults;
+      if (index == METALRT_TABLE_SHADOW)
+        return shadow;
+      return local;
+    }
+  } intersector_functions = {nullptr};
+};
+
+/* Metal kernel and associated occupancy information.
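+ * Each DeviceKernel owns one pipeline per PSO_... variant; get_pso() prefers
+ * the specialised pipeline once it has loaded, falling back to the generic one.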
*/ +class MetalDeviceKernel { + public: + ~MetalDeviceKernel(); + + bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5); + + void mark_loaded(int pso_index) + { + pso[pso_index].loaded = true; + } + + int get_num_threads_per_block() const + { + return num_threads_per_block; + } + const MetalKernelPipeline &get_pso() const; + + double load_duration = 0.0; + + private: + MetalKernelPipeline pso[PSO_NUM]; + + int num_threads_per_block = 0; +}; + +/* Cache of Metal kernels for each DeviceKernel. */ +class MetalDeviceKernels { + public: + bool load(MetalDevice *device, int kernel_type); + bool available(DeviceKernel kernel) const; + const MetalDeviceKernel &get(DeviceKernel kernel) const; + + MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM]; + + id rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}}; + + string loaded_md5[PSO_NUM]; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_METAL */ diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm new file mode 100644 index 00000000000..9cbad10bf5a --- /dev/null +++ b/intern/cycles/device/metal/kernel.mm @@ -0,0 +1,523 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_METAL + +# include "device/metal/kernel.h" +# include "device/metal/device_impl.h" +# include "util/md5.h" +# include "util/path.h" +# include "util/tbb.h" +# include "util/time.h" + +CCL_NAMESPACE_BEGIN + +/* limit to 2 MTLCompiler instances */ +int max_mtlcompiler_threads = 2; + +const char *kernel_type_as_string(int kernel_type) +{ + switch (kernel_type) { + case PSO_GENERIC: + return "PSO_GENERIC"; + case PSO_SPECIALISED: + return "PSO_SPECIALISED"; + default: + assert(0); + } + return ""; +} + +MetalDeviceKernel::~MetalDeviceKernel() +{ + for (int i = 0; i < PSO_NUM; i++) { + pso[i].release(); + } +} + +bool MetalDeviceKernel::load(MetalDevice *device, + MetalKernelLoadDesc const &desc_in, + MD5Hash const &md5) +{ + __block MetalKernelLoadDesc const desc(desc_in); + if (desc.kernel_index == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + /* skip megakernel */ + return true; + } + + bool use_binary_archive = true; + if (getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { + use_binary_archive = false; + } + + id archive = nil; + string metalbin_path; + if (use_binary_archive) { + NSProcessInfo *processInfo = [NSProcessInfo processInfo]; + string osVersion = [[processInfo operatingSystemVersionString] UTF8String]; + MD5Hash local_md5(md5); + local_md5.append(osVersion); + string metalbin_name = string(desc.function_name) + "." 
+ local_md5.get_hex() + + to_string(desc.pso_index) + ".bin"; + metalbin_path = path_cache_get(path_join("kernels", metalbin_name)); + path_create_directories(metalbin_path); + + if (path_exists(metalbin_path) && use_binary_archive) { + if (@available(macOS 11.0, *)) { + MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init]; + archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())]; + archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil]; + [archiveDesc release]; + } + } + } + + NSString *entryPoint = [@(desc.function_name) copy]; + + NSError *error = NULL; + if (@available(macOS 11.0, *)) { + MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor]; + func_desc.name = entryPoint; + if (desc.constant_values) { + func_desc.constantValues = desc.constant_values; + } + pso[desc.pso_index].function = [device->mtlLibrary[desc.pso_index] + newFunctionWithDescriptor:func_desc + error:&error]; + } + [entryPoint release]; + + if (pso[desc.pso_index].function == nil) { + NSString *err = [error localizedDescription]; + string errors = [err UTF8String]; + + device->set_error( + string_printf("Error getting function \"%s\": %s", desc.function_name, errors.c_str())); + return false; + } + + pso[desc.pso_index].function.label = [@(desc.function_name) copy]; + + __block MTLComputePipelineDescriptor *computePipelineStateDescriptor = + [[MTLComputePipelineDescriptor alloc] init]; + + computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable; + computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable; + computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable; + + computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = desc.threads_per_threadgroup; + computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true; + + computePipelineStateDescriptor.computeFunction = pso[desc.pso_index].function; + if (@available(macOS 11.0, *)) { + /* Attach the additional functions to an MTLLinkedFunctions object */ + if (desc.linked_functions) { + computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init]; + computePipelineStateDescriptor.linkedFunctions.functions = desc.linked_functions; + } + + computePipelineStateDescriptor.maxCallStackDepth = 1; + } + + /* Create a new Compute pipeline state object */ + MTLPipelineOption pipelineOptions = MTLPipelineOptionNone; + + bool creating_new_archive = false; + if (@available(macOS 11.0, *)) { + if (use_binary_archive) { + if (!archive) { + MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init]; + archiveDesc.url = nil; + archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil]; + creating_new_archive = true; + + double starttime = time_dt(); + + if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor + error:&error]) { + NSString *errStr = [error localizedDescription]; + metal_printf("Failed to add PSO to archive:\n%s\n", + errStr ? 
[errStr UTF8String] : "nil"); + } + else { + double duration = time_dt() - starttime; + metal_printf("%2d | %-55s | %7.2fs\n", + desc.kernel_index, + device_kernel_as_string((DeviceKernel)desc.kernel_index), + duration); + + if (desc.pso_index == PSO_GENERIC) { + this->load_duration = duration; + } + } + } + computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil]; + pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss; + } + } + + double starttime = time_dt(); + + MTLNewComputePipelineStateWithReflectionCompletionHandler completionHandler = ^( + id computePipelineState, + MTLComputePipelineReflection *reflection, + NSError *error) { + bool recreate_archive = false; + if (computePipelineState == nil && archive && !creating_new_archive) { + + assert(0); + + NSString *errStr = [error localizedDescription]; + metal_printf( + "Failed to create compute pipeline state \"%s\" from archive - attempting recreation... " + "(error: %s)\n", + device_kernel_as_string((DeviceKernel)desc.kernel_index), + errStr ? [errStr UTF8String] : "nil"); + computePipelineState = [device->mtlDevice + newComputePipelineStateWithDescriptor:computePipelineStateDescriptor + options:MTLPipelineOptionNone + reflection:nullptr + error:&error]; + recreate_archive = true; + } + + double duration = time_dt() - starttime; + + if (computePipelineState == nil) { + NSString *errStr = [error localizedDescription]; + device->set_error(string_printf("Failed to create compute pipeline state \"%s\", error: \n", + device_kernel_as_string((DeviceKernel)desc.kernel_index)) + + (errStr ? [errStr UTF8String] : "nil")); + metal_printf("%2d | %-55s | %7.2fs | FAILED!\n", + desc.kernel_index, + device_kernel_as_string((DeviceKernel)desc.kernel_index), + duration); + return; + } + + pso[desc.pso_index].pipeline = computePipelineState; + num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup, + computePipelineState.threadExecutionWidth); + num_threads_per_block = std::max(num_threads_per_block, + (int)computePipelineState.threadExecutionWidth); + + if (!use_binary_archive) { + metal_printf("%2d | %-55s | %7.2fs\n", + desc.kernel_index, + device_kernel_as_string((DeviceKernel)desc.kernel_index), + duration); + + if (desc.pso_index == PSO_GENERIC) { + this->load_duration = duration; + } + } + + if (@available(macOS 11.0, *)) { + if (creating_new_archive || recreate_archive) { + if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())] + error:&error]) { + metal_printf("Failed to save binary archive, error:\n%s\n", + [[error localizedDescription] UTF8String]); + } + } + } + + [computePipelineStateDescriptor release]; + computePipelineStateDescriptor = nil; + + if (device->use_metalrt && desc.linked_functions) { + for (int table = 0; table < METALRT_TABLE_NUM; table++) { + if (@available(macOS 11.0, *)) { + MTLIntersectionFunctionTableDescriptor *ift_desc = + [[MTLIntersectionFunctionTableDescriptor alloc] init]; + ift_desc.functionCount = desc.intersector_functions[table].count; + + pso[desc.pso_index].intersection_func_table[table] = [pso[desc.pso_index].pipeline + newIntersectionFunctionTableWithDescriptor:ift_desc]; + + /* Finally write the function handles into this pipeline's table */ + for (int i = 0; i < 2; i++) { + id handle = [pso[desc.pso_index].pipeline + functionHandleWithFunction:desc.intersector_functions[table][i]]; + [pso[desc.pso_index].intersection_func_table[table] setFunction:handle atIndex:i]; + } + } + } + } + + mark_loaded(desc.pso_index); 
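+    /* Note: this completion handler serves both the blocking PSO_GENERIC build
+     * and the asynchronous PSO_SPECIALISED build dispatched below. */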
+ }; + + if (desc.pso_index == PSO_SPECIALISED) { + /* Asynchronous load */ + dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{ + NSError *error; + id pipeline = [device->mtlDevice + newComputePipelineStateWithDescriptor:computePipelineStateDescriptor + options:pipelineOptions + reflection:nullptr + error:&error]; + completionHandler(pipeline, nullptr, error); + }); + } + else { + /* Block on load to ensure we continue with a valid kernel function */ + id pipeline = [device->mtlDevice + newComputePipelineStateWithDescriptor:computePipelineStateDescriptor + options:pipelineOptions + reflection:nullptr + error:&error]; + completionHandler(pipeline, nullptr, error); + } + + return true; +} + +const MetalKernelPipeline &MetalDeviceKernel::get_pso() const +{ + if (pso[PSO_SPECIALISED].loaded) { + return pso[PSO_SPECIALISED]; + } + + assert(pso[PSO_GENERIC].loaded); + return pso[PSO_GENERIC]; +} + +bool MetalDeviceKernels::load(MetalDevice *device, int kernel_type) +{ + bool any_error = false; + + MD5Hash md5; + + /* Build the function constant table */ + MTLFunctionConstantValues *constant_values = nullptr; + if (kernel_type == PSO_SPECIALISED) { + constant_values = [MTLFunctionConstantValues new]; + +# define KERNEL_FILM(_type, name) \ + [constant_values setConstantValue:&data.film.name \ + type:get_MTLDataType_##_type() \ + atIndex:KernelData_film_##name]; \ + md5.append((uint8_t *)&data.film.name, sizeof(data.film.name)); + +# define KERNEL_BACKGROUND(_type, name) \ + [constant_values setConstantValue:&data.background.name \ + type:get_MTLDataType_##_type() \ + atIndex:KernelData_background_##name]; \ + md5.append((uint8_t *)&data.background.name, sizeof(data.background.name)); + +# define KERNEL_INTEGRATOR(_type, name) \ + [constant_values setConstantValue:&data.integrator.name \ + type:get_MTLDataType_##_type() \ + atIndex:KernelData_integrator_##name]; \ + md5.append((uint8_t *)&data.integrator.name, sizeof(data.integrator.name)); + +# define KERNEL_BVH(_type, name) \ + [constant_values setConstantValue:&data.bvh.name \ + type:get_MTLDataType_##_type() \ + atIndex:KernelData_bvh_##name]; \ + md5.append((uint8_t *)&data.bvh.name, sizeof(data.bvh.name)); + + /* METAL_WIP: populate constant_values based on KernelData */ + assert(0); + /* + const KernelData &data = device->launch_params.data; + # include "kernel/types/background.h" + # include "kernel/types/bvh.h" + # include "kernel/types/film.h" + # include "kernel/types/integrator.h" + */ + } + + if (device->use_metalrt) { + if (@available(macOS 11.0, *)) { + /* create the id for each intersection function */ + const char *function_names[] = { + "__anyhit__cycles_metalrt_visibility_test_tri", + "__anyhit__cycles_metalrt_visibility_test_box", + "__anyhit__cycles_metalrt_shadow_all_hit_tri", + "__anyhit__cycles_metalrt_shadow_all_hit_box", + "__anyhit__cycles_metalrt_local_hit_tri", + "__anyhit__cycles_metalrt_local_hit_box", + "__intersection__curve_ribbon", + "__intersection__curve_ribbon_shadow", + "__intersection__curve_all", + "__intersection__curve_all_shadow", + }; + assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM); + + MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor]; + if (kernel_type == PSO_SPECIALISED) { + desc.constantValues = constant_values; + } + for (int i = 0; i < METALRT_FUNC_NUM; i++) { + const char *function_name = function_names[i]; + desc.name = [@(function_name) copy]; + + NSError *error = NULL; + 
rt_intersection_funcs[kernel_type][i] = [device->mtlLibrary[kernel_type] + newFunctionWithDescriptor:desc + error:&error]; + + if (rt_intersection_funcs[kernel_type][i] == nil) { + NSString *err = [error localizedDescription]; + string errors = [err UTF8String]; + + device->set_error(string_printf( + "Error getting intersection function \"%s\": %s", function_name, errors.c_str())); + any_error = true; + break; + } + + rt_intersection_funcs[kernel_type][i].label = [@(function_name) copy]; + } + } + } + md5.append(device->source_used_for_compile[kernel_type]); + + string hash = md5.get_hex(); + if (loaded_md5[kernel_type] == hash) { + return true; + } + + if (!any_error) { + NSArray *table_functions[METALRT_TABLE_NUM] = {nil}; + NSArray *function_list = nil; + + if (device->use_metalrt) { + id box_intersect_default = nil; + id box_intersect_shadow = nil; + if (device->kernel_features & KERNEL_FEATURE_HAIR) { + /* Add curve intersection programs. */ + if (device->kernel_features & KERNEL_FEATURE_HAIR_THICK) { + /* Slower programs for thick hair since that also slows down ribbons. + * Ideally this should not be needed. */ + box_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL]; + box_intersect_shadow = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL_SHADOW]; + } + else { + box_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON]; + box_intersect_shadow = + rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON_SHADOW]; + } + } + table_functions[METALRT_TABLE_DEFAULT] = [NSArray + arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_TRI], + box_intersect_default ? + box_intersect_default : + rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX], + nil]; + table_functions[METALRT_TABLE_SHADOW] = [NSArray + arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_TRI], + box_intersect_shadow ? 
+ box_intersect_shadow : + rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX], + nil]; + table_functions[METALRT_TABLE_LOCAL] = [NSArray + arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_TRI], + rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX], + nil]; + + NSMutableSet *unique_functions = [NSMutableSet + setWithArray:table_functions[METALRT_TABLE_DEFAULT]]; + [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]]; + [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]]; + + function_list = [[NSArray arrayWithArray:[unique_functions allObjects]] + sortedArrayUsingComparator:^NSComparisonResult(id f1, id f2) { + return [f1.label compare:f2.label]; + }]; + + unique_functions = nil; + } + + metal_printf("Starting %s \"cycles_metal_...\" pipeline builds\n", + kernel_type_as_string(kernel_type)); + + tbb::task_arena local_arena(max_mtlcompiler_threads); + local_arena.execute([&]() { + tbb::parallel_for(int(0), int(DEVICE_KERNEL_NUM), [&](int i) { + /* skip megakernel */ + if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + return; + } + + /* Only specialise kernels where it can make an impact */ + if (kernel_type == PSO_SPECIALISED) { + if (i < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || + i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + return; + } + } + + MetalDeviceKernel &kernel = kernels_[i]; + + const std::string function_name = std::string("cycles_metal_") + + device_kernel_as_string((DeviceKernel)i); + int threads_per_threadgroup = device->max_threads_per_threadgroup; + if (i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && i < DEVICE_KERNEL_INTEGRATOR_RESET) { + /* Always use 512 for the sorting kernels */ + threads_per_threadgroup = 512; + } + + NSArray *kernel_function_list = nil; + + if (i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || + i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || + i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK || + i == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + kernel_function_list = function_list; + } + + MetalKernelLoadDesc desc; + desc.pso_index = kernel_type; + desc.kernel_index = i; + desc.linked_functions = kernel_function_list; + desc.intersector_functions.defaults = table_functions[METALRT_TABLE_DEFAULT]; + desc.intersector_functions.shadow = table_functions[METALRT_TABLE_SHADOW]; + desc.intersector_functions.local = table_functions[METALRT_TABLE_LOCAL]; + desc.constant_values = constant_values; + desc.threads_per_threadgroup = threads_per_threadgroup; + desc.function_name = function_name.c_str(); + + bool success = kernel.load(device, desc, md5); + + any_error |= !success; + }); + }); + } + + bool loaded = !any_error; + if (loaded) { + loaded_md5[kernel_type] = hash; + } + return loaded; +} + +const MetalDeviceKernel &MetalDeviceKernels::get(DeviceKernel kernel) const +{ + return kernels_[(int)kernel]; +} + +bool MetalDeviceKernels::available(DeviceKernel kernel) const +{ + return kernels_[(int)kernel].get_pso().function != nil; +} + +CCL_NAMESPACE_END + +#endif /* WITH_METAL*/ diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h new file mode 100644 index 00000000000..7aafcb2efe4 --- /dev/null +++ b/intern/cycles/device/metal/queue.h @@ -0,0 +1,97 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_METAL + +# include "device/kernel.h" +# include "device/memory.h" +# include "device/queue.h" + +# include "device/metal/util.h" +# include "kernel/device/metal/globals.h" + +# define metal_printf VLOG(4) << string_printf + +CCL_NAMESPACE_BEGIN + +class MetalDevice; + +/* Base class for Metal queues. */ +class MetalDeviceQueue : public DeviceQueue { + public: + MetalDeviceQueue(MetalDevice *device); + ~MetalDeviceQueue(); + + virtual int num_concurrent_states(const size_t) const override; + virtual int num_concurrent_busy_states() const override; + + virtual void init_execution() override; + + virtual bool enqueue(DeviceKernel kernel, + const int work_size, + DeviceKernelArguments const &args) override; + + virtual bool synchronize() override; + + virtual void zero_to_device(device_memory &mem) override; + virtual void copy_to_device(device_memory &mem) override; + virtual void copy_from_device(device_memory &mem) override; + + virtual bool kernel_available(DeviceKernel kernel) const override; + + protected: + void prepare_resources(DeviceKernel kernel); + + id get_compute_encoder(DeviceKernel kernel); + id get_blit_encoder(); + + MetalDevice *metal_device; + MetalBufferPool temp_buffer_pool; + + API_AVAILABLE(macos(11.0), ios(14.0)) + MTLCommandBufferDescriptor *command_buffer_desc = nullptr; + id mtlDevice = nil; + id mtlCommandQueue = nil; + id mtlCommandBuffer = nil; + id mtlComputeEncoder = nil; + id mtlBlitEncoder = nil; + id shared_event = nil; + MTLSharedEventListener *shared_event_listener = nil; + + dispatch_queue_t event_queue; + dispatch_semaphore_t wait_semaphore; + + struct CopyBack { + void *host_pointer; + void *gpu_mem; + uint64_t size; + }; + std::vector copy_back_mem; + + uint64_t shared_event_id; + uint64_t command_buffers_submitted = 0; + uint64_t command_buffers_completed = 0; + Stats &stats; + + void close_compute_encoder(); + void close_blit_encoder(); +}; + +CCL_NAMESPACE_END + +#endif /* WITH_METAL */ diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm new file mode 100644 index 00000000000..2545fa0f3b0 --- /dev/null +++ b/intern/cycles/device/metal/queue.mm @@ -0,0 +1,602 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_METAL + +# include "device/metal/queue.h" + +# include "device/metal/device_impl.h" +# include "device/metal/kernel.h" + +# include "util/path.h" +# include "util/string.h" +# include "util/time.h" + +CCL_NAMESPACE_BEGIN + +/* MetalDeviceQueue */ + +MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device) + : DeviceQueue(device), metal_device(device), stats(device->stats) +{ + if (@available(macos 11.0, *)) { + command_buffer_desc = [[MTLCommandBufferDescriptor alloc] init]; + command_buffer_desc.errorOptions = MTLCommandBufferErrorOptionEncoderExecutionStatus; + } + + mtlDevice = device->mtlDevice; + mtlCommandQueue = [mtlDevice newCommandQueue]; + + shared_event = [mtlDevice newSharedEvent]; + shared_event_id = 1; + + /* Shareable event listener */ + event_queue = dispatch_queue_create("com.cycles.metal.event_queue", NULL); + shared_event_listener = [[MTLSharedEventListener alloc] initWithDispatchQueue:event_queue]; + + wait_semaphore = dispatch_semaphore_create(0); +} + +MetalDeviceQueue::~MetalDeviceQueue() +{ + /* Tidying up here isn't really practical - we should expect and require the work + * queue to be empty here. */ + assert(mtlCommandBuffer == nil); + assert(command_buffers_submitted == command_buffers_completed); + + [shared_event_listener release]; + [shared_event release]; + + if (@available(macos 11.0, *)) { + [command_buffer_desc release]; + } + if (mtlCommandQueue) { + [mtlCommandQueue release]; + mtlCommandQueue = nil; + } +} + +int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const +{ + /* METAL_WIP */ + /* TODO: compute automatically. */ + /* TODO: must have at least num_threads_per_block. */ + int result = 1048576; + if (metal_device->device_vendor == METAL_GPU_AMD) { + result *= 2; + } + else if (metal_device->device_vendor == METAL_GPU_APPLE) { + result *= 4; + } + return result; +} + +int MetalDeviceQueue::num_concurrent_busy_states() const +{ + /* METAL_WIP */ + /* TODO: compute automatically. */ + int result = 65536; + if (metal_device->device_vendor == METAL_GPU_AMD) { + result *= 2; + } + else if (metal_device->device_vendor == METAL_GPU_APPLE) { + result *= 4; + } + return result; +} + +void MetalDeviceQueue::init_execution() +{ + /* Synchronize all textures and memory copies before executing task. */ + metal_device->load_texture_info(); + + synchronize(); +} + +bool MetalDeviceQueue::enqueue(DeviceKernel kernel, + const int work_size, + DeviceKernelArguments const &args) +{ + if (metal_device->have_error()) { + return false; + } + + VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size " + << work_size; + + const MetalDeviceKernel &metal_kernel = metal_device->kernels.get(kernel); + const MetalKernelPipeline &metal_kernel_pso = metal_kernel.get_pso(); + + id mtlComputeCommandEncoder = get_compute_encoder(kernel); + + /* Determine size requirement for argument buffer. */ + size_t arg_buffer_length = 0; + for (size_t i = 0; i < args.count; i++) { + size_t size_in_bytes = args.sizes[i]; + arg_buffer_length = round_up(arg_buffer_length, size_in_bytes) + size_in_bytes; + } + /* 256 is the Metal offset alignment for constant address space bindings */ + arg_buffer_length = round_up(arg_buffer_length, 256); + + /* Globals placed after "vanilla" arguments. 
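+   * Final layout: [kernel arguments | KernelParamsMetal globals | ancillary
+   * argument buffer], with each section aligned to the 256-byte constant
+   * binding boundary.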
*/ + size_t globals_offsets = arg_buffer_length; + arg_buffer_length += sizeof(KernelParamsMetal); + arg_buffer_length = round_up(arg_buffer_length, 256); + + /* Metal ancilliary bindless pointers */ + size_t metal_offsets = arg_buffer_length; + arg_buffer_length += metal_device->mtlAncillaryArgEncoder.encodedLength; + arg_buffer_length = round_up(arg_buffer_length, metal_device->mtlAncillaryArgEncoder.alignment); + + /* Temporary buffer used to prepare arg_buffer */ + uint8_t *init_arg_buffer = (uint8_t *)alloca(arg_buffer_length); + memset(init_arg_buffer, 0, arg_buffer_length); + + /* Prepare the non-pointer "enqueue" arguments */ + size_t bytes_written = 0; + for (size_t i = 0; i < args.count; i++) { + size_t size_in_bytes = args.sizes[i]; + bytes_written = round_up(bytes_written, size_in_bytes); + if (args.types[i] != DeviceKernelArguments::POINTER) { + memcpy(init_arg_buffer + bytes_written, args.values[i], size_in_bytes); + } + bytes_written += size_in_bytes; + } + + /* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */ + /* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */ + size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, __integrator_state) + + sizeof(IntegratorStateGPU); + size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset; + memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset, + (uint8_t *)&metal_device->launch_params + plain_old_launch_data_offset, + plain_old_launch_data_size); + + /* Allocate an argument buffer. */ + MTLResourceOptions arg_buffer_options = MTLResourceStorageModeManaged; + if (@available(macOS 11.0, *)) { + if ([mtlDevice hasUnifiedMemory]) { + arg_buffer_options = MTLResourceStorageModeShared; + } + } + + id arg_buffer = temp_buffer_pool.get_buffer( + mtlDevice, mtlCommandBuffer, arg_buffer_length, arg_buffer_options, init_arg_buffer, stats); + + /* Encode the pointer "enqueue" arguments */ + bytes_written = 0; + for (size_t i = 0; i < args.count; i++) { + size_t size_in_bytes = args.sizes[i]; + bytes_written = round_up(bytes_written, size_in_bytes); + if (args.types[i] == DeviceKernelArguments::POINTER) { + [metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer + offset:bytes_written]; + if (MetalDevice::MetalMem *mmem = *(MetalDevice::MetalMem **)args.values[i]) { + [mtlComputeCommandEncoder useResource:mmem->mtlBuffer + usage:MTLResourceUsageRead | MTLResourceUsageWrite]; + [metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0]; + } + else { + if (@available(macos 12.0, *)) { + [metal_device->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:0]; + } + } + } + bytes_written += size_in_bytes; + } + + /* Encode KernelParamsMetal buffers */ + [metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer offset:globals_offsets]; + + /* this relies on IntegratorStateGPU layout being contiguous device_ptrs */ + const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) + + sizeof(IntegratorStateGPU); + for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) { + int pointer_index = offset / sizeof(device_ptr); + MetalDevice::MetalMem *mmem = *( + MetalDevice::MetalMem **)((uint8_t *)&metal_device->launch_params + offset); + if (mmem && (mmem->mtlBuffer || mmem->mtlTexture)) { + [metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer + offset:0 + atIndex:pointer_index]; + } + else { + if 
(@available(macos 12.0, *)) { + [metal_device->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:pointer_index]; + } + } + } + bytes_written = globals_offsets + sizeof(KernelParamsMetal); + + /* Encode ancillaries */ + [metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets]; + [metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d + offset:0 + atIndex:0]; + [metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_3d + offset:0 + atIndex:1]; + if (@available(macos 12.0, *)) { + if (metal_device->use_metalrt) { + if (metal_device->bvhMetalRT) { + id accel_struct = metal_device->bvhMetalRT->accel_struct; + [metal_device->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2]; + } + + for (int table = 0; table < METALRT_TABLE_NUM; table++) { + if (metal_kernel_pso.intersection_func_table[table]) { + [metal_kernel_pso.intersection_func_table[table] setBuffer:arg_buffer + offset:globals_offsets + atIndex:1]; + [metal_device->mtlAncillaryArgEncoder + setIntersectionFunctionTable:metal_kernel_pso.intersection_func_table[table] + atIndex:3 + table]; + [mtlComputeCommandEncoder useResource:metal_kernel_pso.intersection_func_table[table] + usage:MTLResourceUsageRead]; + } + else { + [metal_device->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil + atIndex:3 + table]; + } + } + } + bytes_written = metal_offsets + metal_device->mtlAncillaryArgEncoder.encodedLength; + } + + if (arg_buffer.storageMode == MTLStorageModeManaged) { + [arg_buffer didModifyRange:NSMakeRange(0, bytes_written)]; + } + + [mtlComputeCommandEncoder setBuffer:arg_buffer offset:0 atIndex:0]; + [mtlComputeCommandEncoder setBuffer:arg_buffer offset:globals_offsets atIndex:1]; + [mtlComputeCommandEncoder setBuffer:arg_buffer offset:metal_offsets atIndex:2]; + + if (metal_device->use_metalrt) { + if (@available(macos 12.0, *)) { + + auto bvhMetalRT = metal_device->bvhMetalRT; + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + break; + default: + bvhMetalRT = nil; + break; + } + + if (bvhMetalRT) { + /* Mark all Accelerations resources as used */ + [mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead]; + [mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data() + count:bvhMetalRT->blas_array.size() + usage:MTLResourceUsageRead]; + } + } + } + + [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline]; + + /* Compute kernel launch parameters. */ + const int num_threads_per_block = metal_kernel.get_num_threads_per_block(); + + int shared_mem_bytes = 0; + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY: + /* See parallel_active_index.h for why this amount of shared memory is needed. 
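+         * (one int per thread plus one extra int for the aggregate count).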
+       * Rounded up to 16 bytes for Metal. */
+      shared_mem_bytes = round_up((num_threads_per_block + 1) * sizeof(int), 16);
+      [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
+      break;
+
+    default:
+      break;
+  }
+
+  MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
+      divide_up(work_size, num_threads_per_block), 1, 1);
+  MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
+  [mtlComputeCommandEncoder dispatchThreadgroups:size_threadgroups_per_dispatch
+                           threadsPerThreadgroup:size_threads_per_threadgroup];
+
+  [mtlCommandBuffer addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
+    NSString *kernel_name = metal_kernel_pso.function.label;
+
+    /* Enhanced command buffer errors are only available in 11.0+. */
+    if (@available(macOS 11.0, *)) {
+      if (command_buffer.status == MTLCommandBufferStatusError && command_buffer.error != nil) {
+        printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]);
+        NSArray<id<MTLCommandBufferEncoderInfo>> *encoderInfos = [command_buffer.error.userInfo
+            valueForKey:MTLCommandBufferEncoderInfoErrorKey];
+        if (encoderInfos != nil) {
+          for (id<MTLCommandBufferEncoderInfo> encoderInfo in encoderInfos) {
+            NSLog(@"%@", encoderInfo);
+          }
+        }
+        id<MTLLogContainer> logs = command_buffer.logs;
+        for (id<MTLFunctionLog> log in logs) {
+          NSLog(@"%@", log);
+        }
+      }
+      else if (command_buffer.error) {
+        printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]);
+      }
+    }
+  }];
+
+  return !(metal_device->have_error());
+}
+
+bool MetalDeviceQueue::synchronize()
+{
+  if (metal_device->have_error()) {
+    return false;
+  }
+
+  if (mtlComputeEncoder) {
+    close_compute_encoder();
+  }
+  close_blit_encoder();
+
+  if (mtlCommandBuffer) {
+    uint64_t shared_event_id = this->shared_event_id++;
+
+    __block dispatch_semaphore_t block_sema = wait_semaphore;
+    [shared_event notifyListener:shared_event_listener
+                         atValue:shared_event_id
+                           block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) {
+                             dispatch_semaphore_signal(block_sema);
+                           }];
+
+    [mtlCommandBuffer encodeSignalEvent:shared_event value:shared_event_id];
+    [mtlCommandBuffer commit];
+    dispatch_semaphore_wait(wait_semaphore, DISPATCH_TIME_FOREVER);
+
+    [mtlCommandBuffer release];
+
+    for (const CopyBack &mmem : copy_back_mem) {
+      memcpy((uchar *)mmem.host_pointer, (uchar *)mmem.gpu_mem, mmem.size);
+    }
+    copy_back_mem.clear();
+
+    temp_buffer_pool.process_command_buffer_completion(mtlCommandBuffer);
+    metal_device->flush_delayed_free_list();
+
+    mtlCommandBuffer = nil;
+  }
+
+  return !(metal_device->have_error());
+}
+
+void MetalDeviceQueue::zero_to_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  /* Allocate on demand. */
+  if (mem.device_pointer == 0) {
+    metal_device->mem_alloc(mem);
+  }
+
+  /* Zero memory on device. */
+  assert(mem.device_pointer != 0);
+
+  std::lock_guard lock(metal_device->metal_mem_map_mutex);
+  MetalDevice::MetalMem &mmem = *metal_device->metal_mem_map.at(&mem);
+  if (mmem.mtlBuffer) {
+    id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
+    [blitEncoder fillBuffer:mmem.mtlBuffer range:NSMakeRange(mmem.offset, mmem.size) value:0];
+  }
+  else {
+    metal_device->mem_zero(mem);
+  }
+}
+void MetalDeviceQueue::copy_to_device(device_memory &mem)
+{
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  /* Allocate on demand. */
+  if (mem.device_pointer == 0) {
+    metal_device->mem_alloc(mem);
+  }
+
+  assert(mem.device_pointer != 0);
+  assert(mem.host_pointer != nullptr);
+
+  std::lock_guard lock(metal_device->metal_mem_map_mutex);
+  auto result = metal_device->metal_mem_map.find(&mem);
+  if (result != metal_device->metal_mem_map.end()) {
+    if (mem.host_pointer == mem.shared_pointer) {
+      return;
+    }
+
+    MetalDevice::MetalMem &mmem = *result->second;
+    id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
+
+    id<MTLBuffer> buffer = temp_buffer_pool.get_buffer(mtlDevice,
+                                                       mtlCommandBuffer,
+                                                       mmem.size,
+                                                       MTLResourceStorageModeShared,
+                                                       mem.host_pointer,
+                                                       stats);
+
+    [blitEncoder copyFromBuffer:buffer
+                   sourceOffset:0
+                       toBuffer:mmem.mtlBuffer
+              destinationOffset:mmem.offset
+                           size:mmem.size];
+  }
+  else {
+    metal_device->mem_copy_to(mem);
+  }
+}
+
+void MetalDeviceQueue::copy_from_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  assert(mem.device_pointer != 0);
+  assert(mem.host_pointer != nullptr);
+
+  std::lock_guard lock(metal_device->metal_mem_map_mutex);
+  MetalDevice::MetalMem &mmem = *metal_device->metal_mem_map.at(&mem);
+  if (mmem.mtlBuffer) {
+    const size_t size = mem.memory_size();
+
+    if (mem.device_pointer) {
+      if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
+        id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
+        [blitEncoder synchronizeResource:mmem.mtlBuffer];
+      }
+      if (mem.host_pointer != mmem.hostPtr) {
+        if (mtlCommandBuffer) {
+          copy_back_mem.push_back({mem.host_pointer, mmem.hostPtr, size});
+        }
+        else {
+          memcpy((uchar *)mem.host_pointer, (uchar *)mmem.hostPtr, size);
+        }
+      }
+    }
+    else {
+      memset((char *)mem.host_pointer, 0, size);
+    }
+  }
+  else {
+    metal_device->mem_copy_from(mem);
+  }
+}
+
+bool MetalDeviceQueue::kernel_available(DeviceKernel kernel) const
+{
+  return metal_device->kernels.available(kernel);
+}
+
+void MetalDeviceQueue::prepare_resources(DeviceKernel kernel)
+{
+  std::lock_guard lock(metal_device->metal_mem_map_mutex);
+
+  /* Declare resource usage. */
+  for (auto &it : metal_device->metal_mem_map) {
+    device_memory *mem = it.first;
+
+    MTLResourceUsage usage = MTLResourceUsageRead;
+    if (mem->type != MEM_GLOBAL && mem->type != MEM_READ_ONLY && mem->type != MEM_TEXTURE) {
+      usage |= MTLResourceUsageWrite;
+    }
+
+    if (it.second->mtlBuffer) {
+      /* METAL_WIP - use array version (i.e. useResources) */
+      [mtlComputeEncoder useResource:it.second->mtlBuffer usage:usage];
+    }
+    else if (it.second->mtlTexture) {
+      /* METAL_WIP - use array version (i.e. useResources) */
+      [mtlComputeEncoder useResource:it.second->mtlTexture usage:usage | MTLResourceUsageSample];
+    }
+  }
+
+  /* Ancillaries. */
+  [mtlComputeEncoder useResource:metal_device->texture_bindings_2d usage:MTLResourceUsageRead];
+  [mtlComputeEncoder useResource:metal_device->texture_bindings_3d usage:MTLResourceUsageRead];
+}
+
+id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)
+{
+  bool concurrent = (kernel < DEVICE_KERNEL_INTEGRATOR_NUM);
+
+  if (mtlComputeEncoder) {
+    if (mtlComputeEncoder.dispatchType ==
+        (concurrent ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial)) {
+      /* Declare usage of MTLBuffers etc. */
+      prepare_resources(kernel);
+
+      return mtlComputeEncoder;
+    }
+    close_compute_encoder();
+  }
+
+  close_blit_encoder();
+
+  if (!mtlCommandBuffer) {
+    mtlCommandBuffer = [mtlCommandQueue commandBuffer];
+    [mtlCommandBuffer retain];
+  }
+
+  mtlComputeEncoder = [mtlCommandBuffer
+      computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
+                                                         MTLDispatchTypeSerial];
+
+  /* Declare usage of MTLBuffers etc. */
+  prepare_resources(kernel);
+
+  return mtlComputeEncoder;
+}
+
+id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
+{
+  if (mtlBlitEncoder) {
+    return mtlBlitEncoder;
+  }
+
+  if (mtlComputeEncoder) {
+    close_compute_encoder();
+  }
+
+  if (!mtlCommandBuffer) {
+    mtlCommandBuffer = [mtlCommandQueue commandBuffer];
+    [mtlCommandBuffer retain];
+  }
+
+  mtlBlitEncoder = [mtlCommandBuffer blitCommandEncoder];
+  return mtlBlitEncoder;
+}
+
+void MetalDeviceQueue::close_compute_encoder()
+{
+  [mtlComputeEncoder endEncoding];
+  mtlComputeEncoder = nil;
+}
+
+void MetalDeviceQueue::close_blit_encoder()
+{
+  if (mtlBlitEncoder) {
+    [mtlBlitEncoder endEncoding];
+    mtlBlitEncoder = nil;
+  }
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
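
For orientation, a rough sketch of how a caller might drive the queue above through the generic DeviceQueue interface. This is illustrative only: the helper name, the `d_paths` vector and the exact kernel argument list are assumptions, not code from Cycles' integrator.

    /* Hypothetical caller: upload, dispatch one integrator kernel, wait.
     * `queue` would be a MetalDeviceQueue behind the DeviceQueue interface. */
    static void sketch_dispatch(DeviceQueue *queue, device_vector<int> &d_paths, int num_paths)
    {
      queue->init_execution();
      queue->copy_to_device(d_paths); /* Allocates on demand, stages via the temp buffer pool. */

      device_ptr paths_ptr = d_paths.device_pointer;
      DeviceKernelArguments args(&paths_ptr, &num_paths); /* Pointer and plain-data args. */
      queue->enqueue(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, num_paths, args);

      /* Commits the command buffer and blocks on the MTLSharedEvent listener. */
      queue->synchronize();
      queue->copy_from_device(d_paths);
    }
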
diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h
new file mode 100644
index 00000000000..dbeb3a5d064
--- /dev/null
+++ b/intern/cycles/device/metal/util.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_METAL
+
+# include <Metal/Metal.h>
+# include <string>
+
+# include "device/metal/device.h"
+# include "device/metal/kernel.h"
+# include "device/queue.h"
+
+# include "util/thread.h"
+
+CCL_NAMESPACE_BEGIN
+
+enum MetalGPUVendor {
+  METAL_GPU_UNKNOWN = 0,
+  METAL_GPU_APPLE = 1,
+  METAL_GPU_AMD = 2,
+  METAL_GPU_INTEL = 3,
+};
+
+/* Retains a named MTLDevice for device enumeration. */
+struct MetalPlatformDevice {
+  MetalPlatformDevice(id<MTLDevice> device, const string &device_name)
+      : device_id(device), device_name(device_name)
+  {
+    [device_id retain];
+  }
+  ~MetalPlatformDevice()
+  {
+    [device_id release];
+  }
+  id<MTLDevice> device_id;
+  string device_name;
+};
+
+/* Contains static Metal helper functions. */
+struct MetalInfo {
+  static bool device_version_check(id<MTLDevice> device);
+  static void get_usable_devices(vector<MetalPlatformDevice> *usable_devices);
+  static MetalGPUVendor get_vendor_from_device_name(string const &device_name);
+
+  /* Platform information. */
+  static bool get_num_devices(uint32_t *num_devices);
+  static uint32_t get_num_devices();
+
+  static bool get_device_name(id<MTLDevice> device_id, string *device_name);
+  static string get_device_name(id<MTLDevice> device_id);
+};
+
+/* Pool of MTLBuffers whose lifetime is linked to a single MTLCommandBuffer. */
+class MetalBufferPool {
+  struct MetalBufferListEntry {
+    MetalBufferListEntry(id<MTLBuffer> buffer, id<MTLCommandBuffer> command_buffer)
+        : buffer(buffer), command_buffer(command_buffer)
+    {
+    }
+
+    MetalBufferListEntry() = delete;
+
+    id<MTLBuffer> buffer;
+    id<MTLCommandBuffer> command_buffer;
+  };
+  std::vector<MetalBufferListEntry> buffer_free_list;
+  std::vector<MetalBufferListEntry> buffer_in_use_list;
+  thread_mutex buffer_mutex;
+  size_t total_temp_mem_size = 0;
+
+ public:
+  MetalBufferPool() = default;
+  ~MetalBufferPool();
+
+  id<MTLBuffer> get_buffer(id<MTLDevice> device,
+                           id<MTLCommandBuffer> command_buffer,
+                           NSUInteger length,
+                           MTLResourceOptions options,
+                           const void *pointer,
+                           Stats &stats);
+  void process_command_buffer_completion(id<MTLCommandBuffer> command_buffer);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
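
A sketch of the MetalBufferPool lifecycle declared above: get_buffer() either recycles a free buffer with matching length and storage mode or allocates a fresh one (copying `pointer` data in either way), and process_command_buffer_completion() moves buffers back onto the free list once the GPU is done with them. In this patch the queue calls the latter from synchronize(); the completed-handler variant below is an equivalent illustrative pattern, and every name other than the MetalBufferPool API is an assumption.

    /* Hypothetical staging upload built on the MetalBufferPool API above. */
    static void sketch_stage_upload(MetalBufferPool &pool,
                                    id<MTLDevice> device,
                                    id<MTLCommandBuffer> command_buffer,
                                    const void *host_data,
                                    size_t size,
                                    Stats &stats)
    {
      /* Reuses or allocates a buffer; `host_data` is memcpy'd into it. */
      id<MTLBuffer> staging = pool.get_buffer(
          device, command_buffer, size, MTLResourceStorageModeShared, host_data, stats);

      /* ... encode blit/compute work that reads `staging` ... */

      [command_buffer addCompletedHandler:^(id<MTLCommandBuffer> cb) {
        /* Buffers tied to `cb` move back to the free list for reuse. */
        pool.process_command_buffer_completion(cb);
      }];
    }
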
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
new file mode 100644
index 00000000000..d2143c389dc
--- /dev/null
+++ b/intern/cycles/device/metal/util.mm
@@ -0,0 +1,241 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+# include "device/metal/util.h"
+# include "device/metal/device_impl.h"
+# include "util/md5.h"
+# include "util/path.h"
+# include "util/string.h"
+# include "util/time.h"
+
+# include
+# include
+# include
+
+CCL_NAMESPACE_BEGIN
+
+MetalGPUVendor MetalInfo::get_vendor_from_device_name(string const &device_name)
+{
+  if (device_name.find("Intel") != string::npos) {
+    return METAL_GPU_INTEL;
+  }
+  else if (device_name.find("AMD") != string::npos) {
+    return METAL_GPU_AMD;
+  }
+  else if (device_name.find("Apple") != string::npos) {
+    return METAL_GPU_APPLE;
+  }
+  return METAL_GPU_UNKNOWN;
+}
+
+bool MetalInfo::device_version_check(id<MTLDevice> device)
+{
+  if (@available(macOS 12.0, *)) {
+    MetalGPUVendor vendor = get_vendor_from_device_name([[device name] UTF8String]);
+
+    static const char *forceIntelStr = getenv("CYCLES_METAL_FORCE_INTEL");
+    bool forceIntel = forceIntelStr ? (atoi(forceIntelStr) != 0) : false;
+
+    if (forceIntel) {
+      /* Return false for non-Intel GPUs to force selection of Intel. */
+      if (vendor == METAL_GPU_INTEL) {
+        return true;
+      }
+    }
+    else {
+      switch (vendor) {
+        case METAL_GPU_INTEL:
+          /* isLowPower only returns true on machines that also have an AMD GPU;
+           * on Intel-only machines it returns false. */
+          if (getenv("CYCLES_METAL_ALLOW_LOW_POWER_GPUS") || !device.isLowPower) {
+            return true;
+          }
+          return false;
+        case METAL_GPU_APPLE:
+        case METAL_GPU_AMD:
+          return true;
+        default:
+          return false;
+      }
+    }
+  }
+
+  return false;
+}
+
+void MetalInfo::get_usable_devices(vector<MetalPlatformDevice> *usable_devices)
+{
+  static bool first_time = true;
+# define FIRST_VLOG(severity) \
+   if (first_time) \
+   VLOG(severity)
+
+  usable_devices->clear();
+
+  NSArray<id<MTLDevice>> *allDevices = MTLCopyAllDevices();
+  for (id<MTLDevice> device in allDevices) {
+    string device_name;
+    if (!get_device_name(device, &device_name)) {
+      FIRST_VLOG(2) << "Failed to get device name, ignoring.";
+      continue;
+    }
+
+    static const char *forceIntelStr = getenv("CYCLES_METAL_FORCE_INTEL");
+    bool forceIntel = forceIntelStr ? (atoi(forceIntelStr) != 0) : false;
+    if (forceIntel && device_name.find("Intel") == string::npos) {
+      FIRST_VLOG(2) << "CYCLES_METAL_FORCE_INTEL causing non-Intel device " << device_name
+                    << " to be ignored.";
+      continue;
+    }
+
+    if (!device_version_check(device)) {
+      FIRST_VLOG(2) << "Ignoring device " << device_name << " due to too old compiler version.";
+      continue;
+    }
+    FIRST_VLOG(2) << "Adding new device " << device_name << ".";
+    string hardware_id;
+    usable_devices->push_back(MetalPlatformDevice(device, device_name));
+  }
+  first_time = false;
+}
+
+bool MetalInfo::get_num_devices(uint32_t *num_devices)
+{
+  *num_devices = MTLCopyAllDevices().count;
+  return true;
+}
+
+uint32_t MetalInfo::get_num_devices()
+{
+  uint32_t num_devices;
+  if (!get_num_devices(&num_devices)) {
+    return 0;
+  }
+  return num_devices;
+}
+
+bool MetalInfo::get_device_name(id<MTLDevice> device, string *platform_name)
+{
+  *platform_name = [device.name UTF8String];
+  return true;
+}
+
+string MetalInfo::get_device_name(id<MTLDevice> device)
+{
+  string platform_name;
+  if (!get_device_name(device, &platform_name)) {
+    return "";
+  }
+  return platform_name;
+}
+
+id<MTLBuffer> MetalBufferPool::get_buffer(id<MTLDevice> device,
+                                          id<MTLCommandBuffer> command_buffer,
+                                          NSUInteger length,
+                                          MTLResourceOptions options,
+                                          const void *pointer,
+                                          Stats &stats)
+{
+  id<MTLBuffer> buffer;
+
+  MTLStorageMode storageMode = MTLStorageMode((options & MTLResourceStorageModeMask) >>
+                                              MTLResourceStorageModeShift);
+  MTLCPUCacheMode cpuCacheMode = MTLCPUCacheMode((options & MTLResourceCPUCacheModeMask) >>
+                                                 MTLResourceCPUCacheModeShift);
+
+  buffer_mutex.lock();
+  for (auto entry = buffer_free_list.begin(); entry != buffer_free_list.end(); entry++) {
+    MetalBufferListEntry bufferEntry = *entry;
+
+    /* Check if the buffer matches length, storage mode and cache mode. */
+    if (bufferEntry.buffer.length == length && storageMode == bufferEntry.buffer.storageMode &&
+        cpuCacheMode == bufferEntry.buffer.cpuCacheMode) {
+      buffer = bufferEntry.buffer;
+      buffer_free_list.erase(entry);
+      bufferEntry.command_buffer = command_buffer;
+      buffer_in_use_list.push_back(bufferEntry);
+      buffer_mutex.unlock();
+
+      /* Copy over data. */
+      if (pointer) {
+        memcpy(buffer.contents, pointer, length);
+        if (bufferEntry.buffer.storageMode == MTLStorageModeManaged) {
+          [buffer didModifyRange:NSMakeRange(0, length)];
+        }
+      }
+
+      return buffer;
+    }
+  }
+  // NSLog(@"Creating buffer of length %lu (%lu)", length, frameCount);
+  if (pointer) {
+    buffer = [device newBufferWithBytes:pointer length:length options:options];
+  }
+  else {
+    buffer = [device newBufferWithLength:length options:options];
+  }
+
+  MetalBufferListEntry buffer_entry(buffer, command_buffer);
+
+  stats.mem_alloc(buffer.allocatedSize);
+
+  total_temp_mem_size += buffer.allocatedSize;
+  buffer_in_use_list.push_back(buffer_entry);
+  buffer_mutex.unlock();
+
+  return buffer;
+}
+
+void MetalBufferPool::process_command_buffer_completion(id<MTLCommandBuffer> command_buffer)
+{
+  assert(command_buffer);
+  thread_scoped_lock lock(buffer_mutex);
+  /* Return all buffers that were tied to this command buffer to the free pool. */
+  for (auto entry = buffer_in_use_list.begin(); entry != buffer_in_use_list.end();) {
+    MetalBufferListEntry buffer_entry = *entry;
+    if (buffer_entry.command_buffer == command_buffer) {
+      entry = buffer_in_use_list.erase(entry);
+      buffer_entry.command_buffer = nil;
+      buffer_free_list.push_back(buffer_entry);
+    }
+    else {
+      entry++;
+    }
+  }
+}
+
+MetalBufferPool::~MetalBufferPool()
+{
+  thread_scoped_lock lock(buffer_mutex);
+  /* Release all buffers that have not been recently reused. */
+  for (auto entry = buffer_free_list.begin(); entry != buffer_free_list.end();) {
+    MetalBufferListEntry buffer_entry = *entry;
+
+    id<MTLBuffer> buffer = buffer_entry.buffer;
+    // NSLog(@"Releasing buffer of length %lu (%lu) (%lu outstanding)", buffer.length, frameCount,
+    //       bufferFreeList.size());
+    total_temp_mem_size -= buffer.allocatedSize;
+    [buffer release];
+    entry = buffer_free_list.erase(entry);
+  }
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
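
The MetalInfo helpers above feed Cycles' device enumeration (device.mm). A minimal hypothetical probe, assuming it runs inside the ccl namespace with the util headers available; the function name is illustrative.

    /* List Metal devices that pass the vendor/version checks. */
    static void sketch_list_metal_devices()
    {
      vector<MetalPlatformDevice> usable;
      MetalInfo::get_usable_devices(&usable);
      printf("Metal devices present: %u, usable: %zu\n",
             MetalInfo::get_num_devices(),
             usable.size());
      for (const MetalPlatformDevice &device : usable) {
        printf("  %s\n", device.device_name.c_str());
      }
    }
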
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
index 2513df63489..baab84f4035 100644
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -182,6 +182,9 @@ class MultiDevice : public Device {
       else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
         params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
                                                                     BVH_LAYOUT_EMBREE;
+      else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE)
+        params.bvh_layout = sub.device->info.type == DEVICE_METAL ? BVH_LAYOUT_METAL :
+                                                                    BVH_LAYOUT_EMBREE;
 
       /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
        * (since they are put into the top level directly, see bvh_embree.cpp) */
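
The hunk above mirrors the existing OptiX branch: in a hybrid CPU+GPU multi-device, each sub-device gets the strongest BVH layout it supports, so Metal sub-devices traverse a MetalRT BVH while CPU sub-devices fall back to Embree. Restated as a standalone sketch (the helper itself is illustrative; the enum values are from Cycles):

    static BVHLayout sketch_resolve_layout(BVHLayout multi_layout, DeviceType sub_device_type)
    {
      if (multi_layout == BVH_LAYOUT_MULTI_METAL_EMBREE) {
        /* Metal GPU sub-devices use MetalRT; CPU sub-devices use Embree. */
        return (sub_device_type == DEVICE_METAL) ? BVH_LAYOUT_METAL : BVH_LAYOUT_EMBREE;
      }
      if (multi_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) {
        return (sub_device_type == DEVICE_OPTIX) ? BVH_LAYOUT_OPTIX : BVH_LAYOUT_EMBREE;
      }
      return multi_layout;
    }
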
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 0f88063e3b7..b50f492e8c7 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -19,7 +19,6 @@
 #include "kernel/device/gpu/parallel_active_index.h"
 #include "kernel/device/gpu/parallel_prefix_sum.h"
 #include "kernel/device/gpu/parallel_sorted_index.h"
-#include "kernel/device/gpu/work_stealing.h"
 
 #include "kernel/sample/lcg.h"
 
@@ -30,6 +29,8 @@
 # include "kernel/device/metal/context_begin.h"
 #endif
 
+#include "kernel/device/gpu/work_stealing.h"
+
 #include "kernel/integrator/state.h"
 #include "kernel/integrator/state_flow.h"
 #include "kernel/integrator/state_util.h"
@@ -96,7 +97,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
     const int state = tile->path_index_offset + tile_work_index;
 
     uint x, y, sample;
-    get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+    ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));
 
     ccl_gpu_kernel_call(
         integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample));
@@ -127,7 +128,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
     const int state = tile->path_index_offset + tile_work_index;
 
     uint x, y, sample;
-    get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+    ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));
 
     ccl_gpu_kernel_call(
         integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample));
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index a80965ba267..a51afc37fc0 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -117,7 +117,7 @@ struct kernel_gpu_##name \
                    uint simd_group_index, \
                    uint num_simd_groups) ccl_global const; \
 }; \
-kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
+kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
                 constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
                 constant MetalAncillaries *_metal_ancillaries, \
                 threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal
index ba80238bb84..27dc1f44c6f 100644
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -126,7 +126,7 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
 
 [[intersection(triangle, triangle_data, METALRT_TAGS)]]
 TriangleIntersectionResult
-__anyhit__kernel_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+__anyhit__cycles_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
                                        ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
                                        uint instance_id [[user_instance_id]],
                                        uint primitive_id [[primitive_id]],
@@ -139,7 +139,7 @@ __anyhit__kernel_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params
 
 [[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
 BoundingBoxIntersectionResult
-__anyhit__kernel_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
+__anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
 {
   /* unused function */
   BoundingBoxIntersectionResult result;
@@ -274,7 +274,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
 
 [[intersection(triangle, triangle_data, METALRT_TAGS)]]
 TriangleIntersectionResult
-__anyhit__kernel_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+__anyhit__cycles_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
                                             ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
                                             unsigned int object [[user_instance_id]],
                                             unsigned int primitive_id [[primitive_id]],
@@ -292,7 +292,7 @@ __anyhit__kernel_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_p
 
 [[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
 BoundingBoxIntersectionResult
-__anyhit__kernel_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]])
+__anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]])
 {
   /* unused function */
   BoundingBoxIntersectionResult result;
@@ -345,7 +345,7 @@ inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_pa
 
 [[intersection(triangle, triangle_data, METALRT_TAGS)]]
 TriangleIntersectionResult
-__anyhit__kernel_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+__anyhit__cycles_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
                                              ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
                                              unsigned int object [[user_instance_id]],
                                              unsigned int primitive_id [[primitive_id]])
@@ -362,7 +362,7 @@ __anyhit__kernel_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_
 
 [[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
 BoundingBoxIntersectionResult
-__anyhit__kernel_metalrt_visibility_test_box(const float ray_tmax [[max_distance]])
+__anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance]])
 {
   /* Unused function */
   BoundingBoxIntersectionResult result;
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index 6cfeb1aa917..18b60b70a4b 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -719,6 +719,20 @@ ccl_device_inline float pow22(float a)
   return sqr(a * sqr(sqr(sqr(a)) * a));
 }
 
+#ifdef __KERNEL_METAL__
+ccl_device_inline float lgammaf(float x)
+{
+  /* Nemes, Gergő (2010), "New asymptotic expansion for the Gamma function",
+   * Archiv der Mathematik. */
+  const float _1_180 = 1.0f / 180.0f;
+  const float log2pi = 1.83787706641f;
+  const float logx = log(x);
+  return (log2pi - logx +
+          x * (logx * 2.0f + log(x * sinh(1.0f / x) + (_1_180 / pow(x, 6.0f))) - 2.0f)) *
+         0.5f;
+}
+#endif
+
 ccl_device_inline float beta(float x, float y)
 {
   return expf(lgammaf(x) + lgammaf(y) - lgammaf(x + y));
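
Metal's standard library has no lgammaf, hence the Nemes-expansion implementation above (guarded by __KERNEL_METAL__ so other backends keep using libm). A standalone host-side sanity check of the same expression against std::lgamma; the choice of sample points is informal, and accuracy degrades as x approaches the pole at 0.

    #include <cmath>
    #include <cstdio>

    /* Mirror of the kernel implementation above, using libm calls. */
    static float lgammaf_nemes(float x)
    {
      const float _1_180 = 1.0f / 180.0f;
      const float log2pi = 1.83787706641f; /* log(2*pi) */
      const float logx = std::log(x);
      return (log2pi - logx +
              x * (logx * 2.0f + std::log(x * std::sinh(1.0f / x) + (_1_180 / std::pow(x, 6.0f))) -
                   2.0f)) *
             0.5f;
    }

    int main()
    {
      for (float x : {0.5f, 1.0f, 2.5f, 10.0f, 100.0f}) {
        std::printf("x=%-6g nemes=%-12g libm=%-12g\n", x, lgammaf_nemes(x), std::lgamma(x));
      }
      return 0;
    }
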
diff --git a/intern/cycles/util/path.cpp b/intern/cycles/util/path.cpp
index aad790482d5..243b3187f2e 100644
--- a/intern/cycles/util/path.cpp
+++ b/intern/cycles/util/path.cpp
@@ -750,6 +750,170 @@ bool path_remove(const string &path)
   return remove(path.c_str()) == 0;
 }
 
+struct SourceReplaceState {
+  typedef map<string, string> ProcessedMapping;
+  /* Base directory for all relative include headers. */
+  string base;
+  /* Result of processed files. */
+  ProcessedMapping processed_files;
+  /* Set of files containing #pragma once which have been included. */
+  set<string> pragma_onced;
+};
+
+static string path_source_replace_includes_recursive(const string &source,
+                                                     const string &source_filepath,
+                                                     SourceReplaceState *state);
+
+static string path_source_handle_preprocessor(const string &preprocessor_line,
+                                              const string &source_filepath,
+                                              const size_t line_number,
+                                              SourceReplaceState *state)
+{
+  string result = preprocessor_line;
+
+  string rest_of_line = string_strip(preprocessor_line.substr(1));
+
+  if (0 == strncmp(rest_of_line.c_str(), "include", 7)) {
+    rest_of_line = string_strip(rest_of_line.substr(8));
+    if (rest_of_line[0] == '"') {
+      const size_t n_start = 1;
+      const size_t n_end = rest_of_line.find("\"", n_start);
+      const string filename = rest_of_line.substr(n_start, n_end - n_start);
+
+      string filepath = path_join(state->base, filename);
+      if (!path_exists(filepath)) {
+        filepath = path_join(path_dirname(source_filepath), filename);
+      }
+      string text;
+      if (path_read_text(filepath, text)) {
+        text = path_source_replace_includes_recursive(text, filepath, state);
+        /* Use line directives for better error messages. */
+        return "\n" + text + "\n";
+      }
+    }
+  }
+
+  return result;
+}
+
+/* Our own little C preprocessor that replaces #includes with the file
+ * contents, to work around the issue of OpenCL drivers not supporting
+ * include paths with spaces in them.
+ */
+static string path_source_replace_includes_recursive(const string &_source,
+                                                     const string &source_filepath,
+                                                     SourceReplaceState *state)
+{
+  const string *psource = &_source;
+  string source_new;
+
+  auto pragma_once = _source.find("#pragma once");
+  if (pragma_once != string::npos) {
+    if (state->pragma_onced.find(source_filepath) != state->pragma_onced.end()) {
+      return "";
+    }
+    state->pragma_onced.insert(source_filepath);
+
+    //  "#pragma once"
+    //  "//prgma once"
+    source_new = _source;
+    memcpy(source_new.data() + pragma_once, "//pr", 4);
+    psource = &source_new;
+  }
+
+  /* Try to re-use processed file without spending time on replacing all
+   * include directives again.
+   */
+  SourceReplaceState::ProcessedMapping::iterator replaced_file = state->processed_files.find(
+      source_filepath);
+  if (replaced_file != state->processed_files.end()) {
+    return replaced_file->second;
+  }
+
+  const string &source = *psource;
+
+  /* Perform full file processing. */
+  string result = "";
+  const size_t source_length = source.length();
+  size_t index = 0;
+  /* Information about where we are in the source. */
+  size_t line_number = 0, column_number = 1;
+  /* Currently gathered non-preprocessor token.
+   * Store as start/length rather than token itself to avoid overhead of
+   * memory re-allocations on each character concatenation.
+   */
+  size_t token_start = 0, token_length = 0;
+  /* Denotes whether we're inside of preprocessor line, together with
+   * preprocessor line itself.
+   *
+   * TODO(sergey): Investigate whether using token start/end position
+   * gives measurable speedup.
+   */
+  bool inside_preprocessor = false;
+  string preprocessor_line = "";
+  /* Actual loop over the whole source. */
+  while (index < source_length) {
+    char ch = source[index];
+
+    if (ch == '\n') {
+      if (inside_preprocessor) {
+        string block = path_source_handle_preprocessor(
+            preprocessor_line, source_filepath, line_number, state);
+
+        if (!block.empty()) {
+          result += block;
+        }
+
+        /* Start gathering the next part of the token. */
+        token_start = index;
+        token_length = 0;
+        inside_preprocessor = false;
+        preprocessor_line = "";
+      }
+      column_number = 0;
+      ++line_number;
+    }
+    else if (ch == '#' && column_number == 1 && !inside_preprocessor) {
+      /* Append any gathered non-preprocessor tokens to the result. */
+      if (token_length != 0) {
+        result.append(source, token_start, token_length);
+        token_start = index;
+        token_length = 0;
+      }
+      inside_preprocessor = true;
+    }
+
+    if (inside_preprocessor) {
+      preprocessor_line += ch;
+    }
+    else {
+      ++token_length;
+    }
+    ++index;
+    ++column_number;
+  }
+  /* Append possible tokens which happened before special events handled
+   * above.
+   */
+  if (token_length != 0) {
+    result.append(source, token_start, token_length);
+  }
+  if (inside_preprocessor) {
+    result += path_source_handle_preprocessor(
+        preprocessor_line, source_filepath, line_number, state);
+  }
+  /* Store result for further reuse. */
+  state->processed_files[source_filepath] = result;
+  return result;
+}
+
+string path_source_replace_includes(const string &source, const string &path)
+{
+  SourceReplaceState state;
+  state.base = path;
+  return path_source_replace_includes_recursive(source, path, &state);
+}
+
 FILE *path_fopen(const string &path, const string &mode)
 {
 #ifdef _WIN32
diff --git a/intern/cycles/util/path.h b/intern/cycles/util/path.h
index a1394555302..7ec5ed60d7f 100644
--- a/intern/cycles/util/path.h
+++ b/intern/cycles/util/path.h
@@ -66,6 +66,9 @@ bool path_read_text(const string &path, string &text);
 /* File manipulation. */
 bool path_remove(const string &path);
 
+/* source code utility */
+string path_source_replace_includes(const string &source, const string &path);
+
 /* cache utility */
 void path_cache_clear_except(const string &name, const set<string> &except);
 
-- 
cgit v1.2.3
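
As a closing illustration of what path_source_replace_includes produces: the recursion expands each #include "..." directive into the included file's contents, so the Metal compiler can be handed a single self-contained source string. A simplified, self-contained sketch of the same idea over an in-memory file map (no #pragma once handling, caching or disk I/O; all names hypothetical):

    #include <map>
    #include <sstream>
    #include <string>

    /* Expand #include "..." lines from an in-memory "filesystem". */
    static std::string expand_includes(const std::string &source,
                                       const std::map<std::string, std::string> &files)
    {
      std::istringstream stream(source);
      std::string line, result;
      while (std::getline(stream, line)) {
        const size_t pos = line.find("#include \"");
        if (pos != std::string::npos) {
          const size_t start = pos + 10;
          const size_t end = line.find('"', start);
          const auto it = files.find(line.substr(start, end - start));
          if (it != files.end()) {
            /* Recursively splice in the included file instead of the directive. */
            result += expand_includes(it->second, files) + "\n";
            continue;
          }
        }
        result += line + "\n";
      }
      return result;
    }

    /* e.g. expand_includes("#include \"a.h\"\nmain", {{"a.h", "int x;"}})
     * yields "int x;\n\nmain\n". */
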