Diffstat (limited to 'intern/cycles/device')
33 files changed, 1681 insertions, 309 deletions
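A recurring change in the hunks below is how const_copy_to() works on the CUDA and HIP backends: instead of looking up one module global per symbol name, the device now fetches a single "kernel_params" global and copies each named value to its offsetof() within that struct, with the name-to-member dispatch generated from kernel/data_arrays.h via the KERNEL_DATA_ARRAY X-macro. The following is a minimal standalone sketch of that dispatch pattern only — the struct, member names, and entries are hypothetical, and plain memcpy stands in for cuMemcpyHtoD/hipMemcpyHtoD; in the real code the entry list comes from re-including kernel/data_arrays.h rather than being written out by hand.

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <cstring>

/* Hypothetical stand-in for the real KernelParams layout on the device. */
struct KernelParamsSketch {
  const float *data;      /* kernel data block */
  const int *object_node; /* one member per KERNEL_DATA_ARRAY entry */
  const float *tri_verts;
};

/* Stands in for the single "kernel_params" module global the new code fetches. */
static KernelParamsSketch params_on_device;

/* Name-based dispatch: each entry copies the incoming host bytes into the
 * member of the same name, at that member's offset inside the params struct. */
static void const_copy_to(const char *name, const void *host, size_t size)
{
#define KERNEL_DATA_ARRAY(type, member) \
  if (strcmp(name, #member) == 0) { \
    assert(size <= sizeof(type)); /* incoming copy must fit the member */ \
    memcpy((char *)&params_on_device + offsetof(KernelParamsSketch, member), host, size); \
    return; \
  }
  KERNEL_DATA_ARRAY(const float *, data)
  KERNEL_DATA_ARRAY(const int *, object_node)
  KERNEL_DATA_ARRAY(const float *, tri_verts)
#undef KERNEL_DATA_ARRAY
}

int main()
{
  static const int dummy_nodes[4] = {0, 1, 2, 3};
  const int *device_ptr = dummy_nodes;
  /* Update one "array" pointer by name, as the render code does per data array. */
  const_copy_to("object_node", &device_ptr, sizeof(device_ptr));
  printf("object_node now points at %p\n", (const void *)params_on_device.object_node);
  return 0;
}

Compared with the previous per-name cuModuleGetGlobal()/hipModuleGetGlobal() lookup, this keeps a single module global and makes the set of updatable members exactly the list in kernel/data_arrays.h, which is what the hunks below implement for CUDA, HIP, and Metal.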
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 6205775260a..24855d795d1 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -3,12 +3,9 @@ set(INC .. - ../../glew-mx ) -set(INC_SYS - ${GLEW_INCLUDE_DIR} -) +set(INC_SYS ) if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA) if(WITH_CUDA_DYNLOAD) @@ -22,6 +19,8 @@ if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA) ) add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}") endif() + + add_definitions(-DCYCLES_RUNTIME_OPTIX_ROOT_DIR="${CYCLES_RUNTIME_OPTIX_ROOT_DIR}") endif() if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD) @@ -82,6 +81,15 @@ set(SRC_HIP hip/util.h ) +set(SRC_ONEAPI + oneapi/device_impl.cpp + oneapi/device_impl.h + oneapi/device.cpp + oneapi/device.h + oneapi/queue.cpp + oneapi/queue.h +) + set(SRC_DUMMY dummy/device.cpp dummy/device.h @@ -134,13 +142,13 @@ set(SRC ${SRC_DUMMY} ${SRC_MULTI} ${SRC_OPTIX} + ${SRC_ONEAPI} ${SRC_HEADERS} ) set(LIB cycles_kernel cycles_util - ${CYCLES_GL_LIBRARIES} ) if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA) @@ -161,8 +169,6 @@ if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD) ) endif() -add_definitions(${GL_DEFINITIONS}) - if(WITH_CYCLES_DEVICE_CUDA) add_definitions(-DWITH_CUDA) endif() @@ -181,6 +187,9 @@ if(WITH_CYCLES_DEVICE_METAL) ${SRC_METAL} ) endif() +if (WITH_CYCLES_DEVICE_ONEAPI) + add_definitions(-DWITH_ONEAPI) +endif() if(WITH_OPENIMAGEDENOISE) list(APPEND LIB @@ -193,6 +202,11 @@ include_directories(SYSTEM ${INC_SYS}) cycles_add_library(cycles_device "${LIB}" ${SRC}) +if(WITH_CYCLES_DEVICE_ONEAPI) + # Need to have proper rebuilding in case of changes in cycles_kernel_oneapi due external project behaviour + add_dependencies(cycles_device cycles_kernel_oneapi) +endif() + source_group("cpu" FILES ${SRC_CPU}) source_group("cuda" FILES ${SRC_CUDA}) source_group("dummy" FILES ${SRC_DUMMY}) @@ -200,4 +214,5 @@ source_group("hip" FILES ${SRC_HIP}) source_group("multi" FILES ${SRC_MULTI}) source_group("metal" FILES ${SRC_METAL}) source_group("optix" FILES ${SRC_OPTIX}) +source_group("oneapi" FILES ${SRC_ONEAPI}) source_group("common" FILES ${SRC_BASE} ${SRC_HEADERS}) diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp index 612c391f7d5..1e4b9baa0c0 100644 --- a/intern/cycles/device/cpu/device_impl.cpp +++ b/intern/cycles/device/cpu/device_impl.cpp @@ -51,12 +51,12 @@ CCL_NAMESPACE_BEGIN CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) - : Device(info_, stats_, profiler_), texture_info(this, "__texture_info", MEM_GLOBAL) + : Device(info_, stats_, profiler_), texture_info(this, "texture_info", MEM_GLOBAL) { /* Pick any kernel, all of them are supposed to have same level of microarchitecture * optimization. */ - VLOG(1) << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name() - << " CPU kernels."; + VLOG_INFO << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name() + << " CPU kernels."; if (info.cpu_threads == 0) { info.cpu_threads = TaskScheduler::max_concurrency(); @@ -111,9 +111,9 @@ void CPUDevice::mem_alloc(device_memory &mem) } else { if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; + VLOG_WORK << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; } if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) { @@ -192,12 +192,12 @@ device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_ void CPUDevice::const_copy_to(const char *name, void *host, size_t size) { #ifdef WITH_EMBREE - if (strcmp(name, "__data") == 0) { + if (strcmp(name, "data") == 0) { assert(size <= sizeof(KernelData)); // Update scene handle (since it is different for each device on multi devices) KernelData *const data = (KernelData *)host; - data->bvh.scene = embree_scene; + data->device_bvh = embree_scene; } #endif kernel_const_copy(&kernel_globals, name, host, size); @@ -205,9 +205,9 @@ void CPUDevice::const_copy_to(const char *name, void *host, size_t size) void CPUDevice::global_alloc(device_memory &mem) { - VLOG(1) << "Global memory allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; + VLOG_WORK << "Global memory allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size); @@ -227,9 +227,9 @@ void CPUDevice::global_free(device_memory &mem) void CPUDevice::tex_alloc(device_texture &mem) { - VLOG(1) << "Texture allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; + VLOG_WORK << "Texture allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; mem.device_pointer = (device_ptr)mem.host_pointer; mem.device_size = mem.memory_size(); diff --git a/intern/cycles/device/cuda/device.cpp b/intern/cycles/device/cuda/device.cpp index 400490336d6..5a213c45b71 100644 --- a/intern/cycles/device/cuda/device.cpp +++ b/intern/cycles/device/cuda/device.cpp @@ -29,24 +29,25 @@ bool device_cuda_init() initialized = true; int cuew_result = cuewInit(CUEW_INIT_CUDA); if (cuew_result == CUEW_SUCCESS) { - VLOG(1) << "CUEW initialization succeeded"; + VLOG_INFO << "CUEW initialization succeeded"; if (CUDADevice::have_precompiled_kernels()) { - VLOG(1) << "Found precompiled kernels"; + VLOG_INFO << "Found precompiled kernels"; result = true; } else if (cuewCompilerPath() != NULL) { - VLOG(1) << "Found CUDA compiler " << cuewCompilerPath(); + VLOG_INFO << "Found CUDA compiler " << cuewCompilerPath(); result = true; } else { - VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found," - << " unable to use CUDA"; + VLOG_INFO << "Neither precompiled kernels nor CUDA compiler was found," + << " unable to use CUDA"; } } else { - VLOG(1) << "CUEW initialization failed: " - << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" : - "Error opening the library"); + VLOG_WARNING << "CUEW initialization failed: " + << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED) ? 
+ "Error setting up atexit() handler" : + "Error opening the library"); } return result; @@ -121,7 +122,8 @@ void device_cuda_info(vector<DeviceInfo> &devices) int major; cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num); if (major < 3) { - VLOG(1) << "Ignoring device \"" << name << "\", this graphics card is no longer supported."; + VLOG_INFO << "Ignoring device \"" << name + << "\", this graphics card is no longer supported."; continue; } @@ -166,21 +168,21 @@ void device_cuda_info(vector<DeviceInfo> &devices) * Windows 10 even when it is, due to an issue in application profiles. * Detect case where we expect it to be available and override. */ if (preempt_attr == 0 && (major >= 6) && system_windows_version_at_least(10, 17134)) { - VLOG(1) << "Assuming device has compute preemption on Windows 10."; + VLOG_INFO << "Assuming device has compute preemption on Windows 10."; preempt_attr = 1; } if (timeout_attr && !preempt_attr) { - VLOG(1) << "Device is recognized as display."; + VLOG_INFO << "Device is recognized as display."; info.description += " (Display)"; info.display_device = true; display_devices.push_back(info); } else { - VLOG(1) << "Device has compute preemption or is not used for display."; + VLOG_INFO << "Device has compute preemption or is not used for display."; devices.push_back(info); } - VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\"."; + VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\"."; } if (!display_devices.empty()) diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp index cb7e909a2d5..01c021551f3 100644 --- a/intern/cycles/device/cuda/device_impl.cpp +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -23,6 +23,8 @@ # include "util/types.h" # include "util/windows.h" +# include "kernel/device/cuda/globals.h" + CCL_NAMESPACE_BEGIN class CUDADevice; @@ -51,7 +53,7 @@ void CUDADevice::set_error(const string &error) } CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) - : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) + : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL) { first_error = true; @@ -244,9 +246,9 @@ string CUDADevice::compile_kernel(const uint kernel_features, if (!use_adaptive_compilation()) { if (!force_ptx) { const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); - VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; + VLOG_INFO << "Testing for pre-compiled kernel " << cubin << "."; if (path_exists(cubin)) { - VLOG(1) << "Using precompiled kernel."; + VLOG_INFO << "Using precompiled kernel."; return cubin; } } @@ -256,9 +258,9 @@ string CUDADevice::compile_kernel(const uint kernel_features, while (ptx_major >= 3) { const string ptx = path_get( string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); - VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + VLOG_INFO << "Testing for pre-compiled kernel " << ptx << "."; if (path_exists(ptx)) { - VLOG(1) << "Using precompiled kernel."; + VLOG_INFO << "Using precompiled kernel."; return ptx; } @@ -287,9 +289,9 @@ string CUDADevice::compile_kernel(const uint kernel_features, const string cubin_file = string_printf( "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext); const string cubin = path_cache_get(path_join("kernels", cubin_file)); - VLOG(1) << "Testing for locally compiled kernel " << 
cubin << "."; + VLOG_INFO << "Testing for locally compiled kernel " << cubin << "."; if (path_exists(cubin)) { - VLOG(1) << "Using locally compiled kernel."; + VLOG_INFO << "Using locally compiled kernel."; return cubin; } @@ -323,7 +325,7 @@ string CUDADevice::compile_kernel(const uint kernel_features, } const int nvcc_cuda_version = cuewCompilerVersion(); - VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << "."; + VLOG_INFO << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << "."; if (nvcc_cuda_version < 101) { printf( "Unsupported CUDA version %d.%d detected, " @@ -399,7 +401,8 @@ bool CUDADevice::load_kernels(const uint kernel_features) */ if (cuModule) { if (use_adaptive_compilation()) { - VLOG(1) << "Skipping CUDA kernel reload for adaptive compilation, not currently supported."; + VLOG_INFO + << "Skipping CUDA kernel reload for adaptive compilation, not currently supported."; } return true; } @@ -481,8 +484,8 @@ void CUDADevice::reserve_local_memory(const uint kernel_features) cuMemGetInfo(&free_after, &total); } - VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) - << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; + VLOG_INFO << "Local memory reserved " << string_human_readable_number(free_before - free_after) + << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; # if 0 /* For testing mapped host memory, fill up device memory. */ @@ -513,7 +516,7 @@ void CUDADevice::init_host_memory() } } else { - VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM"; map_host_limit = 0; } @@ -524,8 +527,8 @@ void CUDADevice::init_host_memory() device_working_headroom = 32 * 1024 * 1024LL; // 32MB device_texture_headroom = 128 * 1024 * 1024LL; // 128MB - VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) - << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; + VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) + << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; } void CUDADevice::load_texture_info() @@ -593,7 +596,7 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture) * multiple CUDA devices could be moving the memory. The * first one will do it, and the rest will adopt the pointer. */ if (max_mem) { - VLOG(1) << "Move memory from device to host: " << max_mem->name; + VLOG_WORK << "Move memory from device to host: " << max_mem->name; static thread_mutex move_mutex; thread_scoped_lock lock(move_mutex); @@ -701,9 +704,9 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_ } if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")" << status; + VLOG_WORK << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")" << status; } mem.device_pointer = (device_ptr)device_pointer; @@ -899,9 +902,19 @@ void CUDADevice::const_copy_to(const char *name, void *host, size_t size) CUdeviceptr mem; size_t bytes; - cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); - // assert(bytes == size); - cuda_assert(cuMemcpyHtoD(mem, host, size)); + cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, "kernel_params")); + assert(bytes == sizeof(KernelParamsCUDA)); + + /* Update data storage pointers in launch parameters. */ +# define KERNEL_DATA_ARRAY(data_type, data_name) \ + if (strcmp(name, #data_name) == 0) { \ + cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, data_name), host, size)); \ + return; \ + } + KERNEL_DATA_ARRAY(KernelData, data) + KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state) +# include "kernel/data_arrays.h" +# undef KERNEL_DATA_ARRAY } void CUDADevice::global_alloc(device_memory &mem) @@ -925,7 +938,6 @@ void CUDADevice::tex_alloc(device_texture &mem) { CUDAContextScope scope(this); - string bind_name = mem.name; size_t dsize = datatype_size(mem.data_type); size_t size = mem.memory_size(); @@ -1008,9 +1020,9 @@ void CUDADevice::tex_alloc(device_texture &mem) desc.NumChannels = mem.data_elements; desc.Flags = 0; - VLOG(1) << "Array 3D allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; + VLOG_WORK << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; cuda_assert(cuArray3DCreate(&array_3d, &desc)); @@ -1190,11 +1202,11 @@ bool CUDADevice::should_use_graphics_interop() } vector<CUdevice> gl_devices(num_all_devices); - uint num_gl_devices; + uint num_gl_devices = 0; cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL); - for (CUdevice gl_device : gl_devices) { - if (gl_device == cuDevice) { + for (uint i = 0; i < num_gl_devices; ++i) { + if (gl_devices[i] == cuDevice) { return true; } } diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp index 38c71866ad0..5912e68a92b 100644 --- a/intern/cycles/device/cuda/queue.cpp +++ b/intern/cycles/device/cuda/queue.cpp @@ -39,12 +39,12 @@ int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const num_states = max((int)(num_states * factor), 1024); } else { - VLOG(3) << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0"; + VLOG_DEVICE_STATS << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0"; } } - VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to " - << string_human_readable_size(num_states * state_size); + VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to " + << string_human_readable_size(num_states * state_size); return num_states; } diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index ea5b3c6dc8c..ace6ed517f5 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -16,6 +16,7 @@ #include "device/hip/device.h" #include "device/metal/device.h" #include "device/multi/device.h" +#include "device/oneapi/device.h" #include "device/optix/device.h" #include "util/foreach.h" @@ -39,6 +40,7 @@ vector<DeviceInfo> Device::optix_devices; vector<DeviceInfo> Device::cpu_devices; vector<DeviceInfo> Device::hip_devices; vector<DeviceInfo> Device::metal_devices; 
+vector<DeviceInfo> Device::oneapi_devices; uint Device::devices_initialized_mask = 0; /* Device */ @@ -101,6 +103,13 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler) device = device_metal_create(info, stats, profiler); break; #endif + +#ifdef WITH_ONEAPI + case DEVICE_ONEAPI: + device = device_oneapi_create(info, stats, profiler); + break; +#endif + default: break; } @@ -126,6 +135,8 @@ DeviceType Device::type_from_string(const char *name) return DEVICE_HIP; else if (strcmp(name, "METAL") == 0) return DEVICE_METAL; + else if (strcmp(name, "ONEAPI") == 0) + return DEVICE_ONEAPI; return DEVICE_NONE; } @@ -144,6 +155,8 @@ string Device::string_from_type(DeviceType type) return "HIP"; else if (type == DEVICE_METAL) return "METAL"; + else if (type == DEVICE_ONEAPI) + return "ONEAPI"; return ""; } @@ -164,6 +177,9 @@ vector<DeviceType> Device::available_types() #ifdef WITH_METAL types.push_back(DEVICE_METAL); #endif +#ifdef WITH_ONEAPI + types.push_back(DEVICE_ONEAPI); +#endif return types; } @@ -219,6 +235,20 @@ vector<DeviceInfo> Device::available_devices(uint mask) } #endif +#ifdef WITH_ONEAPI + if (mask & DEVICE_MASK_ONEAPI) { + if (!(devices_initialized_mask & DEVICE_MASK_ONEAPI)) { + if (device_oneapi_init()) { + device_oneapi_info(oneapi_devices); + } + devices_initialized_mask |= DEVICE_MASK_ONEAPI; + } + foreach (DeviceInfo &info, oneapi_devices) { + devices.push_back(info); + } + } +#endif + if (mask & DEVICE_MASK_CPU) { if (!(devices_initialized_mask & DEVICE_MASK_CPU)) { device_cpu_info(cpu_devices); @@ -282,6 +312,15 @@ string Device::device_capabilities(uint mask) } #endif +#ifdef WITH_ONEAPI + if (mask & DEVICE_MASK_ONEAPI) { + if (device_oneapi_init()) { + capabilities += "\noneAPI device capabilities:\n"; + capabilities += device_oneapi_capabilities(); + } + } +#endif + #ifdef WITH_METAL if (mask & DEVICE_MASK_METAL) { if (device_metal_init()) { @@ -325,8 +364,8 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, int orig_cpu_threads = (threads) ? 
threads : TaskScheduler::max_concurrency(); int cpu_threads = max(orig_cpu_threads - (subdevices.size() - 1), size_t(0)); - VLOG(1) << "CPU render threads reduced from " << orig_cpu_threads << " to " << cpu_threads - << ", to dedicate to GPU."; + VLOG_INFO << "CPU render threads reduced from " << orig_cpu_threads << " to " + << cpu_threads << ", to dedicate to GPU."; if (cpu_threads >= 1) { DeviceInfo cpu_device = device; @@ -338,7 +377,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, } } else { - VLOG(1) << "CPU render threads disabled for interactive render."; + VLOG_INFO << "CPU render threads disabled for interactive render."; continue; } } @@ -380,6 +419,7 @@ void Device::free_memory() cuda_devices.free_memory(); optix_devices.free_memory(); hip_devices.free_memory(); + oneapi_devices.free_memory(); cpu_devices.free_memory(); metal_devices.free_memory(); } diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 927caae600c..cdb13ca0a97 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -29,6 +29,7 @@ class DeviceQueue; class Progress; class CPUKernels; class CPUKernelThreadGlobals; +class Scene; /* Device Types */ @@ -40,6 +41,7 @@ enum DeviceType { DEVICE_OPTIX, DEVICE_HIP, DEVICE_METAL, + DEVICE_ONEAPI, DEVICE_DUMMY, }; @@ -49,6 +51,7 @@ enum DeviceTypeMask { DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX), DEVICE_MASK_HIP = (1 << DEVICE_HIP), DEVICE_MASK_METAL = (1 << DEVICE_METAL), + DEVICE_MASK_ONEAPI = (1 << DEVICE_ONEAPI), DEVICE_MASK_ALL = ~0 }; @@ -184,6 +187,11 @@ class Device { return 0; } + /* Called after kernel texture setup, and prior to integrator state setup. */ + virtual void optimize_for_scene(Scene * /*scene*/) + { + } + virtual bool is_resident(device_ptr /*key*/, Device *sub_device) { /* Memory is always resident if this is not a multi device, regardless of whether the pointer @@ -273,6 +281,7 @@ class Device { static vector<DeviceInfo> cpu_devices; static vector<DeviceInfo> hip_devices; static vector<DeviceInfo> metal_devices; + static vector<DeviceInfo> oneapi_devices; static uint devices_initialized_mask; }; diff --git a/intern/cycles/device/hip/device.cpp b/intern/cycles/device/hip/device.cpp index d6a5ed9c419..3c9c73e7db0 100644 --- a/intern/cycles/device/hip/device.cpp +++ b/intern/cycles/device/hip/device.cpp @@ -29,30 +29,31 @@ bool device_hip_init() initialized = true; int hipew_result = hipewInit(HIPEW_INIT_HIP); if (hipew_result == HIPEW_SUCCESS) { - VLOG(1) << "HIPEW initialization succeeded"; + VLOG_INFO << "HIPEW initialization succeeded"; if (HIPDevice::have_precompiled_kernels()) { - VLOG(1) << "Found precompiled kernels"; + VLOG_INFO << "Found precompiled kernels"; result = true; } else if (hipewCompilerPath() != NULL) { - VLOG(1) << "Found HIPCC " << hipewCompilerPath(); + VLOG_INFO << "Found HIPCC " << hipewCompilerPath(); result = true; } else { - VLOG(1) << "Neither precompiled kernels nor HIPCC was found," - << " unable to use HIP"; + VLOG_INFO << "Neither precompiled kernels nor HIPCC was found," + << " unable to use HIP"; } } else { if (hipew_result == HIPEW_ERROR_ATEXIT_FAILED) { - VLOG(1) << "HIPEW initialization failed: Error setting up atexit() handler"; + VLOG_WARNING << "HIPEW initialization failed: Error setting up atexit() handler"; } else if (hipew_result == HIPEW_ERROR_OLD_DRIVER) { - VLOG(1) << "HIPEW initialization failed: Driver version too old, requires AMD Radeon Pro " - "21.Q4 driver or newer"; + VLOG_WARNING + << "HIPEW initialization failed: 
Driver version too old, requires AMD Radeon Pro " + "21.Q4 driver or newer"; } else { - VLOG(1) << "HIPEW initialization failed: Error opening HIP dynamic library"; + VLOG_WARNING << "HIPEW initialization failed: Error opening HIP dynamic library"; } } @@ -165,16 +166,16 @@ void device_hip_info(vector<DeviceInfo> &devices) hipDeviceGetAttribute(&timeout_attr, hipDeviceAttributeKernelExecTimeout, num); if (timeout_attr && !preempt_attr) { - VLOG(1) << "Device is recognized as display."; + VLOG_INFO << "Device is recognized as display."; info.description += " (Display)"; info.display_device = true; display_devices.push_back(info); } else { - VLOG(1) << "Device has compute preemption or is not used for display."; + VLOG_INFO << "Device has compute preemption or is not used for display."; devices.push_back(info); } - VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\"."; + VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\"."; } if (!display_devices.empty()) diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp index ea68c821166..a84f1edd70e 100644 --- a/intern/cycles/device/hip/device_impl.cpp +++ b/intern/cycles/device/hip/device_impl.cpp @@ -16,7 +16,6 @@ # include "util/log.h" # include "util/map.h" # include "util/md5.h" -# include "util/opengl.h" # include "util/path.h" # include "util/string.h" # include "util/system.h" @@ -24,6 +23,8 @@ # include "util/types.h" # include "util/windows.h" +# include "kernel/device/hip/globals.h" + CCL_NAMESPACE_BEGIN class HIPDevice; @@ -52,7 +53,7 @@ void HIPDevice::set_error(const string &error) } HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) - : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) + : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL) { first_error = true; @@ -233,9 +234,9 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c /* Attempt to use kernel provided with Blender. 
*/ if (!use_adaptive_compilation()) { const string fatbin = path_get(string_printf("lib/%s_%s.fatbin", name, arch)); - VLOG(1) << "Testing for pre-compiled kernel " << fatbin << "."; + VLOG_INFO << "Testing for pre-compiled kernel " << fatbin << "."; if (path_exists(fatbin)) { - VLOG(1) << "Using precompiled kernel."; + VLOG_INFO << "Using precompiled kernel."; return fatbin; } } @@ -265,9 +266,9 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c const string include_path = source_path; const string fatbin_file = string_printf("cycles_%s_%s_%s", name, arch, kernel_md5.c_str()); const string fatbin = path_cache_get(path_join("kernels", fatbin_file)); - VLOG(1) << "Testing for locally compiled kernel " << fatbin << "."; + VLOG_INFO << "Testing for locally compiled kernel " << fatbin << "."; if (path_exists(fatbin)) { - VLOG(1) << "Using locally compiled kernel."; + VLOG_INFO << "Using locally compiled kernel."; return fatbin; } @@ -301,7 +302,7 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c } const int hipcc_hip_version = hipewCompilerVersion(); - VLOG(1) << "Found hipcc " << hipcc << ", HIP version " << hipcc_hip_version << "."; + VLOG_INFO << "Found hipcc " << hipcc << ", HIP version " << hipcc_hip_version << "."; if (hipcc_hip_version < 40) { printf( "Unsupported HIP version %d.%d detected, " @@ -361,7 +362,7 @@ bool HIPDevice::load_kernels(const uint kernel_features) */ if (hipModule) { if (use_adaptive_compilation()) { - VLOG(1) << "Skipping HIP kernel reload for adaptive compilation, not currently supported."; + VLOG_INFO << "Skipping HIP kernel reload for adaptive compilation, not currently supported."; } return true; } @@ -444,8 +445,8 @@ void HIPDevice::reserve_local_memory(const uint kernel_features) hipMemGetInfo(&free_after, &total); } - VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) - << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; + VLOG_INFO << "Local memory reserved " << string_human_readable_number(free_before - free_after) + << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; # if 0 /* For testing mapped host memory, fill up device memory. */ @@ -476,7 +477,7 @@ void HIPDevice::init_host_memory() } } else { - VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM"; map_host_limit = 0; } @@ -487,8 +488,8 @@ void HIPDevice::init_host_memory() device_working_headroom = 32 * 1024 * 1024LL; // 32MB device_texture_headroom = 128 * 1024 * 1024LL; // 128MB - VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) - << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; + VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) + << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; } void HIPDevice::load_texture_info() @@ -556,7 +557,7 @@ void HIPDevice::move_textures_to_host(size_t size, bool for_texture) * multiple HIP devices could be moving the memory. The * first one will do it, and the rest will adopt the pointer. 
*/ if (max_mem) { - VLOG(1) << "Move memory from device to host: " << max_mem->name; + VLOG_WORK << "Move memory from device to host: " << max_mem->name; static thread_mutex move_mutex; thread_scoped_lock lock(move_mutex); @@ -658,9 +659,9 @@ HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_pad } if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")" << status; + VLOG_WORK << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")" << status; } mem.device_pointer = (device_ptr)device_pointer; @@ -856,8 +857,19 @@ void HIPDevice::const_copy_to(const char *name, void *host, size_t size) hipDeviceptr_t mem; size_t bytes; - hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, name)); - hip_assert(hipMemcpyHtoD(mem, host, size)); + hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, "kernel_params")); + assert(bytes == sizeof(KernelParamsHIP)); + + /* Update data storage pointers in launch parameters. */ +# define KERNEL_DATA_ARRAY(data_type, data_name) \ + if (strcmp(name, #data_name) == 0) { \ + hip_assert(hipMemcpyHtoD(mem + offsetof(KernelParamsHIP, data_name), host, size)); \ + return; \ + } + KERNEL_DATA_ARRAY(KernelData, data) + KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state) +# include "kernel/data_arrays.h" +# undef KERNEL_DATA_ARRAY } void HIPDevice::global_alloc(device_memory &mem) @@ -881,7 +893,6 @@ void HIPDevice::tex_alloc(device_texture &mem) { HIPContextScope scope(this); - string bind_name = mem.name; size_t dsize = datatype_size(mem.data_type); size_t size = mem.memory_size(); @@ -966,9 +977,9 @@ void HIPDevice::tex_alloc(device_texture &mem) desc.NumChannels = mem.data_elements; desc.Flags = 0; - VLOG(1) << "Array 3D allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; + VLOG_WORK << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; hip_assert(hipArray3DCreate((hArray *)&array_3d, &desc)); diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp index 6c2c2c29624..8b3d963a32f 100644 --- a/intern/cycles/device/hip/queue.cpp +++ b/intern/cycles/device/hip/queue.cpp @@ -39,12 +39,12 @@ int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const num_states = max((int)(num_states * factor), 1024); } else { - VLOG(3) << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0"; + VLOG_DEVICE_STATS << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0"; } } - VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to " - << string_human_readable_size(num_states * state_size); + VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to " + << string_human_readable_size(num_states * state_size); return num_states; } diff --git a/intern/cycles/device/hip/util.h b/intern/cycles/device/hip/util.h index adb68a2d44c..4e4906171d1 100644 --- a/intern/cycles/device/hip/util.h +++ b/intern/cycles/device/hip/util.h @@ -51,7 +51,7 @@ static inline bool hipSupportsDevice(const int hipDevId) hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId); hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId); - return (major > 10) || (major == 10 && minor >= 1); + return (major >= 9); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/memory.h b/intern/cycles/device/memory.h index 55d6d39cef8..5f44475077e 100644 --- a/intern/cycles/device/memory.h +++ b/intern/cycles/device/memory.h @@ -350,7 +350,7 @@ template<typename T> class device_only_memory : public device_memory { * * When using memory type MEM_GLOBAL, a pointer to this memory will be * automatically attached to kernel globals, using the provided name - * matching an entry in kernel_textures.h. */ + * matching an entry in kernel/data_arrays.h. */ template<typename T> class device_vector : public device_memory { public: diff --git a/intern/cycles/device/metal/device.mm b/intern/cycles/device/metal/device.mm index d7f190fc01e..51e3323370a 100644 --- a/intern/cycles/device/metal/device.mm +++ b/intern/cycles/device/metal/device.mm @@ -34,7 +34,8 @@ void device_metal_info(vector<DeviceInfo> &devices) int device_index = 0; for (id<MTLDevice> &device : usable_devices) { /* Compute unique ID for persistent user preferences. */ - string device_name = [device.name UTF8String]; + string device_name = MetalInfo::get_device_name(device); + string id = string("METAL_") + device_name; /* Hardware ID might not be unique, add device number in that case. */ @@ -48,12 +49,6 @@ void device_metal_info(vector<DeviceInfo> &devices) info.type = DEVICE_METAL; info.description = string_remove_trademark(string(device_name)); - /* Ensure unique naming on Apple Silicon / SoC devices which return the same string for CPU and - * GPU */ - if (info.description == system_cpu_brand_string()) { - info.description += " (GPU)"; - } - info.num = device_index; /* We don't know if it's used for display, but assume it is. 
*/ info.display_device = true; @@ -69,14 +64,15 @@ string device_metal_capabilities() { string result = ""; auto allDevices = MTLCopyAllDevices(); - uint32_t num_devices = allDevices.count; + uint32_t num_devices = (uint32_t)allDevices.count; if (num_devices == 0) { return "No Metal devices found\n"; } result += string_printf("Number of devices: %u\n", num_devices); for (id<MTLDevice> device in allDevices) { - result += string_printf("\t\tDevice: %s\n", [device.name UTF8String]); + string device_name = MetalInfo::get_device_name(device); + result += string_printf("\t\tDevice: %s\n", device_name.c_str()); } return result; diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 0e6817d94f8..99e60d3a788 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -42,7 +42,6 @@ class MetalDevice : public Device { nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */ /*---------------------------------------------------*/ - string device_name; MetalGPUVendor device_vendor; uint kernel_features; @@ -76,7 +75,8 @@ class MetalDevice : public Device { std::vector<id<MTLTexture>> texture_slot_map; bool use_metalrt = false; - bool use_function_specialisation = false; + MetalPipelineType kernel_specialization_level = PSO_GENERIC; + std::atomic_bool async_compile_and_load = false; virtual BVHLayoutMask get_bvh_layout_mask() const override; @@ -92,9 +92,7 @@ class MetalDevice : public Device { bool use_adaptive_compilation(); - string get_source(const uint kernel_features); - - string compile_kernel(const uint kernel_features, const char *name); + void make_source(MetalPipelineType pso_type, const uint kernel_features); virtual bool load_kernels(const uint kernel_features) override; @@ -112,7 +110,9 @@ class MetalDevice : public Device { virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override; - id<MTLLibrary> compile(string const &source); + virtual void optimize_for_scene(Scene *scene) override; + + bool compile_and_load(MetalPipelineType pso_type); /* ------------------------------------------------------------------ */ /* low-level memory management */ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 086bf0af979..d1250b83d22 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -6,9 +6,12 @@ # include "device/metal/device_impl.h" # include "device/metal/device.h" +# include "scene/scene.h" + # include "util/debug.h" # include "util/md5.h" # include "util/path.h" +# include "util/time.h" CCL_NAMESPACE_BEGIN @@ -35,7 +38,7 @@ void MetalDevice::set_error(const string &error) } MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) - : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) + : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL) { mtlDevId = info.num; @@ -43,10 +46,9 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile auto usable_devices = MetalInfo::get_usable_devices(); assert(mtlDevId < usable_devices.size()); mtlDevice = usable_devices[mtlDevId]; - device_name = [mtlDevice.name UTF8String]; - device_vendor = MetalInfo::get_vendor_from_device_name(device_name); + device_vendor = MetalInfo::get_device_vendor(mtlDevice); assert(device_vendor != METAL_GPU_UNKNOWN); - metal_printf("Creating new Cycles device for Metal: %s\n", 
device_name.c_str()); + metal_printf("Creating new Cycles device for Metal: %s\n", info.description.c_str()); /* determine default storage mode based on whether UMA is supported */ @@ -78,6 +80,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile case METAL_GPU_APPLE: { max_threads_per_threadgroup = 512; use_metalrt = info.use_metalrt; + + /* Specialize the intersection kernels on Apple GPUs by default as these can be built very + * quickly. */ + kernel_specialization_level = PSO_SPECIALIZED_INTERSECT; break; } } @@ -90,6 +96,13 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile capture_enabled = true; } + if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) { + kernel_specialization_level = (MetalPipelineType)atoi(envstr); + } + metal_printf("kernel_specialization_level = %s\n", + kernel_type_as_string( + (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1))); + MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init]; arg_desc_params.dataType = MTLDataTypePointer; arg_desc_params.access = MTLArgumentAccessReadOnly; @@ -209,61 +222,86 @@ bool MetalDevice::use_adaptive_compilation() return DebugFlags().metal.adaptive_compile; } -string MetalDevice::get_source(const uint kernel_features) +void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features) { - string build_options; - + string global_defines; if (use_adaptive_compilation()) { - build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n"; } if (use_metalrt) { - build_options += "-D__METALRT__ "; + global_defines += "#define __METALRT__\n"; if (motion_blur) { - build_options += "-D__METALRT_MOTION__ "; + global_defines += "#define __METALRT_MOTION__\n"; } } # ifdef WITH_CYCLES_DEBUG - build_options += "-D__KERNEL_DEBUG__ "; + global_defines += "#define __KERNEL_DEBUG__\n"; # endif switch (device_vendor) { default: break; case METAL_GPU_INTEL: - build_options += "-D__KERNEL_METAL_INTEL__ "; + global_defines += "#define __KERNEL_METAL_INTEL__\n"; break; case METAL_GPU_AMD: - build_options += "-D__KERNEL_METAL_AMD__ "; + global_defines += "#define __KERNEL_METAL_AMD__\n"; break; case METAL_GPU_APPLE: - build_options += "-D__KERNEL_METAL_APPLE__ "; + global_defines += "#define __KERNEL_METAL_APPLE__\n"; break; } - /* reformat -D defines list into compilable form */ - vector<string> components; - string_replace(build_options, "-D", ""); - string_split(components, build_options, " "); + string &source = this->source[pso_type]; + source = "\n#include \"kernel/device/metal/kernel.metal\"\n"; + source = path_source_replace_includes(source, path_get("source")); - string globalDefines; - for (const string &component : components) { - vector<string> assignments; - string_split(assignments, component, "="); - if (assignments.size() == 2) - globalDefines += string_printf( - "#define %s %s\n", assignments[0].c_str(), assignments[1].c_str()); - else - globalDefines += string_printf("#define %s\n", assignments[0].c_str()); + /* Perform any required specialization on the source. + * With Metal function constants we can generate a single variant of the kernel source which can + * be repeatedly respecialized. + */ + string baked_constants; + + /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of + * the same character length. 
Build a string of all active constant values which is then hashed + * in order to identify the PSO. + */ + if (pso_type != PSO_GENERIC) { + const double starttime = time_dt(); + +# define KERNEL_STRUCT_BEGIN(name, parent) \ + string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_"); + + /* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */ +# define KERNEL_STRUCT_MEMBER(parent, _type, name) \ + baked_constants += string(#parent "." #name "=") + \ + to_string(_type(launch_params.data.parent.name)) + "\n"; + +# include "kernel/data_template.h" + + /* Opt in to all of available specializations. This can be made more granular for the + * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests, + * but the overhead should be negligible as these are very quick to (re)build and aren't + * serialized to disk via MTLBinaryArchives. + */ + global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n"; + + metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0); } - string source = globalDefines + "\n#include \"kernel/device/metal/kernel.metal\"\n"; - source = path_source_replace_includes(source, path_get("source")); - - metal_printf("Global defines:\n%s\n", globalDefines.c_str()); + source = global_defines + source; + metal_printf("================\n%s================\n\%s================\n", + global_defines.c_str(), + baked_constants.c_str()); - return source; + /* Generate an MD5 from the source and include any baked constants. This is used when caching + * PSOs. */ + MD5Hash md5; + md5.append(baked_constants); + md5.append(source); + source_md5[pso_type] = md5.get_hex(); } bool MetalDevice::load_kernels(const uint _kernel_features) @@ -279,24 +317,22 @@ bool MetalDevice::load_kernels(const uint _kernel_features) * active, but may still need to be rendered without motion blur if that isn't active as well. */ motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION; - source[PSO_GENERIC] = get_source(kernel_features); - mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]); - - MD5Hash md5; - md5.append(source[PSO_GENERIC]); - source_md5[PSO_GENERIC] = md5.get_hex(); - - metal_printf("Front-end compilation finished (generic)\n"); - - bool result = MetalDeviceKernels::load(this, false); + bool result = compile_and_load(PSO_GENERIC); reserve_local_memory(kernel_features); - return result; } -id<MTLLibrary> MetalDevice::compile(string const &source) +bool MetalDevice::compile_and_load(MetalPipelineType pso_type) { + make_source(pso_type, kernel_features); + + if (!MetalDeviceKernels::should_load_kernels(this, pso_type)) { + /* We already have a full set of matching pipelines which are cached or queued. 
*/ + metal_printf("%s kernels already requested\n", kernel_type_as_string(pso_type)); + return true; + } + MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; options.fastMathEnabled = YES; @@ -304,19 +340,30 @@ id<MTLLibrary> MetalDevice::compile(string const &source) options.languageVersion = MTLLanguageVersion2_4; } + if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) { + path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))), + source[pso_type]); + } + + const double starttime = time_dt(); + NSError *error = NULL; - id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str()) - options:options - error:&error]; + mtlLibrary[pso_type] = [mtlDevice newLibraryWithSource:@(source[pso_type].c_str()) + options:options + error:&error]; - if (!mtlLibrary) { + if (!mtlLibrary[pso_type]) { NSString *err = [error localizedDescription]; set_error(string_printf("Failed to compile library:\n%s", [err UTF8String])); } + metal_printf("Front-end compilation finished in %.1f seconds (%s)\n", + time_dt() - starttime, + kernel_type_as_string(pso_type)); + [options release]; - return mtlLibrary; + return MetalDeviceKernels::load(this, pso_type); } void MetalDevice::reserve_local_memory(const uint kernel_features) @@ -411,9 +458,9 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem) } if (mem.name) { - VLOG(2) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; + VLOG_WORK << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; } mem.device_size = metal_buffer.allocatedSize; @@ -623,11 +670,63 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz return 0; } +void MetalDevice::optimize_for_scene(Scene *scene) +{ + MetalPipelineType specialization_level = kernel_specialization_level; + + if (specialization_level < PSO_SPECIALIZED_INTERSECT) { + return; + } + + /* PSO_SPECIALIZED_INTERSECT kernels are fast to specialize, so we always load them + * synchronously. */ + compile_and_load(PSO_SPECIALIZED_INTERSECT); + + if (specialization_level < PSO_SPECIALIZED_SHADE) { + return; + } + if (!scene->params.background) { + /* Don't load PSO_SPECIALIZED_SHADE kernels during viewport rendering as they are slower to + * build. */ + return; + } + + /* PSO_SPECIALIZED_SHADE kernels are slower to specialize, so we load them asynchronously, and + * only if there isn't an existing load in flight. + */ + auto specialize_shade_fn = ^() { + compile_and_load(PSO_SPECIALIZED_SHADE); + async_compile_and_load = false; + }; + + bool async_specialize_shade = true; + + /* Block if a per-kernel profiling is enabled (ensure steady rendering rate). 
*/ + if (getenv("CYCLES_METAL_PROFILING") != nullptr) { + async_specialize_shade = false; + } + + if (async_specialize_shade) { + if (!async_compile_and_load) { + async_compile_and_load = true; + dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), + specialize_shade_fn); + } + else { + metal_printf( + "Async PSO_SPECIALIZED_SHADE load request already in progress - dropping request\n"); + } + } + else { + specialize_shade_fn(); + } +} + void MetalDevice::const_copy_to(const char *name, void *host, size_t size) { - if (strcmp(name, "__data") == 0) { + if (strcmp(name, "data") == 0) { assert(size == sizeof(KernelData)); - memcpy((uint8_t *)&launch_params + offsetof(KernelParamsMetal, data), host, size); + memcpy((uint8_t *)&launch_params.data, host, sizeof(KernelData)); return; } @@ -646,19 +745,19 @@ void MetalDevice::const_copy_to(const char *name, void *host, size_t size) }; /* Update data storage pointers in launch parameters. */ - if (strcmp(name, "__integrator_state") == 0) { + if (strcmp(name, "integrator_state") == 0) { /* IntegratorStateGPU is contiguous pointers */ - const size_t pointer_block_size = sizeof(IntegratorStateGPU); + const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor); update_launch_pointers( - offsetof(KernelParamsMetal, __integrator_state), host, size, pointer_block_size); + offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size); } -# define KERNEL_TEX(data_type, tex_name) \ +# define KERNEL_DATA_ARRAY(data_type, tex_name) \ else if (strcmp(name, #tex_name) == 0) \ { \ update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \ } -# include "kernel/textures.h" -# undef KERNEL_TEX +# include "kernel/data_arrays.h" +# undef KERNEL_DATA_ARRAY } void MetalDevice::global_alloc(device_memory &mem) @@ -800,9 +899,9 @@ void MetalDevice::tex_alloc(device_texture &mem) desc.textureType = MTLTextureType3D; desc.depth = mem.data_depth; - VLOG(2) << "Texture 3D allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; + VLOG_WORK << "Texture 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; mtlTexture = [mtlDevice newTextureWithDescriptor:desc]; assert(mtlTexture); @@ -834,9 +933,9 @@ void MetalDevice::tex_alloc(device_texture &mem) desc.storageMode = storage_mode; desc.usage = MTLTextureUsageShaderRead; - VLOG(2) << "Texture 2D allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; + VLOG_WORK << "Texture 2D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; mtlTexture = [mtlDevice newTextureWithDescriptor:desc]; assert(mtlTexture); diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h index 69b2a686ecc..11393f8b7e1 100644 --- a/intern/cycles/device/metal/kernel.h +++ b/intern/cycles/device/metal/kernel.h @@ -31,7 +31,7 @@ enum { enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM }; /* Pipeline State Object types */ -enum { +enum MetalPipelineType { /* A kernel that can be used with all scenes, supporting all features. 
* It is slow to compile, but only needs to be compiled once and is then * cached for future render sessions. This allows a render to get underway @@ -39,28 +39,33 @@ enum { */ PSO_GENERIC, - /* A kernel that is relatively quick to compile, but is specialized for the - * scene being rendered. It only contains the functionality and even baked in - * constants for values that means it needs to be recompiled whenever a - * dependent setting is changed. The render performance of this kernel is - * significantly faster though, and justifies the extra compile time. + /* A intersection kernel that is very quick to specialize and results in faster intersection + * kernel performance. It uses Metal function constants to replace several KernelData variables + * with fixed constants. + */ + PSO_SPECIALIZED_INTERSECT, + + /* A shading kernel that is slow to specialize, but results in faster shading kernel performance + * rendered. It uses Metal function constants to replace several KernelData variables with fixed + * constants and short-circuit all unused SVM node case handlers. */ - /* METAL_WIP: This isn't used and will require more changes to enable. */ - PSO_SPECIALISED, + PSO_SPECIALIZED_SHADE, PSO_NUM }; -const char *kernel_type_as_string(int kernel_type); +const char *kernel_type_as_string(MetalPipelineType pso_type); struct MetalKernelPipeline { void compile(); id<MTLLibrary> mtlLibrary = nil; - bool scene_specialized; + MetalPipelineType pso_type; string source_md5; + size_t usage_count = 0; + KernelData kernel_data_; bool use_metalrt; bool metalrt_hair; bool metalrt_hair_thick; @@ -75,6 +80,8 @@ struct MetalKernelPipeline { id<MTLComputePipelineState> pipeline = nil; int num_threads_per_block = 0; + bool should_use_binary_archive() const; + string error_str; API_AVAILABLE(macos(11.0)) @@ -85,7 +92,8 @@ struct MetalKernelPipeline { /* Cache of Metal kernels for each DeviceKernel. */ namespace MetalDeviceKernels { -bool load(MetalDevice *device, bool scene_specialized); +bool should_load_kernels(MetalDevice *device, MetalPipelineType pso_type); +bool load(MetalDevice *device, MetalPipelineType pso_type); const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel); } /* namespace MetalDeviceKernels */ diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index fec4cd80466..385cb412b06 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -5,6 +5,7 @@ # include "device/metal/kernel.h" # include "device/metal/device_impl.h" +# include "kernel/device/metal/function_constants.h" # include "util/md5.h" # include "util/path.h" # include "util/tbb.h" @@ -16,13 +17,15 @@ CCL_NAMESPACE_BEGIN /* limit to 2 MTLCompiler instances */ int max_mtlcompiler_threads = 2; -const char *kernel_type_as_string(int kernel_type) +const char *kernel_type_as_string(MetalPipelineType pso_type) { - switch (kernel_type) { + switch (pso_type) { case PSO_GENERIC: return "PSO_GENERIC"; - case PSO_SPECIALISED: - return "PSO_SPECIALISED"; + case PSO_SPECIALIZED_INTERSECT: + return "PSO_SPECIALIZED_INTERSECT"; + case PSO_SPECIALIZED_SHADE: + return "PSO_SPECIALIZED_SHADE"; default: assert(0); } @@ -50,7 +53,11 @@ struct ShaderCache { /* Non-blocking request for a kernel, optionally specialized to the scene being rendered by * device. 
*/ - void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized); + void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type); + + bool should_load_kernel(DeviceKernel device_kernel, + MetalDevice *device, + MetalPipelineType pso_type); void wait_for_all(); @@ -139,31 +146,34 @@ void ShaderCache::compile_thread_func(int thread_index) } } -void ShaderCache::load_kernel(DeviceKernel device_kernel, - MetalDevice *device, - bool scene_specialized) +bool ShaderCache::should_load_kernel(DeviceKernel device_kernel, + MetalDevice *device, + MetalPipelineType pso_type) { - { - /* create compiler threads on first run */ - thread_scoped_lock lock(cache_mutex); - if (compile_threads.empty()) { - running = true; - for (int i = 0; i < max_mtlcompiler_threads; i++) { - compile_threads.push_back(std::thread([&] { compile_thread_func(i); })); - } - } + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + /* Skip megakernel. */ + return false; } - if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { - /* skip megakernel */ - return; + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + if ((device->kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) { + /* Skip shade_surface_raytrace kernel if the scene doesn't require it. */ + return false; + } } - if (scene_specialized) { + if (pso_type != PSO_GENERIC) { /* Only specialize kernels where it can make an impact. */ if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { - return; + return false; + } + + /* Only specialize shading / intersection kernels as requested. */ + bool is_shade_kernel = (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE); + if (is_shade_pso != is_shade_kernel) { + return false; } } @@ -171,35 +181,45 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel, /* check whether the kernel has already been requested / cached */ thread_scoped_lock lock(cache_mutex); for (auto &pipeline : pipelines[device_kernel]) { - if (scene_specialized) { - if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) { - /* we already requested a pipeline that is specialized for this kernel data */ - metal_printf("Specialized kernel already requested (%s)\n", - device_kernel_as_string(device_kernel)); - return; - } + if (pipeline->source_md5 == device->source_md5[pso_type]) { + return false; } - else { - if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) { - /* we already requested a generic pipeline for this kernel */ - metal_printf("Generic kernel already requested (%s)\n", - device_kernel_as_string(device_kernel)); - return; - } + } + } + + return true; +} + +void ShaderCache::load_kernel(DeviceKernel device_kernel, + MetalDevice *device, + MetalPipelineType pso_type) +{ + { + /* create compiler threads on first run */ + thread_scoped_lock lock(cache_mutex); + if (compile_threads.empty()) { + running = true; + for (int i = 0; i < max_mtlcompiler_threads; i++) { + compile_threads.push_back(std::thread([&] { compile_thread_func(i); })); } } } + if (!should_load_kernel(device_kernel, device, pso_type)) { + return; + } + incomplete_requests++; PipelineRequest request; request.pipeline = new MetalKernelPipeline; - request.pipeline->scene_specialized = scene_specialized; + memcpy(&request.pipeline->kernel_data_, + &device->launch_params.data, + sizeof(request.pipeline->kernel_data_)); + request.pipeline->pso_type = 
pso_type; request.pipeline->mtlDevice = mtlDevice; - request.pipeline->source_md5 = - device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC]; - request.pipeline->mtlLibrary = - device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC]; + request.pipeline->source_md5 = device->source_md5[pso_type]; + request.pipeline->mtlLibrary = device->mtlLibrary[pso_type]; request.pipeline->device_kernel = device_kernel; request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup; @@ -214,7 +234,24 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel, { thread_scoped_lock lock(cache_mutex); - pipelines[device_kernel].push_back(unique_ptr<MetalKernelPipeline>(request.pipeline)); + auto &collection = pipelines[device_kernel]; + + /* Cache up to 3 kernel variants with the same pso_type, purging oldest first. */ + int max_entries_of_same_pso_type = 3; + for (int i = (int)collection.size() - 1; i >= 0; i--) { + if (collection[i]->pso_type == pso_type) { + max_entries_of_same_pso_type -= 1; + if (max_entries_of_same_pso_type == 0) { + metal_printf("Purging oldest %s:%s kernel from ShaderCache\n", + kernel_type_as_string(pso_type), + device_kernel_as_string(device_kernel)); + collection.erase(collection.begin() + i); + break; + } + } + } + + collection.push_back(unique_ptr<MetalKernelPipeline>(request.pipeline)); request_queue.push_back(request); } cond_var.notify_one(); @@ -248,8 +285,9 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M continue; } - if (pipeline->scene_specialized) { - if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) { + if (pipeline->pso_type != PSO_GENERIC) { + if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] || + pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) { best_pipeline = pipeline.get(); } } @@ -258,13 +296,65 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M } } + if (best_pipeline->usage_count == 0 && best_pipeline->pso_type != PSO_GENERIC) { + metal_printf("Swapping in %s version of %s\n", + kernel_type_as_string(best_pipeline->pso_type), + device_kernel_as_string(kernel)); + } + best_pipeline->usage_count += 1; + return best_pipeline; } -void MetalKernelPipeline::compile() +bool MetalKernelPipeline::should_use_binary_archive() const { - int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC; + if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { + if (atoi(str) != 0) { + /* Don't archive if we have opted out by env var. */ + return false; + } + } + + if (pso_type == PSO_GENERIC) { + /* Archive the generic kernels. */ + return true; + } + + if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND && + device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) { + /* Archive all shade kernels - they take a long time to compile. */ + return true; + } + + /* The remaining kernels are all fast to compile. They may get cached by the system shader cache, + * but will be quick to regenerate if not. 
*/ + return false; +} + +static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nullptr) +{ + MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new]; + + MTLDataType MTLDataType_int = MTLDataTypeInt; + MTLDataType MTLDataType_float = MTLDataTypeFloat; + MTLDataType MTLDataType_float4 = MTLDataTypeFloat4; + KernelData zero_data = {0}; + if (!data) { + data = &zero_data; + } +# define KERNEL_STRUCT_MEMBER(parent, _type, name) \ + [constant_values setConstantValue:&data->parent.name \ + type:MTLDataType_##_type \ + atIndex:KernelData_##parent##_##name]; + +# include "kernel/data_template.h" + + return constant_values; +} + +void MetalKernelPipeline::compile() +{ const std::string function_name = std::string("cycles_metal_") + device_kernel_as_string(device_kernel); @@ -281,6 +371,17 @@ void MetalKernelPipeline::compile() if (@available(macOS 11.0, *)) { MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor]; func_desc.name = entryPoint; + + if (pso_type == PSO_SPECIALIZED_SHADE) { + func_desc.constantValues = GetConstantValues(&kernel_data_); + } + else if (pso_type == PSO_SPECIALIZED_INTERSECT) { + func_desc.constantValues = GetConstantValues(&kernel_data_); + } + else { + func_desc.constantValues = GetConstantValues(); + } + function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error]; } @@ -427,10 +528,7 @@ void MetalKernelPipeline::compile() MTLPipelineOption pipelineOptions = MTLPipelineOptionNone; - bool use_binary_archive = true; - if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { - use_binary_archive = (atoi(str) == 0); - } + bool use_binary_archive = should_use_binary_archive(); id<MTLBinaryArchive> archive = nil; string metalbin_path; @@ -608,19 +706,32 @@ void MetalKernelPipeline::compile() } } -bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized) +bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type) { + const double starttime = time_dt(); auto shader_cache = get_shader_cache(device->mtlDevice); for (int i = 0; i < DEVICE_KERNEL_NUM; i++) { - shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized); + shader_cache->load_kernel((DeviceKernel)i, device, pso_type); } - if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) { - shader_cache->wait_for_all(); - } + shader_cache->wait_for_all(); + metal_printf("Back-end compilation finished in %.1f seconds (%s)\n", + time_dt() - starttime, + kernel_type_as_string(pso_type)); return true; } +bool MetalDeviceKernels::should_load_kernels(MetalDevice *device, MetalPipelineType pso_type) +{ + auto shader_cache = get_shader_cache(device->mtlDevice); + for (int i = 0; i < DEVICE_KERNEL_NUM; i++) { + if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) { + return true; + } + } + return false; +} + const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device, DeviceKernel kernel) { diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index b0bd487c86d..fc32740f3e1 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -24,6 +24,7 @@ class MetalDeviceQueue : public DeviceQueue { virtual int num_concurrent_states(const size_t) const override; virtual int num_concurrent_busy_states() const override; + virtual int num_sort_partition_elements() const override; virtual void init_execution() override; diff --git a/intern/cycles/device/metal/queue.mm 
b/intern/cycles/device/metal/queue.mm index 0e260886abb..5ac63a16c61 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -293,6 +293,11 @@ int MetalDeviceQueue::num_concurrent_busy_states() const return result; } +int MetalDeviceQueue::num_sort_partition_elements() const +{ + return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice); +} + void MetalDeviceQueue::init_execution() { /* Synchronize all textures and memory copies before executing task. */ @@ -311,8 +316,8 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, return false; } - VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size " - << work_size; + VLOG_DEVICE_STATS << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size " + << work_size; id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel); @@ -358,8 +363,8 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, /* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */ /* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */ - size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, __integrator_state) + - sizeof(IntegratorStateGPU); + size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) + + offsetof(IntegratorStateGPU, sort_partition_divisor); size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset; memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset, (uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset, @@ -415,8 +420,8 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, } /* this relies on IntegratorStateGPU layout being contiguous device_ptrs */ - const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) + - sizeof(IntegratorStateGPU); + const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) + + offsetof(IntegratorStateGPU, sort_partition_divisor); for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) { int pointer_index = int(offset / sizeof(device_ptr)); MetalDevice::MetalMem *mmem = *( @@ -550,7 +555,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, /* Enhanced command buffer errors are only available in 11.0+ */ if (@available(macos 11.0, *)) { if (command_buffer.status == MTLCommandBufferStatusError && command_buffer.error != nil) { - printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]); + metal_device_->set_error(string("CommandBuffer Failed: ") + [kernel_name UTF8String]); NSArray<id<MTLCommandBufferEncoderInfo>> *encoderInfos = [command_buffer.error.userInfo valueForKey:MTLCommandBufferEncoderInfoErrorKey]; if (encoderInfos != nil) { @@ -564,7 +569,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, } } else if (command_buffer.error) { - printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]); + metal_device_->set_error(string("CommandBuffer Failed: ") + [kernel_name UTF8String]); } } }]; diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h index f728967835d..a988d01d361 100644 --- a/intern/cycles/device/metal/util.h +++ b/intern/cycles/device/metal/util.h @@ -25,10 +25,20 @@ enum MetalGPUVendor { METAL_GPU_INTEL = 3, }; +enum AppleGPUArchitecture { + APPLE_UNKNOWN, + APPLE_M1, + APPLE_M2, +}; + /* Contains static Metal helper functions. 
*/ struct MetalInfo { static vector<id<MTLDevice>> const &get_usable_devices(); - static MetalGPUVendor get_vendor_from_device_name(string const &device_name); + static int get_apple_gpu_core_count(id<MTLDevice> device); + static MetalGPUVendor get_device_vendor(id<MTLDevice> device); + static AppleGPUArchitecture get_apple_gpu_architecture(id<MTLDevice> device); + static int optimal_sort_partition_elements(id<MTLDevice> device); + static string get_device_name(id<MTLDevice> device); }; /* Pool of MTLBuffers whose lifetime is linked to a single MTLCommandBuffer */ diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm index a6bd593bcb6..65c67c400fe 100644 --- a/intern/cycles/device/metal/util.mm +++ b/intern/cycles/device/metal/util.mm @@ -10,26 +10,83 @@ # include "util/string.h" # include "util/time.h" +# include <IOKit/IOKitLib.h> # include <pwd.h> # include <sys/shm.h> # include <time.h> CCL_NAMESPACE_BEGIN -MetalGPUVendor MetalInfo::get_vendor_from_device_name(string const &device_name) +string MetalInfo::get_device_name(id<MTLDevice> device) { - if (device_name.find("Intel") != string::npos) { + string device_name = [device.name UTF8String]; + if (get_device_vendor(device) == METAL_GPU_APPLE) { + /* Append the GPU core count so we can distinguish between GPU variants in benchmarks. */ + int gpu_core_count = get_apple_gpu_core_count(device); + device_name += string_printf(gpu_core_count ? " (GPU - %d cores)" : " (GPU)", gpu_core_count); + } + return device_name; +} + +int MetalInfo::get_apple_gpu_core_count(id<MTLDevice> device) +{ + int core_count = 0; + if (@available(macos 12.0, *)) { + io_service_t gpu_service = IOServiceGetMatchingService( + kIOMainPortDefault, IORegistryEntryIDMatching(device.registryID)); + if (CFNumberRef numberRef = (CFNumberRef)IORegistryEntryCreateCFProperty( + gpu_service, CFSTR("gpu-core-count"), 0, 0)) { + if (CFGetTypeID(numberRef) == CFNumberGetTypeID()) { + CFNumberGetValue(numberRef, kCFNumberSInt32Type, &core_count); + } + CFRelease(numberRef); + } + } + return core_count; +} + +AppleGPUArchitecture MetalInfo::get_apple_gpu_architecture(id<MTLDevice> device) +{ + const char *device_name = [device.name UTF8String]; + if (strstr(device_name, "M1")) { + return APPLE_M1; + } + else if (strstr(device_name, "M2")) { + return APPLE_M2; + } + return APPLE_UNKNOWN; +} + +MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device) +{ + const char *device_name = [device.name UTF8String]; + if (strstr(device_name, "Intel")) { return METAL_GPU_INTEL; } - else if (device_name.find("AMD") != string::npos) { + else if (strstr(device_name, "AMD")) { return METAL_GPU_AMD; } - else if (device_name.find("Apple") != string::npos) { + else if (strstr(device_name, "Apple")) { return METAL_GPU_APPLE; } return METAL_GPU_UNKNOWN; } +int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device) +{ + if (auto str = getenv("CYCLES_METAL_SORT_PARTITION_ELEMENTS")) { + return atoi(str); + } + + /* On M1 and M2 GPUs, we see better cache utilization if we partition the active indices before + * sorting each partition by material. Partitioning into chunks of 65536 elements results in an + * overall render time speedup of up to 15%. 
*/ + if (get_device_vendor(device) == METAL_GPU_APPLE) { + return 65536; + } + return 0; +} + vector<id<MTLDevice>> const &MetalInfo::get_usable_devices() { static vector<id<MTLDevice>> usable_devices; @@ -41,9 +98,8 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices() metal_printf("Usable Metal devices:\n"); for (id<MTLDevice> device in MTLCopyAllDevices()) { - const char *device_name = [device.name UTF8String]; - - MetalGPUVendor vendor = get_vendor_from_device_name(device_name); + string device_name = get_device_name(device); + MetalGPUVendor vendor = get_device_vendor(device); bool usable = false; if (@available(macos 12.2, *)) { @@ -55,12 +111,12 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices() } if (usable) { - metal_printf("- %s\n", device_name); + metal_printf("- %s\n", device_name.c_str()); [device retain]; usable_devices.push_back(device); } else { - metal_printf(" (skipping \"%s\")\n", device_name); + metal_printf(" (skipping \"%s\")\n", device_name.c_str()); } } if (usable_devices.empty()) { diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp new file mode 100644 index 00000000000..8056c204188 --- /dev/null +++ b/intern/cycles/device/oneapi/device.cpp @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#include "device/oneapi/device.h" + +#include "util/log.h" + +#ifdef WITH_ONEAPI +# include "device/device.h" +# include "device/oneapi/device_impl.h" + +# include "util/path.h" +# include "util/string.h" + +# ifdef __linux__ +# include <dlfcn.h> +# endif +#endif /* WITH_ONEAPI */ + +CCL_NAMESPACE_BEGIN + +#ifdef WITH_ONEAPI +static OneAPIDLLInterface oneapi_dll; +#endif + +#ifdef _WIN32 +# define LOAD_ONEAPI_SHARED_LIBRARY(path) (void *)(LoadLibrary(path)) +# define FREE_SHARED_LIBRARY(handle) FreeLibrary((HMODULE)handle) +# define GET_SHARED_LIBRARY_SYMBOL(handle, name) GetProcAddress((HMODULE)handle, name) +#elif __linux__ +# define LOAD_ONEAPI_SHARED_LIBRARY(path) dlopen(path, RTLD_NOW) +# define FREE_SHARED_LIBRARY(handle) dlclose(handle) +# define GET_SHARED_LIBRARY_SYMBOL(handle, name) dlsym(handle, name) +#endif + +bool device_oneapi_init() +{ +#if !defined(WITH_ONEAPI) + return false; +#else + + string lib_path = path_get("lib"); +# ifdef _WIN32 + lib_path = path_join(lib_path, "cycles_kernel_oneapi.dll"); +# else + lib_path = path_join(lib_path, "cycles_kernel_oneapi.so"); +# endif + void *lib_handle = LOAD_ONEAPI_SHARED_LIBRARY(lib_path.c_str()); + + /* This shouldn't happen, but it still makes sense to have a branch for this. */ + if (lib_handle == NULL) { + LOG(ERROR) << "oneAPI kernel shared library cannot be loaded for some reason. This should not " + "happen, however, it occurs hence oneAPI rendering will be disabled"; + return false; + } + +# define DLL_INTERFACE_CALL(function, return_type, ...) 
\ + (oneapi_dll.function) = reinterpret_cast<decltype(oneapi_dll.function)>( \ + GET_SHARED_LIBRARY_SYMBOL(lib_handle, #function)); \ + if (oneapi_dll.function == NULL) { \ + LOG(ERROR) << "oneAPI shared library function \"" << #function \ + << "\" has not been loaded from kernel shared - disable oneAPI " \ + "library disable oneAPI implementation due to this"; \ + FREE_SHARED_LIBRARY(lib_handle); \ + return false; \ + } +# include "kernel/device/oneapi/dll_interface_template.h" +# undef DLL_INTERFACE_CALL + + VLOG_INFO << "oneAPI kernel shared library has been loaded successfully"; + + /* We need to have this oneapi kernel shared library during all life-span of the Blender. + * So it is not unloaded because of this. + * FREE_SHARED_LIBRARY(lib_handle); */ + + /* NOTE(@nsirgien): we need to enable JIT cache from here and + * right now this cache policy is controlled by env. variables. */ + /* NOTE(hallade) we also disable use of copy engine as it + * improves stability as of intel/LLVM SYCL-nightly/20220529. + * All these env variable can be set beforehand by end-users and + * will in that case -not- be overwritten. */ +# ifdef _WIN32 + if (getenv("SYCL_CACHE_PERSISTENT") == nullptr) { + _putenv_s("SYCL_CACHE_PERSISTENT", "1"); + } + if (getenv("SYCL_CACHE_TRESHOLD") == nullptr) { + _putenv_s("SYCL_CACHE_THRESHOLD", "0"); + } + if (getenv("SYCL_DEVICE_FILTER") == nullptr) { + _putenv_s("SYCL_DEVICE_FILTER", "host,level_zero"); + } + if (getenv("SYCL_ENABLE_PCI") == nullptr) { + _putenv_s("SYCL_ENABLE_PCI", "1"); + } + if (getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE") == nullptr) { + _putenv_s("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0"); + } +# elif __linux__ + setenv("SYCL_CACHE_PERSISTENT", "1", false); + setenv("SYCL_CACHE_THRESHOLD", "0", false); + setenv("SYCL_DEVICE_FILTER", "host,level_zero", false); + setenv("SYCL_ENABLE_PCI", "1", false); + setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false); +# endif + + return true; +#endif +} + +#if defined(_WIN32) || defined(__linux__) +# undef LOAD_SYCL_SHARED_LIBRARY +# undef LOAD_ONEAPI_SHARED_LIBRARY +# undef FREE_SHARED_LIBRARY +# undef GET_SHARED_LIBRARY_SYMBOL +#endif + +Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ +#ifdef WITH_ONEAPI + return new OneapiDevice(info, oneapi_dll, stats, profiler); +#else + (void)info; + (void)stats; + (void)profiler; + + LOG(FATAL) << "Requested to create oneAPI device while not enabled for this build."; + + return nullptr; +#endif +} + +#ifdef WITH_ONEAPI +static void device_iterator_cb(const char *id, const char *name, int num, void *user_ptr) +{ + vector<DeviceInfo> *devices = (vector<DeviceInfo> *)user_ptr; + + DeviceInfo info; + + info.type = DEVICE_ONEAPI; + info.description = name; + info.num = num; + + /* NOTE(@nsirgien): Should be unique at least on proper oneapi installation. */ + info.id = id; + + info.has_nanovdb = true; + info.denoisers = 0; + + info.has_gpu_queue = true; + + /* NOTE(@nsirgien): oneAPI right now is focused on one device usage. In future it maybe will + * change, but right now peer access from one device to another device is not supported. */ + info.has_peer_memory = false; + + /* NOTE(@nsirgien): Seems not possible to know from SYCL/oneAPI or Level0. 
*/ + info.display_device = false; + + devices->push_back(info); + VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\"."; +} +#endif + +void device_oneapi_info(vector<DeviceInfo> &devices) +{ +#ifdef WITH_ONEAPI + (oneapi_dll.oneapi_iterate_devices)(device_iterator_cb, &devices); +#else /* WITH_ONEAPI */ + (void)devices; +#endif /* WITH_ONEAPI */ +} + +string device_oneapi_capabilities() +{ + string capabilities; +#ifdef WITH_ONEAPI + char *c_capabilities = (oneapi_dll.oneapi_device_capabilities)(); + if (c_capabilities) { + capabilities = c_capabilities; + (oneapi_dll.oneapi_free)(c_capabilities); + } +#endif + return capabilities; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/oneapi/device.h b/intern/cycles/device/oneapi/device.h new file mode 100644 index 00000000000..db8c985d4d5 --- /dev/null +++ b/intern/cycles/device/oneapi/device.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#pragma once + +#include "util/string.h" +#include "util/vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_oneapi_init(); + +Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_oneapi_info(vector<DeviceInfo> &devices); + +string device_oneapi_capabilities(); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp new file mode 100644 index 00000000000..dd0622a5bd5 --- /dev/null +++ b/intern/cycles/device/oneapi/device_impl.cpp @@ -0,0 +1,446 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_ONEAPI + +# include "device/oneapi/device_impl.h" + +# include "util/debug.h" +# include "util/log.h" + +# include "kernel/device/oneapi/kernel.h" + +CCL_NAMESPACE_BEGIN + +static void queue_error_cb(const char *message, void *user_ptr) +{ + if (user_ptr) { + *reinterpret_cast<std::string *>(user_ptr) = message; + } +} + +OneapiDevice::OneapiDevice(const DeviceInfo &info, + OneAPIDLLInterface &oneapi_dll_object, + Stats &stats, + Profiler &profiler) + : Device(info, stats, profiler), + device_queue_(nullptr), + texture_info_(this, "texture_info", MEM_GLOBAL), + kg_memory_(nullptr), + kg_memory_device_(nullptr), + kg_memory_size_(0), + oneapi_dll_(oneapi_dll_object) +{ + need_texture_info_ = false; + + oneapi_dll_.oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_); + + /* OneAPI calls should be initialized on this moment. 
*/ + assert(oneapi_dll_.oneapi_create_queue != nullptr); + + bool is_finished_ok = oneapi_dll_.oneapi_create_queue(device_queue_, info.num); + if (is_finished_ok == false) { + set_error("oneAPI queue initialization error: got runtime exception \"" + + oneapi_error_string_ + "\""); + } + else { + VLOG_DEBUG << "oneAPI queue has been successfully created for the device \"" + << info.description << "\""; + assert(device_queue_); + } + + size_t globals_segment_size; + is_finished_ok = oneapi_dll_.oneapi_kernel_globals_size(device_queue_, globals_segment_size); + if (is_finished_ok == false) { + set_error("oneAPI constant memory initialization got runtime exception \"" + + oneapi_error_string_ + "\""); + } + else { + VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)"; + } + + kg_memory_ = oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, globals_segment_size, 16); + oneapi_dll_.oneapi_usm_memset(device_queue_, kg_memory_, 0, globals_segment_size); + + kg_memory_device_ = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, globals_segment_size); + + kg_memory_size_ = globals_segment_size; + + max_memory_on_device_ = oneapi_dll_.oneapi_get_memcapacity(device_queue_); +} + +OneapiDevice::~OneapiDevice() +{ + texture_info_.free(); + oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_); + oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_device_); + + for (ConstMemMap::iterator mt = const_mem_map_.begin(); mt != const_mem_map_.end(); mt++) + delete mt->second; + + if (device_queue_) + oneapi_dll_.oneapi_free_queue(device_queue_); +} + +bool OneapiDevice::check_peer_access(Device * /*peer_device*/) +{ + return false; +} + +BVHLayoutMask OneapiDevice::get_bvh_layout_mask() const +{ + return BVH_LAYOUT_BVH2; +} + +bool OneapiDevice::load_kernels(const uint requested_features) +{ + assert(device_queue_); + /* NOTE(@nsirgien): oneAPI can support compilation of kernel code with certain feature set + * with specialization constants, but it hasn't been implemented yet. */ + (void)requested_features; + + bool is_finished_ok = oneapi_dll_.oneapi_run_test_kernel(device_queue_); + if (is_finished_ok == false) { + set_error("oneAPI kernel load: got runtime exception \"" + oneapi_error_string_ + "\""); + } + else { + VLOG_INFO << "Runtime compilation done for \"" << info.description << "\""; + assert(device_queue_); + } + return is_finished_ok; +} + +void OneapiDevice::load_texture_info() +{ + if (need_texture_info_) { + need_texture_info_ = false; + texture_info_.copy_to_device(); + } +} + +void OneapiDevice::generic_alloc(device_memory &mem) +{ + size_t memory_size = mem.memory_size(); + + /* TODO(@nsirgien): In future, if scene doesn't fit into device memory, then + * we can use USM host memory. + * Because of the expected performance impact, implementation of this has had a low priority + * and is not implemented yet. */ + + assert(device_queue_); + /* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device + * and shared. For new project it maybe more beneficial to use USM shared memory, because it + * provides automatic migration mechanism in order to allow to use the same pointer on host and + * on device, without need to worry about explicit memory transfer operations. 
But for + * Blender/Cycles this type of memory is not very suitable in current application architecture, + * because Cycles already uses two different pointer for host activity and device activity, and + * also has to perform all needed memory transfer operations. So, USM device memory + * type has been used for oneAPI device in order to better fit in Cycles architecture. */ + void *device_pointer = nullptr; + if (mem.memory_size() + stats.mem_used < max_memory_on_device_) + device_pointer = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, memory_size); + if (device_pointer == nullptr) { + set_error("oneAPI kernel - device memory allocation error for " + + string_human_readable_size(mem.memory_size()) + + ", possibly caused by lack of available memory space on the device: " + + string_human_readable_size(stats.mem_used) + " of " + + string_human_readable_size(max_memory_on_device_) + " is already allocated"); + } + + mem.device_pointer = reinterpret_cast<ccl::device_ptr>(device_pointer); + mem.device_size = memory_size; + + stats.mem_alloc(memory_size); +} + +void OneapiDevice::generic_copy_to(device_memory &mem) +{ + if (!mem.device_pointer) { + return; + } + size_t memory_size = mem.memory_size(); + + /* Copy operation from host shouldn't be requested if there is no memory allocated on host. */ + assert(mem.host_pointer); + assert(device_queue_); + oneapi_dll_.oneapi_usm_memcpy( + device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size); +} + +/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */ +SyclQueue *OneapiDevice::sycl_queue() +{ + return device_queue_; +} + +string OneapiDevice::oneapi_error_message() +{ + return string(oneapi_error_string_); +} + +OneAPIDLLInterface OneapiDevice::oneapi_dll_object() +{ + return oneapi_dll_; +} + +void *OneapiDevice::kernel_globals_device_pointer() +{ + return kg_memory_device_; +} + +void OneapiDevice::generic_free(device_memory &mem) +{ + if (!mem.device_pointer) { + return; + } + + stats.mem_free(mem.device_size); + mem.device_size = 0; + + assert(device_queue_); + oneapi_dll_.oneapi_usm_free(device_queue_, (void *)mem.device_pointer); + mem.device_pointer = 0; +} + +void OneapiDevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_alloc: \"" << mem.name << "\", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + generic_alloc(mem); + } +} + +void OneapiDevice::mem_copy_to(device_memory &mem) +{ + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_copy_to: \"" << mem.name << "\", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) + mem_alloc(mem); + + generic_copy_to(mem); + } +} + +void OneapiDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) +{ + if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { + assert(!"mem_copy_from not supported for textures."); + } + else if (mem.host_pointer) { + const size_t size = (w > 0 || h > 0 || elem > 0) ? (elem * w * h) : mem.memory_size(); + const size_t offset = elem * y * w; + + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_copy_from: \"" << mem.name << "\" object of " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ") from offset " << offset + << " data " << size << " bytes"; + } + + assert(device_queue_); + + assert(size != 0); + if (mem.device_pointer) { + char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset; + char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset; + bool is_finished_ok = oneapi_dll_.oneapi_usm_memcpy( + device_queue_, shifted_host, shifted_device, size); + if (is_finished_ok == false) { + set_error("oneAPI memory operation error: got runtime exception \"" + + oneapi_error_string_ + "\""); + } + } + } +} + +void OneapiDevice::mem_zero(device_memory &mem) +{ + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_zero: \"" << mem.name << "\", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")\n"; + } + + if (!mem.device_pointer) { + mem_alloc(mem); + } + if (!mem.device_pointer) { + return; + } + + assert(device_queue_); + bool is_finished_ok = oneapi_dll_.oneapi_usm_memset( + device_queue_, (void *)mem.device_pointer, 0, mem.memory_size()); + if (is_finished_ok == false) { + set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ + + "\""); + } +} + +void OneapiDevice::mem_free(device_memory &mem) +{ + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_free: \"" << mem.name << "\", " + << string_human_readable_number(mem.device_size) << " bytes. (" + << string_human_readable_size(mem.device_size) << ")\n"; + } + + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else { + generic_free(mem); + } +} + +device_ptr OneapiDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) +{ + return reinterpret_cast<device_ptr>(reinterpret_cast<char *>(mem.device_pointer) + + mem.memory_elements_size(offset)); +} + +void OneapiDevice::const_copy_to(const char *name, void *host, size_t size) +{ + assert(name); + + VLOG_DEBUG << "OneapiDevice::const_copy_to \"" << name << "\" object " + << string_human_readable_number(size) << " bytes. 
(" + << string_human_readable_size(size) << ")"; + + ConstMemMap::iterator i = const_mem_map_.find(name); + device_vector<uchar> *data; + + if (i == const_mem_map_.end()) { + data = new device_vector<uchar>(this, name, MEM_READ_ONLY); + data->alloc(size); + const_mem_map_.insert(ConstMemMap::value_type(name, data)); + } + else { + data = i->second; + } + + assert(data->memory_size() <= size); + memcpy(data->data(), host, size); + data->copy_to_device(); + + oneapi_dll_.oneapi_set_global_memory( + device_queue_, kg_memory_, name, (void *)data->device_pointer); + + oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_); +} + +void OneapiDevice::global_alloc(device_memory &mem) +{ + assert(mem.name); + + size_t size = mem.memory_size(); + VLOG_DEBUG << "OneapiDevice::global_alloc \"" << mem.name << "\" object " + << string_human_readable_number(size) << " bytes. (" + << string_human_readable_size(size) << ")"; + + generic_alloc(mem); + generic_copy_to(mem); + + oneapi_dll_.oneapi_set_global_memory( + device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer); + + oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_); +} + +void OneapiDevice::global_free(device_memory &mem) +{ + if (mem.device_pointer) { + generic_free(mem); + } +} + +void OneapiDevice::tex_alloc(device_texture &mem) +{ + generic_alloc(mem); + generic_copy_to(mem); + + /* Resize if needed. Also, in case of resize - allocate in advance for future allocs. */ + const uint slot = mem.slot; + if (slot >= texture_info_.size()) { + texture_info_.resize(slot + 128); + } + + texture_info_[slot] = mem.info; + need_texture_info_ = true; + + texture_info_[slot].data = (uint64_t)mem.device_pointer; +} + +void OneapiDevice::tex_free(device_texture &mem) +{ + /* There is no texture memory in SYCL. */ + if (mem.device_pointer) { + generic_free(mem); + } +} + +unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create() +{ + return make_unique<OneapiDeviceQueue>(this); +} + +int OneapiDevice::get_num_multiprocessors() +{ + assert(device_queue_); + return oneapi_dll_.oneapi_get_num_multiprocessors(device_queue_); +} + +int OneapiDevice::get_max_num_threads_per_multiprocessor() +{ + assert(device_queue_); + return oneapi_dll_.oneapi_get_max_num_threads_per_multiprocessor(device_queue_); +} + +bool OneapiDevice::should_use_graphics_interop() +{ + /* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so + * return false. 
*/ + return false; +} + +void *OneapiDevice::usm_aligned_alloc_host(size_t memory_size, size_t alignment) +{ + assert(device_queue_); + return oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, memory_size, alignment); +} + +void OneapiDevice::usm_free(void *usm_ptr) +{ + assert(device_queue_); + return oneapi_dll_.oneapi_usm_free(device_queue_, usm_ptr); +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h new file mode 100644 index 00000000000..6abebf98684 --- /dev/null +++ b/intern/cycles/device/oneapi/device_impl.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_ONEAPI + +# include "device/device.h" +# include "device/oneapi/device.h" +# include "device/oneapi/queue.h" + +# include "util/map.h" + +CCL_NAMESPACE_BEGIN + +class DeviceQueue; + +class OneapiDevice : public Device { + private: + SyclQueue *device_queue_; + + using ConstMemMap = map<string, device_vector<uchar> *>; + ConstMemMap const_mem_map_; + device_vector<TextureInfo> texture_info_; + bool need_texture_info_; + void *kg_memory_; + void *kg_memory_device_; + size_t kg_memory_size_ = (size_t)0; + size_t max_memory_on_device_ = (size_t)0; + OneAPIDLLInterface oneapi_dll_; + std::string oneapi_error_string_; + + public: + virtual BVHLayoutMask get_bvh_layout_mask() const override; + + OneapiDevice(const DeviceInfo &info, + OneAPIDLLInterface &oneapi_dll_object, + Stats &stats, + Profiler &profiler); + + virtual ~OneapiDevice(); + + bool check_peer_access(Device *peer_device) override; + + bool load_kernels(const uint requested_features) override; + + void load_texture_info(); + + void generic_alloc(device_memory &mem); + + void generic_copy_to(device_memory &mem); + + void generic_free(device_memory &mem); + + SyclQueue *sycl_queue(); + + string oneapi_error_message(); + + OneAPIDLLInterface oneapi_dll_object(); + + void *kernel_globals_device_pointer(); + + void mem_alloc(device_memory &mem) override; + + void mem_copy_to(device_memory &mem) override; + + void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override; + + void mem_copy_from(device_memory &mem) + { + mem_copy_from(mem, 0, 0, 0, 0); + } + + void mem_zero(device_memory &mem) override; + + void mem_free(device_memory &mem) override; + + device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + void global_alloc(device_memory &mem); + + void global_free(device_memory &mem); + + void tex_alloc(device_texture &mem); + + void tex_free(device_texture &mem); + + /* Graphics resources interoperability. */ + virtual bool should_use_graphics_interop() override; + + virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + + int get_num_multiprocessors(); + int get_max_num_threads_per_multiprocessor(); + + /* NOTE(@nsirgien): Create this methods to avoid some compilation problems on Windows with host + * side compilation (MSVC). 
*/ + void *usm_aligned_alloc_host(size_t memory_size, size_t alignment); + void usm_free(void *usm_ptr); +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/oneapi/dll_interface.h b/intern/cycles/device/oneapi/dll_interface.h new file mode 100644 index 00000000000..0a888194e98 --- /dev/null +++ b/intern/cycles/device/oneapi/dll_interface.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#pragma once + +/* Include kernel header to get access to SYCL-specific types, like SyclQueue and + * OneAPIDeviceIteratorCallback. */ +#include "kernel/device/oneapi/kernel.h" + +#ifdef WITH_ONEAPI +struct OneAPIDLLInterface { +# define DLL_INTERFACE_CALL(function, return_type, ...) \ + return_type (*function)(__VA_ARGS__) = nullptr; +# include "kernel/device/oneapi/dll_interface_template.h" +# undef DLL_INTERFACE_CALL +}; +#endif diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp new file mode 100644 index 00000000000..1e822e25f1a --- /dev/null +++ b/intern/cycles/device/oneapi/queue.cpp @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_ONEAPI + +# include "device/oneapi/queue.h" +# include "device/oneapi/device_impl.h" +# include "util/log.h" +# include "util/time.h" +# include <iomanip> +# include <vector> + +# include "kernel/device/oneapi/kernel.h" + +CCL_NAMESPACE_BEGIN + +struct KernelExecutionInfo { + double elapsed_summary = 0.0; + int enqueue_count = 0; +}; + +/* OneapiDeviceQueue */ + +OneapiDeviceQueue::OneapiDeviceQueue(OneapiDevice *device) + : DeviceQueue(device), + oneapi_device_(device), + oneapi_dll_(device->oneapi_dll_object()), + kernel_context_(nullptr) +{ +} + +OneapiDeviceQueue::~OneapiDeviceQueue() +{ + delete kernel_context_; +} + +int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const +{ + const int max_num_threads = oneapi_device_->get_num_multiprocessors() * + oneapi_device_->get_max_num_threads_per_multiprocessor(); + int num_states = max(8 * max_num_threads, 65536) * 16; + + VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to " + << string_human_readable_size(num_states * state_size); + + return num_states; +} + +int OneapiDeviceQueue::num_concurrent_busy_states() const +{ + const int max_num_threads = oneapi_device_->get_num_multiprocessors() * + oneapi_device_->get_max_num_threads_per_multiprocessor(); + + return 4 * max(8 * max_num_threads, 65536); +} + +void OneapiDeviceQueue::init_execution() +{ + oneapi_device_->load_texture_info(); + + SyclQueue *device_queue = oneapi_device_->sycl_queue(); + void *kg_dptr = (void *)oneapi_device_->kernel_globals_device_pointer(); + assert(device_queue); + assert(kg_dptr); + kernel_context_ = new KernelContext{device_queue, kg_dptr}; + + debug_init_execution(); +} + +bool OneapiDeviceQueue::enqueue(DeviceKernel kernel, + const int signed_kernel_work_size, + DeviceKernelArguments const &_args) +{ + if (oneapi_device_->have_error()) { + return false; + } + + void **args = const_cast<void **>(_args.values); + + debug_enqueue(kernel, signed_kernel_work_size); + assert(signed_kernel_work_size >= 0); + size_t kernel_work_size = (size_t)signed_kernel_work_size; + + size_t kernel_local_size = oneapi_dll_.oneapi_kernel_preferred_local_size( + kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size); + size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size); + + 
assert(kernel_context_); + + /* Call the oneAPI kernel DLL to launch the requested kernel. */ + bool is_finished_ok = oneapi_dll_.oneapi_enqueue_kernel( + kernel_context_, kernel, uniformed_kernel_work_size, args); + + if (is_finished_ok == false) { + oneapi_device_->set_error("oneAPI kernel \"" + std::string(device_kernel_as_string(kernel)) + + "\" execution error: got runtime exception \"" + + oneapi_device_->oneapi_error_message() + "\""); + } + + return is_finished_ok; +} + +bool OneapiDeviceQueue::synchronize() +{ + if (oneapi_device_->have_error()) { + return false; + } + + bool is_finished_ok = oneapi_dll_.oneapi_queue_synchronize(oneapi_device_->sycl_queue()); + if (is_finished_ok == false) + oneapi_device_->set_error("oneAPI unknown kernel execution error: got runtime exception \"" + + oneapi_device_->oneapi_error_message() + "\""); + + debug_synchronize(); + + return !(oneapi_device_->have_error()); +} + +void OneapiDeviceQueue::zero_to_device(device_memory &mem) +{ + oneapi_device_->mem_zero(mem); +} + +void OneapiDeviceQueue::copy_to_device(device_memory &mem) +{ + oneapi_device_->mem_copy_to(mem); +} + +void OneapiDeviceQueue::copy_from_device(device_memory &mem) +{ + oneapi_device_->mem_copy_from(mem); +} + +CCL_NAMESPACE_END + +#endif /* WITH_ONEAPI */ diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h new file mode 100644 index 00000000000..716cbfdc88c --- /dev/null +++ b/intern/cycles/device/oneapi/queue.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +#ifdef WITH_ONEAPI + +# include "device/kernel.h" +# include "device/memory.h" +# include "device/queue.h" + +# include "device/oneapi/device.h" +# include "device/oneapi/dll_interface.h" + +CCL_NAMESPACE_BEGIN + +class OneapiDevice; +class device_memory; + +/* Base class for OneAPI queues. */ +class OneapiDeviceQueue : public DeviceQueue { + public: + explicit OneapiDeviceQueue(OneapiDevice *device); + ~OneapiDeviceQueue(); + + virtual int num_concurrent_states(const size_t state_size) const override; + + virtual int num_concurrent_busy_states() const override; + + virtual void init_execution() override; + + virtual bool enqueue(DeviceKernel kernel, + const int kernel_work_size, + DeviceKernelArguments const &args) override; + + virtual bool synchronize() override; + + virtual void zero_to_device(device_memory &mem) override; + virtual void copy_to_device(device_memory &mem) override; + virtual void copy_from_device(device_memory &mem) override; + + protected: + OneapiDevice *oneapi_device_; + OneAPIDLLInterface oneapi_dll_; + KernelContext *kernel_context_; + bool with_kernel_statistics_; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_ONEAPI */ diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp index 70810bae10d..68ca21374fd 100644 --- a/intern/cycles/device/optix/device.cpp +++ b/intern/cycles/device/optix/device.cpp @@ -31,12 +31,12 @@ bool device_optix_init() const OptixResult result = optixInit(); if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { - VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. " - "Please update to the latest driver first!"; + VLOG_WARNING << "OptiX initialization failed because the installed NVIDIA driver is too old. 
" + "Please update to the latest driver first!"; return false; } else if (result != OPTIX_SUCCESS) { - VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result; + VLOG_WARNING << "OptiX initialization failed with error code " << (unsigned int)result; return false; } diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp index 9ab9bbb59c5..6c64e7106d5 100644 --- a/intern/cycles/device/optix/device_impl.cpp +++ b/intern/cycles/device/optix/device_impl.cpp @@ -26,7 +26,6 @@ # include "util/task.h" # include "util/time.h" -# undef __KERNEL_CPU__ # define __KERNEL_OPTIX__ # include "kernel/device/optix/globals.h" @@ -40,6 +39,9 @@ CCL_NAMESPACE_BEGIN // The original code is Copyright NVIDIA Corporation, BSD-3-Clause. namespace { +# if OPTIX_ABI_VERSION >= 60 +using ::optixUtilDenoiserInvokeTiled; +# else static OptixResult optixUtilDenoiserSplitImage(const OptixImage2D &input, const OptixImage2D &output, unsigned int overlapWindowSizeInPixels, @@ -216,6 +218,7 @@ static OptixResult optixUtilDenoiserInvokeTiled(OptixDenoiser denoiser, } return OPTIX_SUCCESS; } +# endif # if OPTIX_ABI_VERSION >= 55 static void execute_optix_task(TaskPool &pool, OptixTask task, OptixResult &failure_reason) @@ -246,7 +249,7 @@ OptiXDevice::Denoiser::Denoiser(OptiXDevice *device) OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) : CUDADevice(info, stats, profiler), sbt_data(this, "__sbt", MEM_READ_ONLY), - launch_params(this, "__params", false), + launch_params(this, "kernel_params", false), denoiser_(this) { /* Make the CUDA context current. */ @@ -278,7 +281,7 @@ OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profile }; # endif if (DebugFlags().optix.use_debug) { - VLOG(1) << "Using OptiX debug mode."; + VLOG_INFO << "Using OptiX debug mode."; options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL; } optix_assert(optixDeviceContextCreate(cuContext, &options, &context)); @@ -339,15 +342,29 @@ BVHLayoutMask OptiXDevice::get_bvh_layout_mask() const return BVH_LAYOUT_OPTIX; } +static string get_optix_include_dir() +{ + const char *env_dir = getenv("OPTIX_ROOT_DIR"); + const char *default_dir = CYCLES_RUNTIME_OPTIX_ROOT_DIR; + + if (env_dir && env_dir[0]) { + const string env_include_dir = path_join(env_dir, "include"); + return env_include_dir; + } + else if (default_dir[0]) { + const string default_include_dir = path_join(default_dir, "include"); + return default_include_dir; + } + + return string(); +} + string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features) { string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features); /* Add OptiX SDK include directory to include paths. */ - const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR"); - if (optix_sdk_path) { - common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path); - } + common_cflags += string_printf(" -I\"%s\"", get_optix_include_dir().c_str()); /* Specialization for shader raytracing. 
*/ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { @@ -421,7 +438,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_options.numPayloadValues = 8; pipeline_options.numAttributeValues = 2; /* u, v */ pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE; - pipeline_options.pipelineLaunchParamsVariableName = "__params"; /* See globals.h */ + pipeline_options.pipelineLaunchParamsVariableName = "kernel_params"; /* See globals.h */ pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; if (kernel_features & KERNEL_FEATURE_HAIR) { @@ -457,10 +474,19 @@ bool OptiXDevice::load_kernels(const uint kernel_features) "lib/kernel_optix_shader_raytrace.ptx" : "lib/kernel_optix.ptx"); if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { - if (!getenv("OPTIX_ROOT_DIR")) { + std::string optix_include_dir = get_optix_include_dir(); + if (optix_include_dir.empty()) { set_error( - "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to " - "the Optix SDK to be able to compile Optix kernels on demand)."); + "Unable to compile OptiX kernels at runtime. Set OPTIX_ROOT_DIR environment variable " + "to a directory containing the OptiX SDK."); + return false; + } + else if (!path_is_directory(optix_include_dir)) { + set_error(string_printf( + "OptiX headers not found at %s, unable to compile OptiX kernels at runtime. Install " + "OptiX SDK in the specified location, or set OPTIX_ROOT_DIR environment variable to a " + "directory containing the OptiX SDK.", + optix_include_dir.c_str())); return false; } ptx_filename = compile_kernel( @@ -1390,13 +1416,13 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh, options.operation = operation; if (use_fast_trace_bvh || /* The build flags have to match the ones used to query the built-in curve intersection - program (see optixBuiltinISModuleGet above) */ + * program (see optixBuiltinISModuleGet above) */ build_input.type == OPTIX_BUILD_INPUT_TYPE_CURVES) { - VLOG(2) << "Using fast to trace OptiX BVH"; + VLOG_INFO << "Using fast to trace OptiX BVH"; options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION; } else { - VLOG(2) << "Using fast to update OptiX BVH"; + VLOG_INFO << "Using fast to update OptiX BVH"; options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE; } @@ -2042,26 +2068,26 @@ void OptiXDevice::const_copy_to(const char *name, void *host, size_t size) /* Set constant memory for CUDA module. */ CUDADevice::const_copy_to(name, host, size); - if (strcmp(name, "__data") == 0) { + if (strcmp(name, "data") == 0) { assert(size <= sizeof(KernelData)); /* Update traversable handle (since it is different for each device on multi devices). */ KernelData *const data = (KernelData *)host; - *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; + *(OptixTraversableHandle *)&data->device_bvh = tlas_handle; update_launch_params(offsetof(KernelParamsOptiX, data), host, size); return; } /* Update data storage pointers in launch parameters. 
*/ -# define KERNEL_TEX(data_type, tex_name) \ - if (strcmp(name, #tex_name) == 0) { \ - update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \ +# define KERNEL_DATA_ARRAY(data_type, data_name) \ + if (strcmp(name, #data_name) == 0) { \ + update_launch_params(offsetof(KernelParamsOptiX, data_name), host, size); \ return; \ } - KERNEL_TEX(IntegratorStateGPU, __integrator_state) -# include "kernel/textures.h" -# undef KERNEL_TEX + KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state) +# include "kernel/data_arrays.h" +# undef KERNEL_DATA_ARRAY } void OptiXDevice::update_launch_params(size_t offset, void *data, size_t data_size) diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp index 366bf95269d..f0d49ad6f6c 100644 --- a/intern/cycles/device/optix/queue.cpp +++ b/intern/cycles/device/optix/queue.cpp @@ -8,7 +8,6 @@ # include "util/time.h" -# undef __KERNEL_CPU__ # define __KERNEL_OPTIX__ # include "kernel/device/optix/globals.h" diff --git a/intern/cycles/device/queue.cpp b/intern/cycles/device/queue.cpp index de65047ed6a..cc0cf0ccf84 100644 --- a/intern/cycles/device/queue.cpp +++ b/intern/cycles/device/queue.cpp @@ -19,7 +19,7 @@ DeviceQueue::DeviceQueue(Device *device) DeviceQueue::~DeviceQueue() { - if (VLOG_IS_ON(3)) { + if (VLOG_DEVICE_STATS_IS_ON) { /* Print kernel execution times sorted by time. */ vector<pair<DeviceKernelMask, double>> stats_sorted; for (const auto &stat : stats_kernel_time_) { @@ -32,17 +32,18 @@ DeviceQueue::~DeviceQueue() return a.second > b.second; }); - VLOG(3) << "GPU queue stats:"; + VLOG_DEVICE_STATS << "GPU queue stats:"; for (const auto &[mask, time] : stats_sorted) { - VLOG(3) << " " << std::setfill(' ') << std::setw(10) << std::fixed << std::setprecision(5) - << std::right << time << "s: " << device_kernel_mask_as_string(mask); + VLOG_DEVICE_STATS << " " << std::setfill(' ') << std::setw(10) << std::fixed + << std::setprecision(5) << std::right << time + << "s: " << device_kernel_mask_as_string(mask); } } } void DeviceQueue::debug_init_execution() { - if (VLOG_IS_ON(3)) { + if (VLOG_DEVICE_STATS_IS_ON) { last_sync_time_ = time_dt(); } @@ -51,9 +52,9 @@ void DeviceQueue::debug_init_execution() void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size) { - if (VLOG_IS_ON(3)) { - VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size " - << work_size; + if (VLOG_DEVICE_STATS_IS_ON) { + VLOG_DEVICE_STATS << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size " + << work_size; } last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel); @@ -61,10 +62,10 @@ void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size) void DeviceQueue::debug_synchronize() { - if (VLOG_IS_ON(3)) { + if (VLOG_DEVICE_STATS_IS_ON) { const double new_time = time_dt(); const double elapsed_time = new_time - last_sync_time_; - VLOG(4) << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s"; + VLOG_DEVICE_STATS << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s"; stats_kernel_time_[last_kernels_enqueued_] += elapsed_time; diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h index 14a5db3a204..808431af401 100644 --- a/intern/cycles/device/queue.h +++ b/intern/cycles/device/queue.h @@ -105,6 +105,13 @@ class DeviceQueue { * value. 
*/
 virtual int num_concurrent_busy_states() const = 0;
+ /* Number of elements in a partition of sorted shaders, that improves memory locality of
+ * integrator state fetch at the cost of decreased coherence for shader kernel execution. */
+ virtual int num_sort_partition_elements() const
+ {
+ return 65536;
+ }
+
 /* Initialize execution of kernels on this queue.
 *
 * Will, for example, load all data required by the kernels from Device to global or path state.
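
The new num_sort_partition_elements() hook above (default 65536, with MetalInfo::optimal_sort_partition_elements() returning 65536 on Apple GPUs and 0 elsewhere, overridable via CYCLES_METAL_SORT_PARTITION_ELEMENTS) controls how many active path indices are grouped together before they are sorted by shader. The function below is a rough CPU-side sketch of that idea only, not the actual GPU sorting kernel: it partitions an index array and sorts each partition by a per-path key. The names sort_indices_partitioned and shader_key are hypothetical, and partition_size stands in for the value returned by num_sort_partition_elements().

#include <algorithm>
#include <cstdint>
#include <vector>

/* Illustrative sketch: sort active path indices by a per-path shader key, but only
 * within fixed-size partitions, so neighbouring work items keep fetching nearby
 * integrator state (better locality) at the cost of less coherent shading batches. */
static void sort_indices_partitioned(std::vector<uint32_t> &indices,
                                     const std::vector<uint32_t> &shader_key,
                                     size_t partition_size)
{
  if (partition_size == 0) {
    partition_size = indices.size(); /* 0 behaves like one global sort. */
  }
  for (size_t begin = 0; begin < indices.size(); begin += partition_size) {
    const size_t end = std::min(begin + partition_size, indices.size());
    std::sort(indices.begin() + begin,
              indices.begin() + end,
              [&shader_key](uint32_t a, uint32_t b) { return shader_key[a] < shader_key[b]; });
  }
}

A partition_size of 65536 here corresponds to the Apple-GPU value from util.mm, which the commit reports as giving up to a 15% overall render time speedup.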
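
device/oneapi/device.cpp and dll_interface.h resolve the oneAPI kernel entry points at runtime by expanding the same DLL_INTERFACE_CALL list twice: once to declare function-pointer members of OneAPIDLLInterface and once to look every symbol up with GetProcAddress()/dlsym(), with device_oneapi_init() returning false (disabling oneAPI) if any symbol is missing. The sketch below shows that X-macro pattern under simplified assumptions: it inlines a hypothetical two-entry list instead of re-including kernel/device/oneapi/dll_interface_template.h, uses POSIX dlsym() only, and all names (EXAMPLE_DLL_CALLS, example_create_queue, example_free_queue, load_example_interface) are placeholders rather than the real interface.

#include <dlfcn.h>
#include <cstdio>

/* Hypothetical list of exported entry points: (name, return type, argument types...). */
#define EXAMPLE_DLL_CALLS(X) \
  X(example_create_queue, bool, void **, int) \
  X(example_free_queue, void, void *)

struct ExampleDLLInterface {
  /* Expand the list into function-pointer members, all initialized to null. */
#define DECLARE_MEMBER(name, return_type, ...) return_type (*name)(__VA_ARGS__) = nullptr;
  EXAMPLE_DLL_CALLS(DECLARE_MEMBER)
#undef DECLARE_MEMBER
};

/* Resolve every entry point from an already dlopen()'ed shared library;
 * bail out (so the caller can disable the back-end) if any symbol is missing. */
static bool load_example_interface(ExampleDLLInterface &dll, void *lib_handle)
{
#define LOAD_MEMBER(name, return_type, ...) \
  dll.name = reinterpret_cast<decltype(dll.name)>(dlsym(lib_handle, #name)); \
  if (dll.name == nullptr) { \
    fprintf(stderr, "Missing symbol \"%s\"\n", #name); \
    return false; \
  }
  EXAMPLE_DLL_CALLS(LOAD_MEMBER)
#undef LOAD_MEMBER
  return true;
}

Keeping a single list as the source of truth is what lets the real code add or remove kernel-side entry points without touching the loader by hand.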