diff options
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- | intern/cycles/device/CMakeLists.txt | 22 | ||||
-rw-r--r-- | intern/cycles/device/cuda/queue.cpp | 2 | ||||
-rw-r--r-- | intern/cycles/device/cuda/queue.h | 2 | ||||
-rw-r--r-- | intern/cycles/device/hip/queue.cpp | 2 | ||||
-rw-r--r-- | intern/cycles/device/hip/queue.h | 2 | ||||
-rw-r--r-- | intern/cycles/device/metal/device_impl.mm | 10 | ||||
-rw-r--r-- | intern/cycles/device/metal/kernel.mm | 13 | ||||
-rw-r--r-- | intern/cycles/device/metal/queue.h | 2 | ||||
-rw-r--r-- | intern/cycles/device/metal/queue.mm | 51 | ||||
-rw-r--r-- | intern/cycles/device/metal/util.mm | 6 | ||||
-rw-r--r-- | intern/cycles/device/oneapi/device.cpp | 4 | ||||
-rw-r--r-- | intern/cycles/device/oneapi/device_impl.cpp | 69 | ||||
-rw-r--r-- | intern/cycles/device/oneapi/device_impl.h | 2 | ||||
-rw-r--r-- | intern/cycles/device/oneapi/queue.cpp | 2 | ||||
-rw-r--r-- | intern/cycles/device/oneapi/queue.h | 2 | ||||
-rw-r--r-- | intern/cycles/device/queue.h | 2 |
16 files changed, 109 insertions, 84 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 5516e97f34f..5296d819e42 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -187,18 +187,22 @@ if(WITH_CYCLES_DEVICE_METAL) ) endif() if (WITH_CYCLES_DEVICE_ONEAPI) + if(WITH_CYCLES_ONEAPI_BINARIES) + set(cycles_kernel_oneapi_lib_suffix "_aot") + else() + set(cycles_kernel_oneapi_lib_suffix "_jit") + endif() if(WIN32) - set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi.lib) + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.lib) else() - set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi.so) + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.so) + endif() + list(APPEND LIB ${cycles_kernel_oneapi_lib}) + if(WIN32) + list(APPEND LIB debug ${SYCL_LIBRARY_DEBUG} optimized ${SYCL_LIBRARY}) + else() + list(APPEND LIB ${SYCL_LIBRARY}) endif() - list(APPEND LIB - ${cycles_kernel_oneapi_lib} - "$<$<CONFIG:Debug>:${SYCL_LIBRARY_DEBUG}>" - "$<$<CONFIG:Release>:${SYCL_LIBRARY}>" - "$<$<CONFIG:RelWithDebInfo>:${SYCL_LIBRARY}>" - "$<$<CONFIG:MinSizeRel>:${SYCL_LIBRARY}>" - ) add_definitions(-DWITH_ONEAPI) list(APPEND SRC ${SRC_ONEAPI} diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp index 84b0a1e0dd6..69fae03e32c 100644 --- a/intern/cycles/device/cuda/queue.cpp +++ b/intern/cycles/device/cuda/queue.cpp @@ -49,7 +49,7 @@ int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const return num_states; } -int CUDADeviceQueue::num_concurrent_busy_states() const +int CUDADeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const { const int max_num_threads = cuda_device_->get_num_multiprocessors() * cuda_device_->get_max_num_threads_per_multiprocessor(); diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h index b450f5b3592..7107afe70c9 100644 --- a/intern/cycles/device/cuda/queue.h +++ b/intern/cycles/device/cuda/queue.h @@ -23,7 +23,7 @@ class CUDADeviceQueue : public DeviceQueue { ~CUDADeviceQueue(); virtual int num_concurrent_states(const size_t state_size) const override; - virtual int num_concurrent_busy_states() const override; + virtual int num_concurrent_busy_states(const size_t state_size) const override; virtual void init_execution() override; diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp index 3f8b6267100..e93a9b4df3a 100644 --- a/intern/cycles/device/hip/queue.cpp +++ b/intern/cycles/device/hip/queue.cpp @@ -49,7 +49,7 @@ int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const return num_states; } -int HIPDeviceQueue::num_concurrent_busy_states() const +int HIPDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const { const int max_num_threads = hip_device_->get_num_multiprocessors() * hip_device_->get_max_num_threads_per_multiprocessor(); diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h index 729d8a19acb..df0678108af 100644 --- a/intern/cycles/device/hip/queue.h +++ b/intern/cycles/device/hip/queue.h @@ -23,7 +23,7 @@ class HIPDeviceQueue : public DeviceQueue { ~HIPDeviceQueue(); virtual int num_concurrent_states(const size_t state_size) const override; - virtual int num_concurrent_busy_states() const override; + virtual int num_concurrent_busy_states(const size_t state_size) const override; virtual void init_execution() override; diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 6a16d4bb3b4..6f1042b1e55 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -296,9 +296,11 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat } source = global_defines + source; +# if 0 metal_printf("================\n%s================\n\%s================\n", global_defines.c_str(), baked_constants.c_str()); +# endif /* Generate an MD5 from the source and include any baked constants. This is used when caching * PSOs. */ @@ -339,6 +341,14 @@ bool MetalDevice::compile_and_load(MetalPipelineType pso_type) MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; +# if defined(MAC_OS_VERSION_13_0) + if (@available(macos 13.0, *)) { + if (device_vendor == METAL_GPU_INTEL) { + [options setOptimizationLevel:MTLLibraryOptimizationLevelSize]; + } + } +# endif + options.fastMathEnabled = YES; if (@available(macOS 12.0, *)) { options.languageVersion = MTLLanguageVersion2_4; diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 5e0cb6d18f4..55938d1a03a 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -162,6 +162,13 @@ bool ShaderCache::should_load_kernel(DeviceKernel device_kernel, } } + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) { + if ((device->kernel_features & KERNEL_FEATURE_MNEE) == 0) { + /* Skip shade_surface_mnee kernel if the scene doesn't require it. */ + return false; + } + } + if (pso_type != PSO_GENERIC) { /* Only specialize kernels where it can make an impact. */ if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || @@ -317,6 +324,12 @@ bool MetalKernelPipeline::should_use_binary_archive() const } } + /* Workaround for Intel GPU having issue using Binary Archives */ + MetalGPUVendor gpu_vendor = MetalInfo::get_device_vendor(mtlDevice); + if (gpu_vendor == METAL_GPU_INTEL) { + return false; + } + if (pso_type == PSO_GENERIC) { /* Archive the generic kernels. */ return true; diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index fc32740f3e1..2a6c12e2a60 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -23,7 +23,7 @@ class MetalDeviceQueue : public DeviceQueue { ~MetalDeviceQueue(); virtual int num_concurrent_states(const size_t) const override; - virtual int num_concurrent_busy_states() const override; + virtual int num_concurrent_busy_states(const size_t) const override; virtual int num_sort_partition_elements() const override; virtual void init_execution() override; diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index 5ac63a16c61..c0df2c8553f 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -264,33 +264,46 @@ MetalDeviceQueue::~MetalDeviceQueue() } } -int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const +int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const { - /* METAL_WIP */ - /* TODO: compute automatically. */ - /* TODO: must have at least num_threads_per_block. */ - int result = 1048576; - if (metal_device_->device_vendor == METAL_GPU_AMD) { - result *= 2; + static int result = 0; + if (result) { + return result; } - else if (metal_device_->device_vendor == METAL_GPU_APPLE) { + + result = 1048576; + if (metal_device_->device_vendor == METAL_GPU_APPLE) { result *= 4; + + if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) == APPLE_M2) { + size_t system_ram = system_physical_ram(); + size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize]; + size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize]; + + /* Determine whether we can double the state count, and leave enough GPU-available memory + * (1/8 the system RAM or 1GB - whichever is largest). Enlarging the state size allows us to + * keep dispatch sizes high and minimize work submission overheads. */ + size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024)); + size_t total_state_size = result * state_size; + if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) { + result *= 2; + metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result); + } + } + } + else if (metal_device_->device_vendor == METAL_GPU_AMD) { + /* METAL_WIP */ + /* TODO: compute automatically. */ + /* TODO: must have at least num_threads_per_block. */ + result *= 2; } return result; } -int MetalDeviceQueue::num_concurrent_busy_states() const +int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const { - /* METAL_WIP */ - /* TODO: compute automatically. */ - int result = 65536; - if (metal_device_->device_vendor == METAL_GPU_AMD) { - result *= 2; - } - else if (metal_device_->device_vendor == METAL_GPU_APPLE) { - result *= 4; - } - return result; + /* A 1:4 busy:total ratio gives best rendering performance, independent of total state count. */ + return num_concurrent_states(state_size) / 4; } int MetalDeviceQueue::num_sort_partition_elements() const diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm index 65c67c400fe..f47638fac15 100644 --- a/intern/cycles/device/metal/util.mm +++ b/intern/cycles/device/metal/util.mm @@ -110,6 +110,12 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices() usable |= (vendor == METAL_GPU_AMD); } +# if defined(MAC_OS_VERSION_13_0) + if (@available(macos 13.0, *)) { + usable |= (vendor == METAL_GPU_INTEL); + } +# endif + if (usable) { metal_printf("- %s\n", device_name.c_str()); [device retain]; diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp index f303ab41627..66d6f749e30 100644 --- a/intern/cycles/device/oneapi/device.cpp +++ b/intern/cycles/device/oneapi/device.cpp @@ -39,7 +39,7 @@ bool device_oneapi_init() _putenv_s("SYCL_CACHE_THRESHOLD", "0"); } if (getenv("SYCL_DEVICE_FILTER") == nullptr) { - _putenv_s("SYCL_DEVICE_FILTER", "host,level_zero"); + _putenv_s("SYCL_DEVICE_FILTER", "level_zero"); } if (getenv("SYCL_ENABLE_PCI") == nullptr) { _putenv_s("SYCL_ENABLE_PCI", "1"); @@ -50,7 +50,7 @@ bool device_oneapi_init() # elif __linux__ setenv("SYCL_CACHE_PERSISTENT", "1", false); setenv("SYCL_CACHE_THRESHOLD", "0", false); - setenv("SYCL_DEVICE_FILTER", "host,level_zero", false); + setenv("SYCL_DEVICE_FILTER", "level_zero", false); setenv("SYCL_ENABLE_PCI", "1", false); setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false); # endif diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp index 91f53fd1eae..d0ddd69289c 100644 --- a/intern/cycles/device/oneapi/device_impl.cpp +++ b/intern/cycles/device/oneapi/device_impl.cpp @@ -430,9 +430,9 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_ sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context()); (void)usm_type; assert(usm_type == sycl::usm::alloc::device || - ((device_type == sycl::info::device_type::host || - device_type == sycl::info::device_type::cpu || allow_host) && - usm_type == sycl::usm::alloc::host)); + ((device_type == sycl::info::device_type::cpu || allow_host) && + usm_type == sycl::usm::alloc::host || + usm_type == sycl::usm::alloc::unknown)); # else /* Silence warning about unused arguments. */ (void)queue_; @@ -668,16 +668,9 @@ int OneapiDevice::parse_driver_build_version(const sycl::device &device) std::vector<sycl::device> OneapiDevice::available_devices() { bool allow_all_devices = false; - if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) + if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) { allow_all_devices = true; - - /* Host device is useful only for debugging at the moment - * so we hide this device with default build settings. */ -# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED - bool allow_host = true; -# else - bool allow_host = false; -# endif + } const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms(); @@ -690,33 +683,28 @@ std::vector<sycl::device> OneapiDevice::available_devices() } const std::vector<sycl::device> &oneapi_devices = - (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) : - platform.get_devices(sycl::info::device_type::gpu); + (allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) : + platform.get_devices(sycl::info::device_type::gpu); for (const sycl::device &device : oneapi_devices) { - if (allow_all_devices) { - /* still filter out host device if build doesn't support it. */ - if (allow_host || !device.is_host()) { - available_devices.push_back(device); - } - } - else { - bool filter_out = false; - + bool filter_out = false; + if (!allow_all_devices) { /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU, * assuming they have either more than 96 Execution Units or not 7 threads per EU. * Official support can be broaden to older and smaller GPUs once ready. */ - if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) { - /* Filtered-out defaults in-case these values aren't available through too old L0 - * runtime. */ + if (!device.is_gpu() || platform.get_backend() != sycl::backend::ext_oneapi_level_zero) { + filter_out = true; + } + else { + /* Filtered-out defaults in-case these values aren't available. */ int number_of_eus = 96; int threads_per_eu = 7; if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) { - number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>(); + number_of_eus = device.get_info<sycl::ext::intel::info::device::gpu_eu_count>(); } if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { threads_per_eu = - device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>(); + device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>(); } /* This filters out all Level-Zero supported GPUs from older generation than Arc. */ if (number_of_eus <= 96 && threads_per_eu == 7) { @@ -732,16 +720,9 @@ std::vector<sycl::device> OneapiDevice::available_devices() } } } - else if (!allow_host && device.is_host()) { - filter_out = true; - } - else if (!allow_all_devices) { - filter_out = true; - } - - if (!filter_out) { - available_devices.push_back(device); - } + } + if (!filter_out) { + available_devices.push_back(device); } } } @@ -797,9 +778,7 @@ char *OneapiDevice::device_capabilities() GET_NUM_ATTR(native_vector_width_double) GET_NUM_ATTR(native_vector_width_half) - size_t max_clock_frequency = - (size_t)(device.is_host() ? (size_t)0 : - device.get_info<sycl::info::device::max_clock_frequency>()); + size_t max_clock_frequency = device.get_info<sycl::info::device::max_clock_frequency>(); WRITE_ATTR("max_clock_frequency", max_clock_frequency) GET_NUM_ATTR(address_bits) @@ -837,7 +816,7 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p std::string name = device.get_info<sycl::info::device::name>(); std::string id = "ONEAPI_" + platform_name + "_" + name; if (device.has(sycl::aspect::ext_intel_pci_address)) { - id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>()); + id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>()); } (cb)(id.c_str(), name.c_str(), num, user_ptr); num++; @@ -855,7 +834,7 @@ int OneapiDevice::get_num_multiprocessors() { const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device(); if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) { - return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>(); + return device.get_info<sycl::ext::intel::info::device::gpu_eu_count>(); } else return 0; @@ -866,8 +845,8 @@ int OneapiDevice::get_max_num_threads_per_multiprocessor() const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device(); if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) && device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { - return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() * - device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>(); + return device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() * + device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>(); } else return 0; diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h index 62034150eac..197cf03d60d 100644 --- a/intern/cycles/device/oneapi/device_impl.h +++ b/intern/cycles/device/oneapi/device_impl.h @@ -3,7 +3,7 @@ #ifdef WITH_ONEAPI -# include <CL/sycl.hpp> +# include <sycl/sycl.hpp> # include "device/device.h" # include "device/oneapi/device.h" diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp index 9632b14d485..3d019661aa8 100644 --- a/intern/cycles/device/oneapi/queue.cpp +++ b/intern/cycles/device/oneapi/queue.cpp @@ -43,7 +43,7 @@ int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const return num_states; } -int OneapiDeviceQueue::num_concurrent_busy_states() const +int OneapiDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const { const int max_num_threads = oneapi_device_->get_num_multiprocessors() * oneapi_device_->get_max_num_threads_per_multiprocessor(); diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h index 32363bf2a6e..bbd947b49cb 100644 --- a/intern/cycles/device/oneapi/queue.h +++ b/intern/cycles/device/oneapi/queue.h @@ -25,7 +25,7 @@ class OneapiDeviceQueue : public DeviceQueue { virtual int num_concurrent_states(const size_t state_size) const override; - virtual int num_concurrent_busy_states() const override; + virtual int num_concurrent_busy_states(const size_t state_size) const override; virtual void init_execution() override; diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h index 1d6a8d736b7..e27e081a407 100644 --- a/intern/cycles/device/queue.h +++ b/intern/cycles/device/queue.h @@ -103,7 +103,7 @@ class DeviceQueue { /* Number of states which keeps the device occupied with work without losing performance. * The renderer will add more work (when available) when number of active paths falls below this * value. */ - virtual int num_concurrent_busy_states() const = 0; + virtual int num_concurrent_busy_states(const size_t state_size) const = 0; /* Number of elements in a partition of sorted shaders, that improves memory locality of * integrator state fetch at the cost of decreased coherence for shader kernel execution. */ |