diff options
Diffstat (limited to 'intern/cycles')
50 files changed, 508 insertions, 439 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index f619e6b104e..329aa3990f6 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -263,8 +263,7 @@ if(WITH_CYCLES_DEVICE_OPTIX) ${OPTIX_INCLUDE_DIR} ) else() - message(STATUS "OptiX not found, disabling it from Cycles") - set(WITH_CYCLES_DEVICE_OPTIX OFF) + set_and_warn_library_found("OptiX" OPTIX_FOUND WITH_CYCLES_DEVICE_OPTIX) endif() endif() @@ -387,8 +386,7 @@ if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_C_COMPILER_ID MATCHES "Clang") endif() if(WITH_CYCLES_HYDRA_RENDER_DELEGATE AND (NOT WITH_USD)) - message(STATUS "USD not found, disabling WITH_CYCLES_HYDRA_RENDER_DELEGATE") - set(WITH_CYCLES_HYDRA_RENDER_DELEGATE OFF) + set_and_warn_library_found("USD" WITH_USD WITH_CYCLES_HYDRA_RENDER_DELEGATE) endif() if(WITH_CYCLES_HYDRA_RENDER_DELEGATE AND (NOT WITH_BLENDER) AND (NOT WITH_CYCLES_STANDALONE)) set(CYCLES_INSTALL_PATH ${CYCLES_INSTALL_PATH}/hdCycles/resources) diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt index 0988b1c0ac4..1c7a861ea93 100644 --- a/intern/cycles/app/CMakeLists.txt +++ b/intern/cycles/app/CMakeLists.txt @@ -43,7 +43,10 @@ else() endif() if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI) - list(APPEND INC_SYS ${Epoxy_INCLUDE_DIRS} ${SDL2_INCLUDE_DIRS}) + list(APPEND INC_SYS + ${Epoxy_INCLUDE_DIRS} + ${SDL2_INCLUDE_DIRS} + ) list(APPEND LIB ${Epoxy_LIBRARIES} ${SDL2_LIBRARIES}) endif() diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 794338fe78e..e33891fa7a2 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -209,22 +209,25 @@ def list_render_passes(scene, srl): yield ("Debug Sample Count", "X", 'VALUE') # Cryptomatte passes. - crypto_depth = (srl.pass_cryptomatte_depth + 1) // 2 + # NOTE: Name channels are lowercase RGBA so that compression rules check in OpenEXR DWA code + # uses lossless compression. Reportedly this naming is the only one which works good from the + # interoperability point of view. Using XYZW naming is not portable. + crypto_depth = (min(16, srl.pass_cryptomatte_depth) + 1) // 2 if srl.use_pass_cryptomatte_object: for i in range(0, crypto_depth): - yield ("CryptoObject" + '{:02d}'.format(i), "RGBA", 'COLOR') + yield ("CryptoObject" + '{:02d}'.format(i), "rgba", 'COLOR') if srl.use_pass_cryptomatte_material: for i in range(0, crypto_depth): - yield ("CryptoMaterial" + '{:02d}'.format(i), "RGBA", 'COLOR') + yield ("CryptoMaterial" + '{:02d}'.format(i), "rgba", 'COLOR') if srl.use_pass_cryptomatte_asset: for i in range(0, crypto_depth): - yield ("CryptoAsset" + '{:02d}'.format(i), "RGBA", 'COLOR') + yield ("CryptoAsset" + '{:02d}'.format(i), "rgba", 'COLOR') # Denoising passes. if scene.cycles.use_denoising and crl.use_denoising: yield ("Noisy Image", "RGBA", 'COLOR') if crl.use_pass_shadow_catcher: - yield ("Noisy Shadow Catcher", "RGBA", 'COLOR') + yield ("Noisy Shadow Catcher", "RGB", 'COLOR') if crl.denoising_store_passes: yield ("Denoising Normal", "XYZ", 'VECTOR') yield ("Denoising Albedo", "RGB", 'COLOR') @@ -232,6 +235,8 @@ def list_render_passes(scene, srl): # Custom AOV passes. for aov in srl.aovs: + if not aov.is_valid: + continue if aov.type == 'VALUE': yield (aov.name, "X", 'VALUE') else: diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index b7ce76d8f44..425d123e9e6 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -1651,6 +1651,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): box.prop( device, "use", text=device.name .replace('(TM)', unicodedata.lookup('TRADE MARK SIGN')) + .replace('(tm)', unicodedata.lookup('TRADE MARK SIGN')) .replace('(R)', unicodedata.lookup('REGISTERED SIGN')) .replace('(C)', unicodedata.lookup('COPYRIGHT SIGN')) ) diff --git a/intern/cycles/blender/pointcloud.cpp b/intern/cycles/blender/pointcloud.cpp index 35be2916e43..a679a92d997 100644 --- a/intern/cycles/blender/pointcloud.cpp +++ b/intern/cycles/blender/pointcloud.cpp @@ -194,7 +194,7 @@ static void export_pointcloud(Scene *scene, /* Export points. */ for (int i = 0; i < num_points; i++) { const float3 co = get_float3(b_attr_position.data[i].vector()); - const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.0f; + const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.01f; pointcloud->add_point(co, radius); /* Random number per point. */ @@ -232,7 +232,7 @@ static void export_pointcloud_motion(PointCloud *pointcloud, for (int i = 0; i < std::min(num_points, b_points_num); i++) { const float3 co = get_float3(b_attr_position.data[i].vector()); - const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.0f; + const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.01f; float3 P = co; P.w = radius; mP[i] = P; diff --git a/intern/cycles/blender/shader.cpp b/intern/cycles/blender/shader.cpp index fd32e7ca1d7..dbc49df7f22 100644 --- a/intern/cycles/blender/shader.cpp +++ b/intern/cycles/blender/shader.cpp @@ -215,7 +215,9 @@ static void set_default_value(ShaderInput *input, } case SocketType::INT: { if (b_sock.type() == BL::NodeSocket::type_BOOLEAN) { - node->set(socket, get_boolean(b_sock.ptr, "default_value")); + /* Make sure to call the int overload of set() since this is an integer socket as far as + * Cycles is concerned. */ + node->set(socket, get_boolean(b_sock.ptr, "default_value") ? 1 : 0); } else { node->set(socket, get_int(b_sock.ptr, "default_value")); diff --git a/intern/cycles/blender/sync.cpp b/intern/cycles/blender/sync.cpp index a69a94614d3..5251f0fee9c 100644 --- a/intern/cycles/blender/sync.cpp +++ b/intern/cycles/blender/sync.cpp @@ -575,68 +575,72 @@ void BlenderSync::sync_images() /* Passes */ -static PassType get_blender_pass_type(BL::RenderPass &b_pass) +static bool get_known_pass_type(BL::RenderPass &b_pass, PassType &type, PassMode &mode) { string name = b_pass.name(); -#define MAP_PASS(passname, passtype) \ +#define MAP_PASS(passname, passtype, noisy) \ if (name == passname) { \ - return passtype; \ + type = passtype; \ + mode = (noisy) ? PassMode::NOISY : PassMode::DENOISED; \ + return true; \ } \ ((void)0) - /* NOTE: Keep in sync with defined names from DNA_scene_types.h */ + /* NOTE: Keep in sync with defined names from engine.py */ - MAP_PASS("Combined", PASS_COMBINED); - MAP_PASS("Noisy Image", PASS_COMBINED); + MAP_PASS("Combined", PASS_COMBINED, false); + MAP_PASS("Noisy Image", PASS_COMBINED, true); - MAP_PASS("Depth", PASS_DEPTH); - MAP_PASS("Mist", PASS_MIST); - MAP_PASS("Position", PASS_POSITION); - MAP_PASS("Normal", PASS_NORMAL); - MAP_PASS("IndexOB", PASS_OBJECT_ID); - MAP_PASS("UV", PASS_UV); - MAP_PASS("Vector", PASS_MOTION); - MAP_PASS("IndexMA", PASS_MATERIAL_ID); + MAP_PASS("Depth", PASS_DEPTH, false); + MAP_PASS("Mist", PASS_MIST, false); + MAP_PASS("Position", PASS_POSITION, false); + MAP_PASS("Normal", PASS_NORMAL, false); + MAP_PASS("IndexOB", PASS_OBJECT_ID, false); + MAP_PASS("UV", PASS_UV, false); + MAP_PASS("Vector", PASS_MOTION, false); + MAP_PASS("IndexMA", PASS_MATERIAL_ID, false); - MAP_PASS("DiffDir", PASS_DIFFUSE_DIRECT); - MAP_PASS("GlossDir", PASS_GLOSSY_DIRECT); - MAP_PASS("TransDir", PASS_TRANSMISSION_DIRECT); - MAP_PASS("VolumeDir", PASS_VOLUME_DIRECT); + MAP_PASS("DiffDir", PASS_DIFFUSE_DIRECT, false); + MAP_PASS("GlossDir", PASS_GLOSSY_DIRECT, false); + MAP_PASS("TransDir", PASS_TRANSMISSION_DIRECT, false); + MAP_PASS("VolumeDir", PASS_VOLUME_DIRECT, false); - MAP_PASS("DiffInd", PASS_DIFFUSE_INDIRECT); - MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT); - MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT); - MAP_PASS("VolumeInd", PASS_VOLUME_INDIRECT); + MAP_PASS("DiffInd", PASS_DIFFUSE_INDIRECT, false); + MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT, false); + MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT, false); + MAP_PASS("VolumeInd", PASS_VOLUME_INDIRECT, false); - MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR); - MAP_PASS("GlossCol", PASS_GLOSSY_COLOR); - MAP_PASS("TransCol", PASS_TRANSMISSION_COLOR); + MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR, false); + MAP_PASS("GlossCol", PASS_GLOSSY_COLOR, false); + MAP_PASS("TransCol", PASS_TRANSMISSION_COLOR, false); - MAP_PASS("Emit", PASS_EMISSION); - MAP_PASS("Env", PASS_BACKGROUND); - MAP_PASS("AO", PASS_AO); - MAP_PASS("Shadow", PASS_SHADOW); + MAP_PASS("Emit", PASS_EMISSION, false); + MAP_PASS("Env", PASS_BACKGROUND, false); + MAP_PASS("AO", PASS_AO, false); + MAP_PASS("Shadow", PASS_SHADOW, false); - MAP_PASS("BakePrimitive", PASS_BAKE_PRIMITIVE); - MAP_PASS("BakeDifferential", PASS_BAKE_DIFFERENTIAL); + MAP_PASS("BakePrimitive", PASS_BAKE_PRIMITIVE, false); + MAP_PASS("BakeDifferential", PASS_BAKE_DIFFERENTIAL, false); - MAP_PASS("Denoising Normal", PASS_DENOISING_NORMAL); - MAP_PASS("Denoising Albedo", PASS_DENOISING_ALBEDO); - MAP_PASS("Denoising Depth", PASS_DENOISING_DEPTH); + MAP_PASS("Denoising Normal", PASS_DENOISING_NORMAL, true); + MAP_PASS("Denoising Albedo", PASS_DENOISING_ALBEDO, true); + MAP_PASS("Denoising Depth", PASS_DENOISING_DEPTH, true); - MAP_PASS("Shadow Catcher", PASS_SHADOW_CATCHER); - MAP_PASS("Noisy Shadow Catcher", PASS_SHADOW_CATCHER); + MAP_PASS("Shadow Catcher", PASS_SHADOW_CATCHER, false); + MAP_PASS("Noisy Shadow Catcher", PASS_SHADOW_CATCHER, true); - MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER); - MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT); + MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER, false); + MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT, false); if (string_startswith(name, cryptomatte_prefix)) { - return PASS_CRYPTOMATTE; + type = PASS_CRYPTOMATTE; + mode = PassMode::DENOISED; + return true; } #undef MAP_PASS - return PASS_NONE; + return false; } static Pass *pass_add(Scene *scene, @@ -655,8 +659,6 @@ static Pass *pass_add(Scene *scene, void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer) { - PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); - /* Delete all existing passes. */ set<Pass *> clear_passes(scene->passes.begin(), scene->passes.end()); scene->delete_nodes(clear_passes); @@ -664,103 +666,23 @@ void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_v /* Always add combined pass. */ pass_add(scene, PASS_COMBINED, "Combined"); - /* Blender built-in data and light passes. */ - for (BL::RenderPass &b_pass : b_rlay.passes) { - const PassType pass_type = get_blender_pass_type(b_pass); - - if (pass_type == PASS_NONE) { - LOG(ERROR) << "Unknown pass " << b_pass.name(); - continue; - } - - if (pass_type == PASS_MOTION && - (b_view_layer.use_motion_blur() && b_scene.render().use_motion_blur())) { - continue; - } - - pass_add(scene, pass_type, b_pass.name().c_str()); - } - - PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles"); - - /* Debug passes. */ - if (get_boolean(crl, "pass_debug_sample_count")) { - b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str()); - pass_add(scene, PASS_SAMPLE_COUNT, "Debug Sample Count"); - } - - /* Cycles specific passes. */ - if (get_boolean(crl, "use_pass_volume_direct")) { - b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str()); - pass_add(scene, PASS_VOLUME_DIRECT, "VolumeDir"); - } - if (get_boolean(crl, "use_pass_volume_indirect")) { - b_engine.add_pass("VolumeInd", 3, "RGB", b_view_layer.name().c_str()); - pass_add(scene, PASS_VOLUME_INDIRECT, "VolumeInd"); - } - if (get_boolean(crl, "use_pass_shadow_catcher")) { - b_engine.add_pass("Shadow Catcher", 3, "RGB", b_view_layer.name().c_str()); - pass_add(scene, PASS_SHADOW_CATCHER, "Shadow Catcher"); - } - /* Cryptomatte stores two ID/weight pairs per RGBA layer. - * User facing parameter is the number of pairs. - * - * NOTE: Name channels lowercase RGBA so that compression rules check in OpenEXR DWA code uses - * lossless compression. Reportedly this naming is the only one which works good from the - * interoperability point of view. Using XYZW naming is not portable. */ + * User facing parameter is the number of pairs. */ int crypto_depth = divide_up(min(16, b_view_layer.pass_cryptomatte_depth()), 2); scene->film->set_cryptomatte_depth(crypto_depth); CryptomatteType cryptomatte_passes = CRYPT_NONE; if (b_view_layer.use_pass_cryptomatte_object()) { - for (int i = 0; i < crypto_depth; i++) { - string passname = cryptomatte_prefix + string_printf("Object%02d", i); - b_engine.add_pass(passname.c_str(), 4, "rgba", b_view_layer.name().c_str()); - pass_add(scene, PASS_CRYPTOMATTE, passname.c_str()); - } cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_OBJECT); } if (b_view_layer.use_pass_cryptomatte_material()) { - for (int i = 0; i < crypto_depth; i++) { - string passname = cryptomatte_prefix + string_printf("Material%02d", i); - b_engine.add_pass(passname.c_str(), 4, "rgba", b_view_layer.name().c_str()); - pass_add(scene, PASS_CRYPTOMATTE, passname.c_str()); - } cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_MATERIAL); } if (b_view_layer.use_pass_cryptomatte_asset()) { - for (int i = 0; i < crypto_depth; i++) { - string passname = cryptomatte_prefix + string_printf("Asset%02d", i); - b_engine.add_pass(passname.c_str(), 4, "rgba", b_view_layer.name().c_str()); - pass_add(scene, PASS_CRYPTOMATTE, passname.c_str()); - } cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_ASSET); } scene->film->set_cryptomatte_passes(cryptomatte_passes); - /* Denoising passes. */ - const bool use_denoising = get_boolean(cscene, "use_denoising") && - get_boolean(crl, "use_denoising"); - const bool store_denoising_passes = get_boolean(crl, "denoising_store_passes"); - if (use_denoising) { - b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str()); - pass_add(scene, PASS_COMBINED, "Noisy Image", PassMode::NOISY); - if (get_boolean(crl, "use_pass_shadow_catcher")) { - b_engine.add_pass("Noisy Shadow Catcher", 3, "RGB", b_view_layer.name().c_str()); - pass_add(scene, PASS_SHADOW_CATCHER, "Noisy Shadow Catcher", PassMode::NOISY); - } - } - if (store_denoising_passes) { - b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str()); - pass_add(scene, PASS_DENOISING_NORMAL, "Denoising Normal", PassMode::NOISY); - - b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str()); - pass_add(scene, PASS_DENOISING_ALBEDO, "Denoising Albedo", PassMode::NOISY); - - b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str()); - pass_add(scene, PASS_DENOISING_DEPTH, "Denoising Depth", PassMode::NOISY); - } - + /* Path guiding debug passes. */ #ifdef WITH_CYCLES_DEBUG b_engine.add_pass("Guiding Color", 3, "RGB", b_view_layer.name().c_str()); pass_add(scene, PASS_GUIDING_COLOR, "Guiding Color", PassMode::NOISY); @@ -772,6 +694,8 @@ void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_v pass_add(scene, PASS_GUIDING_AVG_ROUGHNESS, "Guiding Average Roughness", PassMode::NOISY); #endif + unordered_set<string> expected_passes; + /* Custom AOV passes. */ BL::ViewLayer::aovs_iterator b_aov_iter; for (b_view_layer.aovs.begin(b_aov_iter); b_aov_iter != b_view_layer.aovs.end(); ++b_aov_iter) { @@ -781,16 +705,10 @@ void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_v } string name = b_aov.name(); - bool is_color = b_aov.type() == BL::AOV::type_COLOR; + PassType type = (b_aov.type() == BL::AOV::type_COLOR) ? PASS_AOV_COLOR : PASS_AOV_VALUE; - if (is_color) { - b_engine.add_pass(name.c_str(), 4, "RGBA", b_view_layer.name().c_str()); - pass_add(scene, PASS_AOV_COLOR, name.c_str()); - } - else { - b_engine.add_pass(name.c_str(), 1, "X", b_view_layer.name().c_str()); - pass_add(scene, PASS_AOV_VALUE, name.c_str()); - } + pass_add(scene, type, name.c_str()); + expected_passes.insert(name); } /* Light Group passes. */ @@ -802,9 +720,29 @@ void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_v string name = string_printf("Combined_%s", b_lightgroup.name().c_str()); - b_engine.add_pass(name.c_str(), 3, "RGB", b_view_layer.name().c_str()); Pass *pass = pass_add(scene, PASS_COMBINED, name.c_str(), PassMode::NOISY); pass->set_lightgroup(ustring(b_lightgroup.name())); + expected_passes.insert(name); + } + + /* Sync the passes that were defined in engine.py. */ + for (BL::RenderPass &b_pass : b_rlay.passes) { + PassType pass_type = PASS_NONE; + PassMode pass_mode = PassMode::DENOISED; + + if (!get_known_pass_type(b_pass, pass_type, pass_mode)) { + if (!expected_passes.count(b_pass.name())) { + LOG(ERROR) << "Unknown pass " << b_pass.name(); + } + continue; + } + + if (pass_type == PASS_MOTION && + (b_view_layer.use_motion_blur() && b_scene.render().use_motion_blur())) { + continue; + } + + pass_add(scene, pass_type, b_pass.name().c_str(), pass_mode); } scene->film->set_pass_alpha_threshold(b_view_layer.pass_alpha_threshold()); diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index 9524cda54f5..44542a08156 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -289,8 +289,7 @@ if(CYCLES_STANDALONE_REPOSITORY AND WITH_CYCLES_PATH_GUIDING) endif() get_target_property(OPENPGL_INCLUDE_DIR openpgl::openpgl INTERFACE_INCLUDE_DIRECTORIES) else() - set(WITH_CYCLES_PATH_GUIDING OFF) - message(STATUS "OpenPGL not found, disabling WITH_CYCLES_PATH_GUIDING") + set_and_warn_library_found("OpenPGL" openpgl_FOUND WITH_CYCLES_PATH_GUIDING) endif() endif() @@ -588,16 +587,14 @@ if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI) # We can't use the version from the Blender precompiled libraries because # it does not include the video subsystem. find_package(SDL2 REQUIRED) + set_and_warn_library_found("SDL" SDL2_FOUND WITH_CYCLES_STANDALONE_GUI) - if(NOT SDL2_FOUND) - set(WITH_CYCLES_STANDALONE_GUI OFF) - message(STATUS "SDL not found, disabling Cycles standalone GUI") + if(SDL2_FOUND) + include_directories( + SYSTEM + ${SDL2_INCLUDE_DIRS} + ) endif() - - include_directories( - SYSTEM - ${SDL2_INCLUDE_DIRS} - ) endif() ########################################################################### @@ -606,11 +603,11 @@ endif() if(WITH_CYCLES_DEVICE_CUDA AND (WITH_CYCLES_CUDA_BINARIES OR NOT WITH_CUDA_DYNLOAD)) find_package(CUDA) # Try to auto locate CUDA toolkit + set_and_warn_library_found("CUDA compiler" CUDA_FOUND WITH_CYCLES_CUDA_BINARIES) + if(CUDA_FOUND) message(STATUS "Found CUDA ${CUDA_NVCC_EXECUTABLE} (${CUDA_VERSION})") else() - message(STATUS "CUDA compiler not found, disabling WITH_CYCLES_CUDA_BINARIES") - set(WITH_CYCLES_CUDA_BINARIES OFF) if(NOT WITH_CUDA_DYNLOAD) message(STATUS "Additionally falling back to dynamic CUDA load") set(WITH_CUDA_DYNLOAD ON) @@ -624,11 +621,10 @@ endif() if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP) find_package(HIP) + set_and_warn_library_found("HIP compiler" HIP_FOUND WITH_CYCLES_HIP_BINARIES) + if(HIP_FOUND) message(STATUS "Found HIP ${HIP_HIPCC_EXECUTABLE} (${HIP_VERSION})") - else() - message(STATUS "HIP compiler not found, disabling WITH_CYCLES_HIP_BINARIES") - set(WITH_CYCLES_HIP_BINARIES OFF) endif() endif() @@ -644,13 +640,17 @@ if(WITH_CYCLES_DEVICE_METAL) find_library(METAL_LIBRARY Metal) # This file was added in the 12.0 SDK, use it as a way to detect the version. - if(METAL_LIBRARY AND NOT EXISTS "${METAL_LIBRARY}/Headers/MTLFunctionStitching.h") - message(STATUS "Metal version too old, must be SDK 12.0 or newer, disabling WITH_CYCLES_DEVICE_METAL") - set(WITH_CYCLES_DEVICE_METAL OFF) - elseif(NOT METAL_LIBRARY) - message(STATUS "Metal not found, disabling WITH_CYCLES_DEVICE_METAL") - set(WITH_CYCLES_DEVICE_METAL OFF) - else() + if(METAL_LIBRARY) + if(EXISTS "${METAL_LIBRARY}/Headers/MTLFunctionStitching.h") + set(METAL_FOUND ON) + else() + message(STATUS "Metal version too old, must be SDK 12.0 or newer") + set(METAL_FOUND OFF) + endif() + endif() + + set_and_warn_library_found("Metal" METAL_FOUND WITH_CYCLES_DEVICE_METAL) + if(METAL_FOUND) message(STATUS "Found Metal: ${METAL_LIBRARY}") endif() endif() @@ -662,9 +662,10 @@ endif() if(WITH_CYCLES_DEVICE_ONEAPI) find_package(SYCL) find_package(LevelZero) + set_and_warn_library_found("oneAPI" SYCL_FOUND WITH_CYCLES_DEVICE_ONEAPI) + set_and_warn_library_found("Level Zero" LEVEL_ZERO_FOUND WITH_CYCLES_DEVICE_ONEAPI) - if(SYCL_FOUND AND LEVEL_ZERO_FOUND) - message(STATUS "Found oneAPI: ${SYCL_LIBRARY}") + if(SYCL_FOUND AND SYCL_VERSION VERSION_GREATER_EQUAL 6.0 AND LEVEL_ZERO_FOUND) message(STATUS "Found Level Zero: ${LEVEL_ZERO_LIBRARY}") if(WITH_CYCLES_ONEAPI_BINARIES) @@ -675,13 +676,14 @@ if(WITH_CYCLES_DEVICE_ONEAPI) endif() if(NOT EXISTS ${OCLOC_INSTALL_DIR}) - message(STATUS "oneAPI ocloc not found in ${OCLOC_INSTALL_DIR}, disabling WITH_CYCLES_ONEAPI_BINARIES." + set(OCLOC_FOUND OFF) + message(STATUS "oneAPI ocloc not found in ${OCLOC_INSTALL_DIR}." " A different ocloc directory can be set using OCLOC_INSTALL_DIR cmake variable.") - set(WITH_CYCLES_ONEAPI_BINARIES OFF) + set_and_warn_library_found("ocloc" OCLOC_FOUND WITH_CYCLES_ONEAPI_BINARIES) endif() endif() else() - message(STATUS "oneAPI or Level Zero not found, disabling WITH_CYCLES_DEVICE_ONEAPI") + message(STATUS "SYCL 6.0+ or Level Zero not found, disabling WITH_CYCLES_DEVICE_ONEAPI") set(WITH_CYCLES_DEVICE_ONEAPI OFF) endif() endif() diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 5516e97f34f..5296d819e42 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -187,18 +187,22 @@ if(WITH_CYCLES_DEVICE_METAL) ) endif() if (WITH_CYCLES_DEVICE_ONEAPI) + if(WITH_CYCLES_ONEAPI_BINARIES) + set(cycles_kernel_oneapi_lib_suffix "_aot") + else() + set(cycles_kernel_oneapi_lib_suffix "_jit") + endif() if(WIN32) - set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi.lib) + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.lib) else() - set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi.so) + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.so) + endif() + list(APPEND LIB ${cycles_kernel_oneapi_lib}) + if(WIN32) + list(APPEND LIB debug ${SYCL_LIBRARY_DEBUG} optimized ${SYCL_LIBRARY}) + else() + list(APPEND LIB ${SYCL_LIBRARY}) endif() - list(APPEND LIB - ${cycles_kernel_oneapi_lib} - "$<$<CONFIG:Debug>:${SYCL_LIBRARY_DEBUG}>" - "$<$<CONFIG:Release>:${SYCL_LIBRARY}>" - "$<$<CONFIG:RelWithDebInfo>:${SYCL_LIBRARY}>" - "$<$<CONFIG:MinSizeRel>:${SYCL_LIBRARY}>" - ) add_definitions(-DWITH_ONEAPI) list(APPEND SRC ${SRC_ONEAPI} diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp index 84b0a1e0dd6..69fae03e32c 100644 --- a/intern/cycles/device/cuda/queue.cpp +++ b/intern/cycles/device/cuda/queue.cpp @@ -49,7 +49,7 @@ int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const return num_states; } -int CUDADeviceQueue::num_concurrent_busy_states() const +int CUDADeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const { const int max_num_threads = cuda_device_->get_num_multiprocessors() * cuda_device_->get_max_num_threads_per_multiprocessor(); diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h index b450f5b3592..7107afe70c9 100644 --- a/intern/cycles/device/cuda/queue.h +++ b/intern/cycles/device/cuda/queue.h @@ -23,7 +23,7 @@ class CUDADeviceQueue : public DeviceQueue { ~CUDADeviceQueue(); virtual int num_concurrent_states(const size_t state_size) const override; - virtual int num_concurrent_busy_states() const override; + virtual int num_concurrent_busy_states(const size_t state_size) const override; virtual void init_execution() override; diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp index 3f8b6267100..e93a9b4df3a 100644 --- a/intern/cycles/device/hip/queue.cpp +++ b/intern/cycles/device/hip/queue.cpp @@ -49,7 +49,7 @@ int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const return num_states; } -int HIPDeviceQueue::num_concurrent_busy_states() const +int HIPDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const { const int max_num_threads = hip_device_->get_num_multiprocessors() * hip_device_->get_max_num_threads_per_multiprocessor(); diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h index 729d8a19acb..df0678108af 100644 --- a/intern/cycles/device/hip/queue.h +++ b/intern/cycles/device/hip/queue.h @@ -23,7 +23,7 @@ class HIPDeviceQueue : public DeviceQueue { ~HIPDeviceQueue(); virtual int num_concurrent_states(const size_t state_size) const override; - virtual int num_concurrent_busy_states() const override; + virtual int num_concurrent_busy_states(const size_t state_size) const override; virtual void init_execution() override; diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index d1250b83d22..6f1042b1e55 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -254,6 +254,10 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat break; } + NSProcessInfo *processInfo = [NSProcessInfo processInfo]; + NSOperatingSystemVersion macos_ver = [processInfo operatingSystemVersion]; + global_defines += "#define __KERNEL_METAL_MACOS__ " + to_string(macos_ver.majorVersion) + "\n"; + string &source = this->source[pso_type]; source = "\n#include \"kernel/device/metal/kernel.metal\"\n"; source = path_source_replace_includes(source, path_get("source")); @@ -292,9 +296,11 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat } source = global_defines + source; +# if 0 metal_printf("================\n%s================\n\%s================\n", global_defines.c_str(), baked_constants.c_str()); +# endif /* Generate an MD5 from the source and include any baked constants. This is used when caching * PSOs. */ @@ -335,6 +341,14 @@ bool MetalDevice::compile_and_load(MetalPipelineType pso_type) MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; +# if defined(MAC_OS_VERSION_13_0) + if (@available(macos 13.0, *)) { + if (device_vendor == METAL_GPU_INTEL) { + [options setOptimizationLevel:MTLLibraryOptimizationLevelSize]; + } + } +# endif + options.fastMathEnabled = YES; if (@available(macOS 12.0, *)) { options.languageVersion = MTLLanguageVersion2_4; diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 5e0cb6d18f4..55938d1a03a 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -162,6 +162,13 @@ bool ShaderCache::should_load_kernel(DeviceKernel device_kernel, } } + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) { + if ((device->kernel_features & KERNEL_FEATURE_MNEE) == 0) { + /* Skip shade_surface_mnee kernel if the scene doesn't require it. */ + return false; + } + } + if (pso_type != PSO_GENERIC) { /* Only specialize kernels where it can make an impact. */ if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || @@ -317,6 +324,12 @@ bool MetalKernelPipeline::should_use_binary_archive() const } } + /* Workaround for Intel GPU having issue using Binary Archives */ + MetalGPUVendor gpu_vendor = MetalInfo::get_device_vendor(mtlDevice); + if (gpu_vendor == METAL_GPU_INTEL) { + return false; + } + if (pso_type == PSO_GENERIC) { /* Archive the generic kernels. */ return true; diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index fc32740f3e1..2a6c12e2a60 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -23,7 +23,7 @@ class MetalDeviceQueue : public DeviceQueue { ~MetalDeviceQueue(); virtual int num_concurrent_states(const size_t) const override; - virtual int num_concurrent_busy_states() const override; + virtual int num_concurrent_busy_states(const size_t) const override; virtual int num_sort_partition_elements() const override; virtual void init_execution() override; diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index 5ac63a16c61..c0df2c8553f 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -264,33 +264,46 @@ MetalDeviceQueue::~MetalDeviceQueue() } } -int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const +int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const { - /* METAL_WIP */ - /* TODO: compute automatically. */ - /* TODO: must have at least num_threads_per_block. */ - int result = 1048576; - if (metal_device_->device_vendor == METAL_GPU_AMD) { - result *= 2; + static int result = 0; + if (result) { + return result; } - else if (metal_device_->device_vendor == METAL_GPU_APPLE) { + + result = 1048576; + if (metal_device_->device_vendor == METAL_GPU_APPLE) { result *= 4; + + if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) == APPLE_M2) { + size_t system_ram = system_physical_ram(); + size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize]; + size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize]; + + /* Determine whether we can double the state count, and leave enough GPU-available memory + * (1/8 the system RAM or 1GB - whichever is largest). Enlarging the state size allows us to + * keep dispatch sizes high and minimize work submission overheads. */ + size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024)); + size_t total_state_size = result * state_size; + if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) { + result *= 2; + metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result); + } + } + } + else if (metal_device_->device_vendor == METAL_GPU_AMD) { + /* METAL_WIP */ + /* TODO: compute automatically. */ + /* TODO: must have at least num_threads_per_block. */ + result *= 2; } return result; } -int MetalDeviceQueue::num_concurrent_busy_states() const +int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const { - /* METAL_WIP */ - /* TODO: compute automatically. */ - int result = 65536; - if (metal_device_->device_vendor == METAL_GPU_AMD) { - result *= 2; - } - else if (metal_device_->device_vendor == METAL_GPU_APPLE) { - result *= 4; - } - return result; + /* A 1:4 busy:total ratio gives best rendering performance, independent of total state count. */ + return num_concurrent_states(state_size) / 4; } int MetalDeviceQueue::num_sort_partition_elements() const diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm index 65c67c400fe..f47638fac15 100644 --- a/intern/cycles/device/metal/util.mm +++ b/intern/cycles/device/metal/util.mm @@ -110,6 +110,12 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices() usable |= (vendor == METAL_GPU_AMD); } +# if defined(MAC_OS_VERSION_13_0) + if (@available(macos 13.0, *)) { + usable |= (vendor == METAL_GPU_INTEL); + } +# endif + if (usable) { metal_printf("- %s\n", device_name.c_str()); [device retain]; diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp index f303ab41627..66d6f749e30 100644 --- a/intern/cycles/device/oneapi/device.cpp +++ b/intern/cycles/device/oneapi/device.cpp @@ -39,7 +39,7 @@ bool device_oneapi_init() _putenv_s("SYCL_CACHE_THRESHOLD", "0"); } if (getenv("SYCL_DEVICE_FILTER") == nullptr) { - _putenv_s("SYCL_DEVICE_FILTER", "host,level_zero"); + _putenv_s("SYCL_DEVICE_FILTER", "level_zero"); } if (getenv("SYCL_ENABLE_PCI") == nullptr) { _putenv_s("SYCL_ENABLE_PCI", "1"); @@ -50,7 +50,7 @@ bool device_oneapi_init() # elif __linux__ setenv("SYCL_CACHE_PERSISTENT", "1", false); setenv("SYCL_CACHE_THRESHOLD", "0", false); - setenv("SYCL_DEVICE_FILTER", "host,level_zero", false); + setenv("SYCL_DEVICE_FILTER", "level_zero", false); setenv("SYCL_ENABLE_PCI", "1", false); setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false); # endif diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp index 2df605fa047..3588b75713b 100644 --- a/intern/cycles/device/oneapi/device_impl.cpp +++ b/intern/cycles/device/oneapi/device_impl.cpp @@ -43,7 +43,7 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi } size_t globals_segment_size; - is_finished_ok = kernel_globals_size(device_queue_, globals_segment_size); + is_finished_ok = kernel_globals_size(globals_segment_size); if (is_finished_ok == false) { set_error("oneAPI constant memory initialization got runtime exception \"" + oneapi_error_string_ + "\""); @@ -88,18 +88,26 @@ BVHLayoutMask OneapiDevice::get_bvh_layout_mask() const bool OneapiDevice::load_kernels(const uint requested_features) { assert(device_queue_); - /* NOTE(@nsirgien): oneAPI can support compilation of kernel code with certain feature set - * with specialization constants, but it hasn't been implemented yet. */ - (void)requested_features; bool is_finished_ok = oneapi_run_test_kernel(device_queue_); if (is_finished_ok == false) { - set_error("oneAPI kernel load: got runtime exception \"" + oneapi_error_string_ + "\""); + set_error("oneAPI test kernel execution: got a runtime exception \"" + oneapi_error_string_ + + "\""); + return false; } else { - VLOG_INFO << "Runtime compilation done for \"" << info.description << "\""; + VLOG_INFO << "Test kernel has been executed successfully for \"" << info.description << "\""; assert(device_queue_); } + + is_finished_ok = oneapi_load_kernels(device_queue_, (const unsigned int)requested_features); + if (is_finished_ok == false) { + set_error("oneAPI kernels loading: got a runtime exception \"" + oneapi_error_string_ + "\""); + } + else { + VLOG_INFO << "Kernels loading (compilation) has been done for \"" << info.description << "\""; + } + return is_finished_ok; } @@ -422,9 +430,14 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_ sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context()); (void)usm_type; assert(usm_type == sycl::usm::alloc::device || - ((device_type == sycl::info::device_type::host || - device_type == sycl::info::device_type::cpu || allow_host) && - usm_type == sycl::usm::alloc::host)); + ((device_type == sycl::info::device_type::cpu || allow_host) && + usm_type == sycl::usm::alloc::host || + usm_type == sycl::usm::alloc::unknown)); +# else + /* Silence warning about unused arguments. */ + (void)queue_; + (void)usm_ptr; + (void)allow_host; # endif } @@ -552,7 +565,7 @@ bool OneapiDevice::queue_synchronize(SyclQueue *queue_) } } -bool OneapiDevice::kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size) +bool OneapiDevice::kernel_globals_size(size_t &kernel_global_size) { kernel_global_size = sizeof(KernelGlobalsGPU); @@ -658,14 +671,6 @@ std::vector<sycl::device> OneapiDevice::available_devices() if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) allow_all_devices = true; - /* Host device is useful only for debugging at the moment - * so we hide this device with default build settings. */ -# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED - bool allow_host = true; -# else - bool allow_host = false; -# endif - const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms(); std::vector<sycl::device> available_devices; @@ -677,17 +682,11 @@ std::vector<sycl::device> OneapiDevice::available_devices() } const std::vector<sycl::device> &oneapi_devices = - (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) : - platform.get_devices(sycl::info::device_type::gpu); + (allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) : + platform.get_devices(sycl::info::device_type::gpu); for (const sycl::device &device : oneapi_devices) { - if (allow_all_devices) { - /* still filter out host device if build doesn't support it. */ - if (allow_host || !device.is_host()) { - available_devices.push_back(device); - } - } - else { + if (!allow_all_devices) { bool filter_out = false; /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU, @@ -699,11 +698,11 @@ std::vector<sycl::device> OneapiDevice::available_devices() int number_of_eus = 96; int threads_per_eu = 7; if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) { - number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>(); + number_of_eus = device.get_info<sycl::ext::intel::info::device::gpu_eu_count>(); } if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { threads_per_eu = - device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>(); + device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>(); } /* This filters out all Level-Zero supported GPUs from older generation than Arc. */ if (number_of_eus <= 96 && threads_per_eu == 7) { @@ -719,9 +718,6 @@ std::vector<sycl::device> OneapiDevice::available_devices() } } } - else if (!allow_host && device.is_host()) { - filter_out = true; - } else if (!allow_all_devices) { filter_out = true; } @@ -784,9 +780,7 @@ char *OneapiDevice::device_capabilities() GET_NUM_ATTR(native_vector_width_double) GET_NUM_ATTR(native_vector_width_half) - size_t max_clock_frequency = - (size_t)(device.is_host() ? (size_t)0 : - device.get_info<sycl::info::device::max_clock_frequency>()); + size_t max_clock_frequency = device.get_info<sycl::info::device::max_clock_frequency>(); WRITE_ATTR("max_clock_frequency", max_clock_frequency) GET_NUM_ATTR(address_bits) @@ -824,7 +818,7 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p std::string name = device.get_info<sycl::info::device::name>(); std::string id = "ONEAPI_" + platform_name + "_" + name; if (device.has(sycl::aspect::ext_intel_pci_address)) { - id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>()); + id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>()); } (cb)(id.c_str(), name.c_str(), num, user_ptr); num++; @@ -842,7 +836,7 @@ int OneapiDevice::get_num_multiprocessors() { const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device(); if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) { - return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>(); + return device.get_info<sycl::ext::intel::info::device::gpu_eu_count>(); } else return 0; @@ -853,8 +847,8 @@ int OneapiDevice::get_max_num_threads_per_multiprocessor() const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device(); if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) && device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { - return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() * - device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>(); + return device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() * + device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>(); } else return 0; diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h index 3589e881a6e..197cf03d60d 100644 --- a/intern/cycles/device/oneapi/device_impl.h +++ b/intern/cycles/device/oneapi/device_impl.h @@ -3,7 +3,7 @@ #ifdef WITH_ONEAPI -# include <CL/sycl.hpp> +# include <sycl/sycl.hpp> # include "device/device.h" # include "device/oneapi/device.h" @@ -104,7 +104,7 @@ class OneapiDevice : public Device { int get_num_multiprocessors(); int get_max_num_threads_per_multiprocessor(); bool queue_synchronize(SyclQueue *queue); - bool kernel_globals_size(SyclQueue *queue, size_t &kernel_global_size); + bool kernel_globals_size(size_t &kernel_global_size); void set_global_memory(SyclQueue *queue, void *kernel_globals, const char *memory_name, diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp index 9632b14d485..3d019661aa8 100644 --- a/intern/cycles/device/oneapi/queue.cpp +++ b/intern/cycles/device/oneapi/queue.cpp @@ -43,7 +43,7 @@ int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const return num_states; } -int OneapiDeviceQueue::num_concurrent_busy_states() const +int OneapiDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const { const int max_num_threads = oneapi_device_->get_num_multiprocessors() * oneapi_device_->get_max_num_threads_per_multiprocessor(); diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h index 32363bf2a6e..bbd947b49cb 100644 --- a/intern/cycles/device/oneapi/queue.h +++ b/intern/cycles/device/oneapi/queue.h @@ -25,7 +25,7 @@ class OneapiDeviceQueue : public DeviceQueue { virtual int num_concurrent_states(const size_t state_size) const override; - virtual int num_concurrent_busy_states() const override; + virtual int num_concurrent_busy_states(const size_t state_size) const override; virtual void init_execution() override; diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h index 1d6a8d736b7..e27e081a407 100644 --- a/intern/cycles/device/queue.h +++ b/intern/cycles/device/queue.h @@ -103,7 +103,7 @@ class DeviceQueue { /* Number of states which keeps the device occupied with work without losing performance. * The renderer will add more work (when available) when number of active paths falls below this * value. */ - virtual int num_concurrent_busy_states() const = 0; + virtual int num_concurrent_busy_states(const size_t state_size) const = 0; /* Number of elements in a partition of sorted shaders, that improves memory locality of * integrator state fetch at the cost of decreased coherence for shader kernel execution. */ diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp index 6b033cfd051..8e8fbd86be0 100644 --- a/intern/cycles/integrator/path_trace.cpp +++ b/intern/cycles/integrator/path_trace.cpp @@ -43,8 +43,11 @@ PathTrace::PathTrace(Device *device, /* Create path tracing work in advance, so that it can be reused by incremental sampling as much * as possible. */ device_->foreach_device([&](Device *path_trace_device) { - path_trace_works_.emplace_back(PathTraceWork::create( - path_trace_device, film, device_scene, &render_cancel_.is_requested)); + unique_ptr<PathTraceWork> work = PathTraceWork::create( + path_trace_device, film, device_scene, &render_cancel_.is_requested); + if (work) { + path_trace_works_.emplace_back(std::move(work)); + } }); work_balance_infos_.resize(path_trace_works_.size()); @@ -1293,6 +1296,7 @@ void PathTrace::set_guiding_params(const GuidingParams &guiding_params, const bo # if OPENPGL_VERSION_MINOR >= 4 field_args.deterministic = guiding_params.deterministic; # endif + reinterpret_cast<PGLKDTreeArguments *>(field_args.spatialSturctureArguments)->maxDepth = 16; openpgl::cpp::Device *guiding_device = static_cast<openpgl::cpp::Device *>( device_->get_guiding_device()); if (guiding_device) { diff --git a/intern/cycles/integrator/path_trace_work.cpp b/intern/cycles/integrator/path_trace_work.cpp index bb5c6e1a61a..a5f98b5475a 100644 --- a/intern/cycles/integrator/path_trace_work.cpp +++ b/intern/cycles/integrator/path_trace_work.cpp @@ -23,6 +23,10 @@ unique_ptr<PathTraceWork> PathTraceWork::create(Device *device, if (device->info.type == DEVICE_CPU) { return make_unique<PathTraceWorkCPU>(device, film, device_scene, cancel_requested_flag); } + if (device->info.type == DEVICE_DUMMY) { + /* Dummy devices can't perform any work. */ + return nullptr; + } return make_unique<PathTraceWorkGPU>(device, film, device_scene, cancel_requested_flag); } diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp index d5ac830db58..188ec28cf65 100644 --- a/intern/cycles/integrator/path_trace_work_cpu.cpp +++ b/intern/cycles/integrator/path_trace_work_cpu.cpp @@ -285,7 +285,7 @@ void PathTraceWorkCPU::cryptomatte_postproces() } #ifdef WITH_PATH_GUIDING -/* Note: It seems that this is called before every rendering iteration/progression and not once per +/* NOTE: It seems that this is called before every rendering iteration/progression and not once per * rendering. May be we find a way to call it only once per rendering. */ void PathTraceWorkCPU::guiding_init_kernel_globals(void *guiding_field, void *sample_data_storage, diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index ee250a6916b..48f6cf3c903 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -18,13 +18,15 @@ CCL_NAMESPACE_BEGIN -static size_t estimate_single_state_size() +static size_t estimate_single_state_size(const uint kernel_features) { size_t state_size = 0; #define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) { -#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type); -#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type); +#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \ + state_size += (kernel_features & (feature)) ? sizeof(type) : 0; +#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \ + state_size += (kernel_features & (feature)) ? sizeof(type) : 0; #define KERNEL_STRUCT_END(name) \ break; \ } @@ -76,16 +78,11 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device, num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE), work_tiles_(device, "work_tiles", MEM_READ_WRITE), display_rgba_half_(device, "display buffer half", MEM_READ_WRITE), - max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size())), - min_num_active_main_paths_(queue_->num_concurrent_busy_states()), + max_num_paths_(0), + min_num_active_main_paths_(0), max_active_main_path_index_(0) { memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_)); - - /* Limit number of active paths to the half of the overall state. This is due to the logic in the - * path compaction which relies on the fact that regeneration does not happen sooner than half of - * the states are available again. */ - min_num_active_main_paths_ = min(min_num_active_main_paths_, max_num_paths_ / 2); } void PathTraceWorkGPU::alloc_integrator_soa() @@ -103,6 +100,20 @@ void PathTraceWorkGPU::alloc_integrator_soa() integrator_state_soa_volume_stack_size_ = max(integrator_state_soa_volume_stack_size_, requested_volume_stack_size); + /* Deterine the number of path states. Deferring this for as long as possible allows the backend + * to make better decisions about memory availability. */ + if (max_num_paths_ == 0) { + size_t single_state_size = estimate_single_state_size(kernel_features); + + max_num_paths_ = queue_->num_concurrent_states(single_state_size); + min_num_active_main_paths_ = queue_->num_concurrent_busy_states(single_state_size); + + /* Limit number of active paths to the half of the overall state. This is due to the logic in + * the path compaction which relies on the fact that regeneration does not happen sooner than + * half of the states are available again. */ + min_num_active_main_paths_ = min(min_num_active_main_paths_, max_num_paths_ / 2); + } + /* Allocate a device only memory buffer before for each struct member, and then * write the pointers into a struct that resides in constant memory. * diff --git a/intern/cycles/integrator/work_balancer.cpp b/intern/cycles/integrator/work_balancer.cpp index 5f1c6c92b9d..0fe170b2791 100644 --- a/intern/cycles/integrator/work_balancer.cpp +++ b/intern/cycles/integrator/work_balancer.cpp @@ -17,6 +17,9 @@ void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos) work_balance_infos[0].weight = 1.0; return; } + else if (num_infos == 0) { + return; + } /* There is no statistics available, so start with an equal distribution. */ const double weight = 1.0 / num_infos; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 8f50c7586b8..81c5f593974 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -713,10 +713,17 @@ endif() # oneAPI module if(WITH_CYCLES_DEVICE_ONEAPI) + if(WITH_CYCLES_ONEAPI_BINARIES) + set(cycles_kernel_oneapi_lib_suffix "_aot") + else() + set(cycles_kernel_oneapi_lib_suffix "_jit") + endif() + if(WIN32) - set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.dll) + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.dll) + set(cycles_kernel_oneapi_linker_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.lib) else() - set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/libcycles_kernel_oneapi.so) + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/libcycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.so) endif() set(cycles_oneapi_kernel_sources @@ -727,16 +734,9 @@ if(WITH_CYCLES_DEVICE_ONEAPI) ${SRC_UTIL_HEADERS} ) - set (ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS 1) + set (SYCL_OFFLINE_COMPILER_PARALLEL_JOBS 1 CACHE STRING "Number of parallel compiler instances to use for device binaries compilation (expect ~8GB peak memory usage per instance).") if (WITH_CYCLES_ONEAPI_BINARIES) - cmake_host_system_information(RESULT AVAILABLE_MEMORY_AMOUNT QUERY AVAILABLE_PHYSICAL_MEMORY) - # Conservative value of peak consumption here, just to be fully sure that other backend compilers will have enough memory as well - set(ONEAPI_GPU_COMPILER_MEMORY_AT_PEAK_MB 8150) - math(EXPR ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS "${AVAILABLE_MEMORY_AMOUNT} / ${ONEAPI_GPU_COMPILER_MEMORY_AT_PEAK_MB}") - if (ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS LESS 1) - set(ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS 1) - endif() - message(STATUS "${ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS} instance(s) of oneAPI offline compiler will be used.") + message(STATUS "${SYCL_OFFLINE_COMPILER_PARALLEL_JOBS} instance(s) of oneAPI offline compiler will be used.") endif() # SYCL_CPP_FLAGS is a variable that the user can set to pass extra compiler options set(sycl_compiler_flags @@ -747,7 +747,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI) -mllvm -inlinedefault-threshold=250 -mllvm -inlinehint-threshold=350 -fsycl-device-code-split=per_kernel - -fsycl-max-parallel-link-jobs=${ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS} + -fsycl-max-parallel-link-jobs=${SYCL_OFFLINE_COMPILER_PARALLEL_JOBS} -shared -DWITH_ONEAPI -ffast-math @@ -758,10 +758,6 @@ if(WITH_CYCLES_DEVICE_ONEAPI) ${SYCL_CPP_FLAGS} ) - if (WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED) - list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_ENABLED) - endif() - # Set defaults for spir64 and spir64_gen options if (NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64) set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'") @@ -774,6 +770,8 @@ if(WITH_CYCLES_DEVICE_ONEAPI) string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "-device ${CYCLES_ONEAPI_SPIR64_GEN_DEVICES} ") if (WITH_CYCLES_ONEAPI_BINARIES) + # AoT binaries aren't currently reused when calling sycl::build. + list (APPEND sycl_compiler_flags -DSYCL_SKIP_KERNELS_PRELOAD) # Iterate over all targest and their options list (JOIN CYCLES_ONEAPI_SYCL_TARGETS "," targets_string) list (APPEND sycl_compiler_flags -fsycl-targets=${targets_string}) @@ -826,12 +824,17 @@ if(WITH_CYCLES_DEVICE_ONEAPI) -DONEAPI_EXPORT) string(REPLACE /Redist/ /Tools/ MSVC_TOOLS_DIR ${MSVC_REDIST_DIR}) - if(NOT CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION) # case for Ninja on Windows + # Version Folder between Redist and Tools can mismatch sometimes + if(NOT EXISTS ${MSVC_TOOLS_DIR}) + get_filename_component(cmake_ar_dir ${CMAKE_AR} DIRECTORY) + get_filename_component(MSVC_TOOLS_DIR "${cmake_ar_dir}/../../../" ABSOLUTE) + endif() + if(CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION) + set(WINDOWS_KIT_DIR ${WINDOWS_KITS_DIR}/Lib/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION}) + else() # case for Ninja on Windows get_filename_component(cmake_mt_dir ${CMAKE_MT} DIRECTORY) string(REPLACE /bin/ /Lib/ WINDOWS_KIT_DIR ${cmake_mt_dir}) get_filename_component(WINDOWS_KIT_DIR "${WINDOWS_KIT_DIR}/../" ABSOLUTE) - else() - set(WINDOWS_KIT_DIR ${WINDOWS_KITS_DIR}/Lib/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION}) endif() list(APPEND sycl_compiler_flags -L "${MSVC_TOOLS_DIR}/lib/x64" @@ -843,15 +846,13 @@ if(WITH_CYCLES_DEVICE_ONEAPI) set(sycl_compiler_flags_RelWithDebInfo ${sycl_compiler_flags}) set(sycl_compiler_flags_MinSizeRel ${sycl_compiler_flags}) list(APPEND sycl_compiler_flags_RelWithDebInfo -g) - get_filename_component(sycl_library_debug_name ${SYCL_LIBRARY_DEBUG} NAME_WE) list(APPEND sycl_compiler_flags_Debug -g -D_DEBUG - -nostdlib -Xclang --dependent-lib=msvcrtd - -Xclang --dependent-lib=${sycl_library_debug_name}) + -nostdlib -Xclang --dependent-lib=msvcrtd) add_custom_command( - OUTPUT ${cycles_kernel_oneapi_lib} + OUTPUT ${cycles_kernel_oneapi_lib} ${cycles_kernel_oneapi_linker_lib} COMMAND ${CMAKE_COMMAND} -E env "LIB=${sycl_compiler_root}/../lib" # for compiler to find sycl.lib "PATH=${OCLOC_INSTALL_DIR}\;${sycl_compiler_root}" diff --git a/intern/cycles/kernel/bvh/shadow_all.h b/intern/cycles/kernel/bvh/shadow_all.h index 2ffe1496c72..b31ba479e4f 100644 --- a/intern/cycles/kernel/bvh/shadow_all.h +++ b/intern/cycles/kernel/bvh/shadow_all.h @@ -229,7 +229,7 @@ ccl_device_inline /* Always use baked shadow transparency for curves. */ if (isect.type & PRIMITIVE_CURVE) { *r_throughput *= intersection_curve_shadow_transparency( - kg, isect.object, isect.prim, isect.u); + kg, isect.object, isect.prim, isect.type, isect.u); if (*r_throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) { return true; diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h index a57703a8b8c..9ba787550c5 100644 --- a/intern/cycles/kernel/bvh/util.h +++ b/intern/cycles/kernel/bvh/util.h @@ -190,10 +190,8 @@ ccl_device_inline int intersection_find_attribute(KernelGlobals kg, /* Cut-off value to stop transparent shadow tracing when practically opaque. */ #define CURVE_SHADOW_TRANSPARENCY_CUTOFF 0.001f -ccl_device_inline float intersection_curve_shadow_transparency(KernelGlobals kg, - const int object, - const int prim, - const float u) +ccl_device_inline float intersection_curve_shadow_transparency( + KernelGlobals kg, const int object, const int prim, const int type, const float u) { /* Find attribute. */ const int offset = intersection_find_attribute(kg, object, ATTR_STD_SHADOW_TRANSPARENCY); @@ -204,7 +202,7 @@ ccl_device_inline float intersection_curve_shadow_transparency(KernelGlobals kg, /* Interpolate transparency between curve keys. */ const KernelCurve kcurve = kernel_data_fetch(curves, prim); - const int k0 = kcurve.first_key + PRIMITIVE_UNPACK_SEGMENT(kcurve.type); + const int k0 = kcurve.first_key + PRIMITIVE_UNPACK_SEGMENT(type); const int k1 = k0 + 1; const float f0 = kernel_data_fetch(attributes_float, offset + k0); diff --git a/intern/cycles/kernel/device/cpu/bvh.h b/intern/cycles/kernel/device/cpu/bvh.h index d9267e1cd6d..2d7d8c2d704 100644 --- a/intern/cycles/kernel/device/cpu/bvh.h +++ b/intern/cycles/kernel/device/cpu/bvh.h @@ -252,7 +252,7 @@ ccl_device void kernel_embree_filter_occluded_func(const RTCFilterFunctionNArgum /* Always use baked shadow transparency for curves. */ if (current_isect.type & PRIMITIVE_CURVE) { ctx->throughput *= intersection_curve_shadow_transparency( - kg, current_isect.object, current_isect.prim, current_isect.u); + kg, current_isect.object, current_isect.prim, current_isect.type, current_isect.u); if (ctx->throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) { ctx->opaque_hit = true; diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index c1df49c4f49..38cdcb572eb 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -23,22 +23,6 @@ CCL_NAMESPACE_BEGIN * and keep device specific code in compat.h */ #ifdef __KERNEL_ONEAPI__ -# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED -template<typename IsActiveOp> -void cpu_serial_active_index_array_impl(const uint num_states, - ccl_global int *ccl_restrict indices, - ccl_global int *ccl_restrict num_indices, - IsActiveOp is_active_op) -{ - int write_index = 0; - for (int state_index = 0; state_index < num_states; state_index++) { - if (is_active_op(state_index)) - indices[write_index++] = state_index; - } - *num_indices = write_index; - return; -} -# endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */ template<typename IsActiveOp> void gpu_parallel_active_index_array_impl(const uint num_states, @@ -182,18 +166,11 @@ __device__ num_simd_groups, \ simdgroup_offset) #elif defined(__KERNEL_ONEAPI__) -# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED -# define gpu_parallel_active_index_array( \ - blocksize, num_states, indices, num_indices, is_active_op) \ - if (ccl_gpu_global_size_x() == 1) \ - cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \ - else \ - gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op); -# else -# define gpu_parallel_active_index_array( \ - blocksize, num_states, indices, num_indices, is_active_op) \ - gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op) -# endif + +# define gpu_parallel_active_index_array( \ + blocksize, num_states, indices, num_indices, is_active_op) \ + gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op) + #else # define gpu_parallel_active_index_array( \ diff --git a/intern/cycles/kernel/device/metal/context_begin.h b/intern/cycles/kernel/device/metal/context_begin.h index 99cb1e3826e..e75ec9cadec 100644 --- a/intern/cycles/kernel/device/metal/context_begin.h +++ b/intern/cycles/kernel/device/metal/context_begin.h @@ -34,21 +34,48 @@ class MetalKernelContext { kernel_assert(0); return 0; } - + +#ifdef __KERNEL_METAL_INTEL__ + template<typename TextureType, typename CoordsType> + inline __attribute__((__always_inline__)) + auto ccl_gpu_tex_object_read_intel_workaround(TextureType texture_array, + const uint tid, const uint sid, + CoordsType coords) const + { + switch(sid) { + default: + case 0: return texture_array[tid].tex.sample(sampler(address::repeat, filter::nearest), coords); + case 1: return texture_array[tid].tex.sample(sampler(address::clamp_to_edge, filter::nearest), coords); + case 2: return texture_array[tid].tex.sample(sampler(address::clamp_to_zero, filter::nearest), coords); + case 3: return texture_array[tid].tex.sample(sampler(address::repeat, filter::linear), coords); + case 4: return texture_array[tid].tex.sample(sampler(address::clamp_to_edge, filter::linear), coords); + case 5: return texture_array[tid].tex.sample(sampler(address::clamp_to_zero, filter::linear), coords); + } + } +#endif + // texture2d template<> inline __attribute__((__always_inline__)) float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, float x, float y) const { const uint tid(tex); const uint sid(tex >> 32); +#ifndef __KERNEL_METAL_INTEL__ return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)); +#else + return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_2d, tid, sid, float2(x, y)); +#endif } template<> inline __attribute__((__always_inline__)) float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, float x, float y) const { const uint tid(tex); const uint sid(tex >> 32); +#ifndef __KERNEL_METAL_INTEL__ return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x; +#else + return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_2d, tid, sid, float2(x, y)).x; +#endif } // texture3d @@ -57,14 +84,22 @@ class MetalKernelContext { float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object_3D tex, float x, float y, float z) const { const uint tid(tex); const uint sid(tex >> 32); +#ifndef __KERNEL_METAL_INTEL__ return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)); +#else + return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_3d, tid, sid, float3(x, y, z)); +#endif } template<> inline __attribute__((__always_inline__)) float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object_3D tex, float x, float y, float z) const { const uint tid(tex); const uint sid(tex >> 32); +#ifndef __KERNEL_METAL_INTEL__ return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x; +#else + return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_3d, tid, sid, float3(x, y, z)).x; +#endif } # include "kernel/device/gpu/image.h" diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal index 5646c7446db..8b69ee025cd 100644 --- a/intern/cycles/kernel/device/metal/kernel.metal +++ b/intern/cycles/kernel/device/metal/kernel.metal @@ -228,7 +228,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal, /* Always use baked shadow transparency for curves. */ if (type & PRIMITIVE_CURVE) { float throughput = payload.throughput; - throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, u); + throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, type, u); payload.throughput = throughput; payload.num_hits += 1; diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h index 8ae40b0612e..dfaec65130c 100644 --- a/intern/cycles/kernel/device/oneapi/compat.h +++ b/intern/cycles/kernel/device/oneapi/compat.h @@ -55,18 +55,6 @@ #define ccl_gpu_kernel(block_num_threads, thread_num_registers) #define ccl_gpu_kernel_threads(block_num_threads) -#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED -# define KG_ND_ITEMS \ - kg->nd_item_local_id_0 = item.get_local_id(0); \ - kg->nd_item_local_range_0 = item.get_local_range(0); \ - kg->nd_item_group_0 = item.get_group(0); \ - kg->nd_item_group_range_0 = item.get_group_range(0); \ - kg->nd_item_global_id_0 = item.get_global_id(0); \ - kg->nd_item_global_range_0 = item.get_global_range(0); -#else -# define KG_ND_ITEMS -#endif - #define ccl_gpu_kernel_signature(name, ...) \ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ size_t kernel_global_size, \ @@ -76,8 +64,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ (kg); \ cgh.parallel_for<class kernel_##name>( \ sycl::nd_range<1>(kernel_global_size, kernel_local_size), \ - [=](sycl::nd_item<1> item) { \ - KG_ND_ITEMS + [=](sycl::nd_item<1> item) { #define ccl_gpu_kernel_postfix \ }); \ @@ -95,31 +82,17 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg) /* GPU thread, block, grid size and index */ -#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED -# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0)) -# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0)) -# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0)) -# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0)) -# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) -# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) - -# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0)) -# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0)) -#else -# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0) -# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0) -# define ccl_gpu_block_idx_x (kg->nd_item_group_0) -# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0) -# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) -# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) - -# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0) -# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0) -#endif +#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0)) +#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0)) +#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0)) +#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0)) +#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) +#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) +#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0)) +#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0)) /* GPU warp synchronization */ - #define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier() #define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space) #ifdef __SYCL_DEVICE_ONLY__ diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h index d60f4f135ba..116620eb725 100644 --- a/intern/cycles/kernel/device/oneapi/globals.h +++ b/intern/cycles/kernel/device/oneapi/globals.h @@ -23,15 +23,6 @@ typedef struct KernelGlobalsGPU { #undef KERNEL_DATA_ARRAY IntegratorStateGPU *integrator_state; const KernelData *__data; -#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED - size_t nd_item_local_id_0; - size_t nd_item_local_range_0; - size_t nd_item_group_0; - size_t nd_item_group_range_0; - - size_t nd_item_global_id_0; - size_t nd_item_global_range_0; -#endif } KernelGlobalsGPU; typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals; diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp index 1d1700f036d..525ae288f0c 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.cpp +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -8,7 +8,7 @@ # include <map> # include <set> -# include <CL/sycl.hpp> +# include <sycl/sycl.hpp> # include "kernel/device/oneapi/compat.h" # include "kernel/device/oneapi/globals.h" @@ -25,38 +25,57 @@ void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr) s_error_user_ptr = user_ptr; } -/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and - * also trigger runtime compilation of all existing oneAPI kernels */ +/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality like + * memory allocations, memory transfers and execution of kernel with USM memory. */ bool oneapi_run_test_kernel(SyclQueue *queue_) { assert(queue_); sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - size_t N = 8; - sycl::buffer<float, 1> A(N); - sycl::buffer<float, 1> B(N); - - { - sycl::host_accessor A_host_acc(A, sycl::write_only); - for (size_t i = (size_t)0; i < N; i++) - A_host_acc[i] = rand() % 32; - } + const size_t N = 8; + const size_t memory_byte_size = sizeof(int) * N; + bool is_computation_correct = true; try { - queue->submit([&](sycl::handler &cgh) { - sycl::accessor A_acc(A, cgh, sycl::read_only); - sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init); + int *A_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue); + + for (size_t i = (size_t)0; i < N; i++) { + A_host[i] = rand() % 32; + } + + int *A_device = (int *)sycl::malloc_device(memory_byte_size, *queue); + int *B_device = (int *)sycl::malloc_device(memory_byte_size, *queue); - cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); }); + queue->memcpy(A_device, A_host, memory_byte_size); + queue->wait_and_throw(); + + queue->submit([&](sycl::handler &cgh) { + cgh.parallel_for(N, [=](sycl::id<1> idx) { B_device[idx] = A_device[idx] + idx.get(0); }); }); queue->wait_and_throw(); - sycl::host_accessor A_host_acc(A, sycl::read_only); - sycl::host_accessor B_host_acc(B, sycl::read_only); + int *B_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue); + + queue->memcpy(B_host, B_device, memory_byte_size); + queue->wait_and_throw(); for (size_t i = (size_t)0; i < N; i++) { - float result = A_host_acc[i] + B_host_acc[i]; - (void)result; + const int expected_result = i + A_host[i]; + if (B_host[i] != expected_result) { + is_computation_correct = false; + if (s_error_cb) { + s_error_cb(("Incorrect result in test kernel execution - expected " + + std::to_string(expected_result) + ", got " + std::to_string(B_host[i])) + .c_str(), + s_error_user_ptr); + } + } } + + sycl::free(A_host, *queue); + sycl::free(B_host, *queue); + sycl::free(A_device, *queue); + sycl::free(B_device, *queue); + queue->wait_and_throw(); } catch (sycl::exception const &e) { if (s_error_cb) { @@ -65,7 +84,7 @@ bool oneapi_run_test_kernel(SyclQueue *queue_) return false; } - return true; + return is_computation_correct; } /* TODO: Move device information to OneapiDevice initialized on creation and use it. */ @@ -123,6 +142,56 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue, return std::min(limit_work_group_size, preferred_work_group_size); } +bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features) +{ +# ifdef SYCL_SKIP_KERNELS_PRELOAD + (void)queue_; + (void)requested_features; +# else + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + + try { + sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle = + sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), + {queue->get_device()}); + + for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) { + const std::string &kernel_name = kernel_id.get_name(); + + /* NOTE(@nsirgien): Names in this conditions below should match names from + * oneapi_call macro in oneapi_enqueue_kernel below */ + if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) && + kernel_name.find("oneapi_kernel_integrator_shade_volume") != std::string::npos) { + continue; + } + + if (((requested_features & KERNEL_FEATURE_MNEE) == 0) && + kernel_name.find("oneapi_kernel_integrator_shade_surface_mnee") != std::string::npos) { + continue; + } + + if (((requested_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) && + kernel_name.find("oneapi_kernel_integrator_shade_surface_raytrace") != + std::string::npos) { + continue; + } + + sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle = + sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id}); + sycl::build(one_kernel_bundle); + } + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } +# endif + return true; +} + bool oneapi_enqueue_kernel(KernelContext *kernel_context, int kernel, size_t global_size, @@ -161,13 +230,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices, * we extend work size to fit uniformity requirements. */ global_size = groups_count * local_size; - -# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED - if (queue->get_device().is_host()) { - global_size = 1; - local_size = 1; - } -# endif } /* Let the compiler throw an error if there are any kernels missing in this implementation. */ diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h index 7456d0e4902..2bfc0b89c87 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.h +++ b/intern/cycles/kernel/device/oneapi/kernel.h @@ -48,6 +48,8 @@ CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context, int kernel, size_t global_size, void **args); +CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_load_kernels(SyclQueue *queue, + const unsigned int requested_features); # ifdef __cplusplus } # endif diff --git a/intern/cycles/kernel/device/optix/bvh.h b/intern/cycles/kernel/device/optix/bvh.h index fb9907709ce..6d81b44660c 100644 --- a/intern/cycles/kernel/device/optix/bvh.h +++ b/intern/cycles/kernel/device/optix/bvh.h @@ -202,7 +202,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() /* Always use baked shadow transparency for curves. */ if (type & PRIMITIVE_CURVE) { float throughput = __uint_as_float(optixGetPayload_1()); - throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u); + throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, type, u); optixSetPayload_1(__float_as_uint(throughput)); optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1)); diff --git a/intern/cycles/kernel/integrator/mnee.h b/intern/cycles/kernel/integrator/mnee.h index 038f0379bbc..23885306885 100644 --- a/intern/cycles/kernel/integrator/mnee.h +++ b/intern/cycles/kernel/integrator/mnee.h @@ -279,7 +279,15 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg, } /* Compute constraint derivatives. */ -ccl_device_forceinline bool mnee_compute_constraint_derivatives( + +# if defined(__KERNEL_METAL__) +/* Temporary workaround for front-end compilation bug (incorrect MNEE rendering when this is + * inlined). */ +__attribute__((noinline)) +# else +ccl_device_forceinline +# endif +bool mnee_compute_constraint_derivatives( int vertex_count, ccl_private ManifoldVertex *vertices, ccl_private const float3 &surface_sample_pos, diff --git a/intern/cycles/kernel/integrator/state_flow.h b/intern/cycles/kernel/integrator/state_flow.h index 4b03c665e17..40961b1c5fb 100644 --- a/intern/cycles/kernel/integrator/state_flow.h +++ b/intern/cycles/kernel/integrator/state_flow.h @@ -76,6 +76,9 @@ ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init( &kernel_integrator_state.next_shadow_path_index[0], 1); atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel; +# ifdef __PATH_GUIDING__ + INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, path_segment) = nullptr; +# endif return shadow_state; } @@ -181,6 +184,9 @@ ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init( { IntegratorShadowState shadow_state = (is_ao) ? &state->ao : &state->shadow; INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel; +# ifdef __PATH_GUIDING__ + INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, path_segment) = nullptr; +# endif return shadow_state; } diff --git a/intern/cycles/kernel/sample/pattern.h b/intern/cycles/kernel/sample/pattern.h index ebdecc1bff9..e12f333b3a5 100644 --- a/intern/cycles/kernel/sample/pattern.h +++ b/intern/cycles/kernel/sample/pattern.h @@ -100,7 +100,7 @@ ccl_device_inline bool sample_is_class_A(int pattern, int sample) if (!(pattern == SAMPLING_PATTERN_PMJ || pattern == SAMPLING_PATTERN_SOBOL_BURLEY)) { /* Fallback: assign samples randomly. * This is guaranteed to work "okay" for any sampler, but isn't good. - * (Note: the seed constant is just a random number to guard against + * (NOTE: the seed constant is just a random number to guard against * possible interactions with other uses of the hash. There's nothing * special about it.) */ diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index 1469d915d15..8f7cfd19169 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -85,9 +85,9 @@ CCL_NAMESPACE_BEGIN # define __VOLUME_RECORD_ALL__ #endif /* !__KERNEL_GPU__ */ -/* MNEE currently causes "Compute function exceeds available temporary registers" - * on Metal, disabled for now. */ -#ifndef __KERNEL_METAL__ +/* MNEE caused "Compute function exceeds available temporary registers" in macOS < 13 due to a bug + * in spill buffer allocation sizing. */ +#if !defined(__KERNEL_METAL__) || (__KERNEL_METAL_MACOS__ >= 13) # define __MNEE__ #endif diff --git a/intern/cycles/scene/image_oiio.cpp b/intern/cycles/scene/image_oiio.cpp index 8792393e5a1..7bcf1ccb073 100644 --- a/intern/cycles/scene/image_oiio.cpp +++ b/intern/cycles/scene/image_oiio.cpp @@ -192,8 +192,22 @@ bool OIIOImageLoader::load_pixels(const ImageMetaData &metadata, return false; } - const bool do_associate_alpha = associate_alpha && - spec.get_int_attribute("oiio:UnassociatedAlpha", 0); + bool do_associate_alpha = false; + if (associate_alpha) { + do_associate_alpha = spec.get_int_attribute("oiio:UnassociatedAlpha", 0); + + if (!do_associate_alpha && spec.alpha_channel != -1) { + /* Workaround OIIO not detecting TGA file alpha the same as Blender (since #3019). + * We want anything not marked as premultiplied alpha to get associated. */ + if (strcmp(in->format_name(), "targa") == 0) { + do_associate_alpha = spec.get_int_attribute("targa:alpha_type", -1) != 4; + } + /* OIIO DDS reader never sets UnassociatedAlpha attribute. */ + if (strcmp(in->format_name(), "dds") == 0) { + do_associate_alpha = true; + } + } + } switch (metadata.type) { case IMAGE_DATA_TYPE_BYTE: diff --git a/intern/cycles/session/session.cpp b/intern/cycles/session/session.cpp index a0eb3196a34..acaa55f4990 100644 --- a/intern/cycles/session/session.cpp +++ b/intern/cycles/session/session.cpp @@ -43,6 +43,10 @@ Session::Session(const SessionParams ¶ms_, const SceneParams &scene_params) device = Device::create(params.device, stats, profiler); + if (device->have_error()) { + progress.set_error(device->error_message()); + } + scene = new Scene(scene_params, device); /* Configure path tracer. */ diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h index 0905b3ec5c9..3a2e0e074a2 100644 --- a/intern/cycles/util/math.h +++ b/intern/cycles/util/math.h @@ -417,15 +417,11 @@ ccl_device_inline int floor_to_int(float f) return float_to_int(floorf(f)); } -ccl_device_inline int quick_floor_to_int(float x) -{ - return float_to_int(x) - ((x < 0) ? 1 : 0); -} - ccl_device_inline float floorfrac(float x, ccl_private int *i) { - *i = quick_floor_to_int(x); - return x - *i; + float f = floorf(x); + *i = float_to_int(f); + return x - f; } ccl_device_inline int ceil_to_int(float f) diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h index c408eadf195..eec7122b9dc 100644 --- a/intern/cycles/util/math_float3.h +++ b/intern/cycles/util/math_float3.h @@ -535,18 +535,6 @@ ccl_device_inline float3 pow(float3 v, float e) return make_float3(powf(v.x, e), powf(v.y, e), powf(v.z, e)); } -ccl_device_inline int3 quick_floor_to_int3(const float3 a) -{ -#ifdef __KERNEL_SSE__ - int3 b = int3(_mm_cvttps_epi32(a.m128)); - int3 isneg = int3(_mm_castps_si128(_mm_cmplt_ps(a.m128, _mm_set_ps1(0.0f)))); - /* Unsaturated add 0xffffffff is the same as subtract -1. */ - return b + isneg; -#else - return make_int3(quick_floor_to_int(a.x), quick_floor_to_int(a.y), quick_floor_to_int(a.z)); -#endif -} - ccl_device_inline bool isfinite_safe(float3 v) { return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z); diff --git a/intern/cycles/util/ssef.h b/intern/cycles/util/ssef.h index a2fff94303e..1e2bfa90354 100644 --- a/intern/cycles/util/ssef.h +++ b/intern/cycles/util/ssef.h @@ -5,6 +5,8 @@ #ifndef __UTIL_SSEF_H__ #define __UTIL_SSEF_H__ +#include <math.h> + #include "util/ssei.h" CCL_NAMESPACE_BEGIN @@ -521,7 +523,7 @@ __forceinline const ssef round_zero(const ssef &a) __forceinline const ssef floor(const ssef &a) { # ifdef __KERNEL_NEON__ - return vrndnq_f32(a); + return vrndmq_f32(a); # else return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF); # endif @@ -534,6 +536,12 @@ __forceinline const ssef ceil(const ssef &a) return _mm_round_ps(a, _MM_FROUND_TO_POS_INF); # endif } +# else +/* Non-SSE4.1 fallback, needed for floorfrac. */ +__forceinline const ssef floor(const ssef &a) +{ + return _mm_set_ps(floorf(a.f[3]), floorf(a.f[2]), floorf(a.f[1]), floorf(a.f[0])); +} # endif __forceinline ssei truncatei(const ssef &a) @@ -541,20 +549,11 @@ __forceinline ssei truncatei(const ssef &a) return _mm_cvttps_epi32(a.m128); } -/* This is about 25% faster than straightforward floor to integer conversion - * due to better pipelining. - * - * Unsaturated add 0xffffffff (a < 0) is the same as subtract -1. - */ -__forceinline ssei floori(const ssef &a) -{ - return truncatei(a) + cast((a < 0.0f).m128); -} - __forceinline ssef floorfrac(const ssef &x, ssei *i) { - *i = floori(x); - return x - ssef(*i); + ssef f = floor(x); + *i = truncatei(f); + return x - f; } //////////////////////////////////////////////////////////////////////////////// |