git.blender.org/blender.git
Diffstat (limited to 'intern/cycles'):

 intern/cycles/CMakeLists.txt | 6
 intern/cycles/app/CMakeLists.txt | 5
 intern/cycles/blender/addon/engine.py | 15
 intern/cycles/blender/addon/properties.py | 1
 intern/cycles/blender/pointcloud.cpp | 4
 intern/cycles/blender/shader.cpp | 4
 intern/cycles/blender/sync.cpp | 206
 intern/cycles/cmake/external_libs.cmake | 56
 intern/cycles/device/CMakeLists.txt | 22
 intern/cycles/device/cuda/queue.cpp | 2
 intern/cycles/device/cuda/queue.h | 2
 intern/cycles/device/hip/queue.cpp | 2
 intern/cycles/device/hip/queue.h | 2
 intern/cycles/device/metal/device_impl.mm | 14
 intern/cycles/device/metal/kernel.mm | 13
 intern/cycles/device/metal/queue.h | 2
 intern/cycles/device/metal/queue.mm | 51
 intern/cycles/device/metal/util.mm | 6
 intern/cycles/device/oneapi/device.cpp | 4
 intern/cycles/device/oneapi/device_impl.cpp | 72
 intern/cycles/device/oneapi/device_impl.h | 4
 intern/cycles/device/oneapi/queue.cpp | 2
 intern/cycles/device/oneapi/queue.h | 2
 intern/cycles/device/queue.h | 2
 intern/cycles/integrator/path_trace.cpp | 8
 intern/cycles/integrator/path_trace_work.cpp | 4
 intern/cycles/integrator/path_trace_work_cpu.cpp | 2
 intern/cycles/integrator/path_trace_work_gpu.cpp | 31
 intern/cycles/integrator/work_balancer.cpp | 3
 intern/cycles/kernel/CMakeLists.txt | 47
 intern/cycles/kernel/bvh/shadow_all.h | 2
 intern/cycles/kernel/bvh/util.h | 8
 intern/cycles/kernel/device/cpu/bvh.h | 2
 intern/cycles/kernel/device/gpu/parallel_active_index.h | 33
 intern/cycles/kernel/device/metal/context_begin.h | 37
 intern/cycles/kernel/device/metal/kernel.metal | 2
 intern/cycles/kernel/device/oneapi/compat.h | 45
 intern/cycles/kernel/device/oneapi/globals.h | 9
 intern/cycles/kernel/device/oneapi/kernel.cpp | 118
 intern/cycles/kernel/device/oneapi/kernel.h | 2
 intern/cycles/kernel/device/optix/bvh.h | 2
 intern/cycles/kernel/integrator/mnee.h | 10
 intern/cycles/kernel/integrator/state_flow.h | 6
 intern/cycles/kernel/sample/pattern.h | 2
 intern/cycles/kernel/types.h | 6
 intern/cycles/scene/image_oiio.cpp | 18
 intern/cycles/session/session.cpp | 4
 intern/cycles/util/math.h | 10
 intern/cycles/util/math_float3.h | 12
 intern/cycles/util/ssef.h | 25
 50 files changed, 508 insertions(+), 439 deletions(-)
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index f619e6b104e..329aa3990f6 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -263,8 +263,7 @@ if(WITH_CYCLES_DEVICE_OPTIX)
${OPTIX_INCLUDE_DIR}
)
else()
- message(STATUS "OptiX not found, disabling it from Cycles")
- set(WITH_CYCLES_DEVICE_OPTIX OFF)
+ set_and_warn_library_found("OptiX" OPTIX_FOUND WITH_CYCLES_DEVICE_OPTIX)
endif()
endif()
@@ -387,8 +386,7 @@ if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_C_COMPILER_ID MATCHES "Clang")
endif()
if(WITH_CYCLES_HYDRA_RENDER_DELEGATE AND (NOT WITH_USD))
- message(STATUS "USD not found, disabling WITH_CYCLES_HYDRA_RENDER_DELEGATE")
- set(WITH_CYCLES_HYDRA_RENDER_DELEGATE OFF)
+ set_and_warn_library_found("USD" WITH_USD WITH_CYCLES_HYDRA_RENDER_DELEGATE)
endif()
if(WITH_CYCLES_HYDRA_RENDER_DELEGATE AND (NOT WITH_BLENDER) AND (NOT WITH_CYCLES_STANDALONE))
set(CYCLES_INSTALL_PATH ${CYCLES_INSTALL_PATH}/hdCycles/resources)
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index 0988b1c0ac4..1c7a861ea93 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -43,7 +43,10 @@ else()
endif()
if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
- list(APPEND INC_SYS ${Epoxy_INCLUDE_DIRS} ${SDL2_INCLUDE_DIRS})
+ list(APPEND INC_SYS
+ ${Epoxy_INCLUDE_DIRS}
+ ${SDL2_INCLUDE_DIRS}
+ )
list(APPEND LIB ${Epoxy_LIBRARIES} ${SDL2_LIBRARIES})
endif()
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 794338fe78e..e33891fa7a2 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -209,22 +209,25 @@ def list_render_passes(scene, srl):
yield ("Debug Sample Count", "X", 'VALUE')
# Cryptomatte passes.
- crypto_depth = (srl.pass_cryptomatte_depth + 1) // 2
+ # NOTE: Channels are named with lowercase rgba so that the compression rule check in the
+ # OpenEXR DWA code uses lossless compression. Reportedly this naming is the only one that
+ # works well from the interoperability point of view. Using XYZW naming is not portable.
+ crypto_depth = (min(16, srl.pass_cryptomatte_depth) + 1) // 2
if srl.use_pass_cryptomatte_object:
for i in range(0, crypto_depth):
- yield ("CryptoObject" + '{:02d}'.format(i), "RGBA", 'COLOR')
+ yield ("CryptoObject" + '{:02d}'.format(i), "rgba", 'COLOR')
if srl.use_pass_cryptomatte_material:
for i in range(0, crypto_depth):
- yield ("CryptoMaterial" + '{:02d}'.format(i), "RGBA", 'COLOR')
+ yield ("CryptoMaterial" + '{:02d}'.format(i), "rgba", 'COLOR')
if srl.use_pass_cryptomatte_asset:
for i in range(0, crypto_depth):
- yield ("CryptoAsset" + '{:02d}'.format(i), "RGBA", 'COLOR')
+ yield ("CryptoAsset" + '{:02d}'.format(i), "rgba", 'COLOR')
# Denoising passes.
if scene.cycles.use_denoising and crl.use_denoising:
yield ("Noisy Image", "RGBA", 'COLOR')
if crl.use_pass_shadow_catcher:
- yield ("Noisy Shadow Catcher", "RGBA", 'COLOR')
+ yield ("Noisy Shadow Catcher", "RGB", 'COLOR')
if crl.denoising_store_passes:
yield ("Denoising Normal", "XYZ", 'VECTOR')
yield ("Denoising Albedo", "RGB", 'COLOR')
@@ -232,6 +235,8 @@ def list_render_passes(scene, srl):
# Custom AOV passes.
for aov in srl.aovs:
+ if not aov.is_valid:
+ continue
if aov.type == 'VALUE':
yield (aov.name, "X", 'VALUE')
else:
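The depth arithmetic above is worth spelling out: each cryptomatte RGBA layer stores two ID/weight pairs, so the number of passes is the user-facing pair depth (clamped to 16) divided by two, rounded up, which is exactly the divide_up() form used in sync.cpp further down. A minimal C++ sketch of the same calculation; the pass-name printing is illustrative only:

    #include <algorithm>
    #include <cstdio>

    /* Each cryptomatte RGBA layer holds two ID/weight pairs, so the layer
     * count is the pair depth divided by two, rounded up. */
    static int cryptomatte_layer_count(int pass_depth)
    {
      const int clamped = std::min(16, pass_depth); /* Hard limit of 16 pairs. */
      return (clamped + 1) / 2;                     /* Same as divide_up(clamped, 2). */
    }

    int main()
    {
      for (int i = 0; i < cryptomatte_layer_count(5); i++) {
        /* Lowercase "rgba" channels keep OpenEXR DWA compression lossless. */
        std::printf("CryptoObject%02d (channels: rgba)\n", i);
      }
      return 0; /* A depth of 5 pairs yields layers 00..02. */
    }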
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index b7ce76d8f44..425d123e9e6 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1651,6 +1651,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
box.prop(
device, "use", text=device.name
.replace('(TM)', unicodedata.lookup('TRADE MARK SIGN'))
+ .replace('(tm)', unicodedata.lookup('TRADE MARK SIGN'))
.replace('(R)', unicodedata.lookup('REGISTERED SIGN'))
.replace('(C)', unicodedata.lookup('COPYRIGHT SIGN'))
)
diff --git a/intern/cycles/blender/pointcloud.cpp b/intern/cycles/blender/pointcloud.cpp
index 35be2916e43..a679a92d997 100644
--- a/intern/cycles/blender/pointcloud.cpp
+++ b/intern/cycles/blender/pointcloud.cpp
@@ -194,7 +194,7 @@ static void export_pointcloud(Scene *scene,
/* Export points. */
for (int i = 0; i < num_points; i++) {
const float3 co = get_float3(b_attr_position.data[i].vector());
- const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.0f;
+ const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.01f;
pointcloud->add_point(co, radius);
/* Random number per point. */
@@ -232,7 +232,7 @@ static void export_pointcloud_motion(PointCloud *pointcloud,
for (int i = 0; i < std::min(num_points, b_points_num); i++) {
const float3 co = get_float3(b_attr_position.data[i].vector());
- const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.0f;
+ const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.01f;
float3 P = co;
P.w = radius;
mP[i] = P;
diff --git a/intern/cycles/blender/shader.cpp b/intern/cycles/blender/shader.cpp
index fd32e7ca1d7..dbc49df7f22 100644
--- a/intern/cycles/blender/shader.cpp
+++ b/intern/cycles/blender/shader.cpp
@@ -215,7 +215,9 @@ static void set_default_value(ShaderInput *input,
}
case SocketType::INT: {
if (b_sock.type() == BL::NodeSocket::type_BOOLEAN) {
- node->set(socket, get_boolean(b_sock.ptr, "default_value"));
+ /* Make sure to call the int overload of set() since this is an integer socket as far as
+ * Cycles is concerned. */
+ node->set(socket, get_boolean(b_sock.ptr, "default_value") ? 1 : 0);
}
else {
node->set(socket, get_int(b_sock.ptr, "default_value"));
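The ternary above is subtle: with overloaded setters, passing the boolean straight through resolves to a bool overload rather than the int one. A minimal sketch of the pitfall, using hypothetical stand-ins for Cycles' Node::set() overloads:

    #include <cstdio>

    /* Hypothetical stand-ins for overloaded socket setters. */
    static void set_socket(int value)  { std::printf("int overload: %d\n", value); }
    static void set_socket(bool value) { std::printf("bool overload: %d\n", int(value)); }

    int main()
    {
      const bool default_value = true;
      set_socket(default_value);         /* Resolves to the bool overload. */
      set_socket(default_value ? 1 : 0); /* Forces the int overload, as in the patch. */
      return 0;
    }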
diff --git a/intern/cycles/blender/sync.cpp b/intern/cycles/blender/sync.cpp
index a69a94614d3..5251f0fee9c 100644
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -575,68 +575,72 @@ void BlenderSync::sync_images()
/* Passes */
-static PassType get_blender_pass_type(BL::RenderPass &b_pass)
+static bool get_known_pass_type(BL::RenderPass &b_pass, PassType &type, PassMode &mode)
{
string name = b_pass.name();
-#define MAP_PASS(passname, passtype) \
+#define MAP_PASS(passname, passtype, noisy) \
if (name == passname) { \
- return passtype; \
+ type = passtype; \
+ mode = (noisy) ? PassMode::NOISY : PassMode::DENOISED; \
+ return true; \
} \
((void)0)
- /* NOTE: Keep in sync with defined names from DNA_scene_types.h */
+ /* NOTE: Keep in sync with defined names from engine.py */
- MAP_PASS("Combined", PASS_COMBINED);
- MAP_PASS("Noisy Image", PASS_COMBINED);
+ MAP_PASS("Combined", PASS_COMBINED, false);
+ MAP_PASS("Noisy Image", PASS_COMBINED, true);
- MAP_PASS("Depth", PASS_DEPTH);
- MAP_PASS("Mist", PASS_MIST);
- MAP_PASS("Position", PASS_POSITION);
- MAP_PASS("Normal", PASS_NORMAL);
- MAP_PASS("IndexOB", PASS_OBJECT_ID);
- MAP_PASS("UV", PASS_UV);
- MAP_PASS("Vector", PASS_MOTION);
- MAP_PASS("IndexMA", PASS_MATERIAL_ID);
+ MAP_PASS("Depth", PASS_DEPTH, false);
+ MAP_PASS("Mist", PASS_MIST, false);
+ MAP_PASS("Position", PASS_POSITION, false);
+ MAP_PASS("Normal", PASS_NORMAL, false);
+ MAP_PASS("IndexOB", PASS_OBJECT_ID, false);
+ MAP_PASS("UV", PASS_UV, false);
+ MAP_PASS("Vector", PASS_MOTION, false);
+ MAP_PASS("IndexMA", PASS_MATERIAL_ID, false);
- MAP_PASS("DiffDir", PASS_DIFFUSE_DIRECT);
- MAP_PASS("GlossDir", PASS_GLOSSY_DIRECT);
- MAP_PASS("TransDir", PASS_TRANSMISSION_DIRECT);
- MAP_PASS("VolumeDir", PASS_VOLUME_DIRECT);
+ MAP_PASS("DiffDir", PASS_DIFFUSE_DIRECT, false);
+ MAP_PASS("GlossDir", PASS_GLOSSY_DIRECT, false);
+ MAP_PASS("TransDir", PASS_TRANSMISSION_DIRECT, false);
+ MAP_PASS("VolumeDir", PASS_VOLUME_DIRECT, false);
- MAP_PASS("DiffInd", PASS_DIFFUSE_INDIRECT);
- MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT);
- MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT);
- MAP_PASS("VolumeInd", PASS_VOLUME_INDIRECT);
+ MAP_PASS("DiffInd", PASS_DIFFUSE_INDIRECT, false);
+ MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT, false);
+ MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT, false);
+ MAP_PASS("VolumeInd", PASS_VOLUME_INDIRECT, false);
- MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR);
- MAP_PASS("GlossCol", PASS_GLOSSY_COLOR);
- MAP_PASS("TransCol", PASS_TRANSMISSION_COLOR);
+ MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR, false);
+ MAP_PASS("GlossCol", PASS_GLOSSY_COLOR, false);
+ MAP_PASS("TransCol", PASS_TRANSMISSION_COLOR, false);
- MAP_PASS("Emit", PASS_EMISSION);
- MAP_PASS("Env", PASS_BACKGROUND);
- MAP_PASS("AO", PASS_AO);
- MAP_PASS("Shadow", PASS_SHADOW);
+ MAP_PASS("Emit", PASS_EMISSION, false);
+ MAP_PASS("Env", PASS_BACKGROUND, false);
+ MAP_PASS("AO", PASS_AO, false);
+ MAP_PASS("Shadow", PASS_SHADOW, false);
- MAP_PASS("BakePrimitive", PASS_BAKE_PRIMITIVE);
- MAP_PASS("BakeDifferential", PASS_BAKE_DIFFERENTIAL);
+ MAP_PASS("BakePrimitive", PASS_BAKE_PRIMITIVE, false);
+ MAP_PASS("BakeDifferential", PASS_BAKE_DIFFERENTIAL, false);
- MAP_PASS("Denoising Normal", PASS_DENOISING_NORMAL);
- MAP_PASS("Denoising Albedo", PASS_DENOISING_ALBEDO);
- MAP_PASS("Denoising Depth", PASS_DENOISING_DEPTH);
+ MAP_PASS("Denoising Normal", PASS_DENOISING_NORMAL, true);
+ MAP_PASS("Denoising Albedo", PASS_DENOISING_ALBEDO, true);
+ MAP_PASS("Denoising Depth", PASS_DENOISING_DEPTH, true);
- MAP_PASS("Shadow Catcher", PASS_SHADOW_CATCHER);
- MAP_PASS("Noisy Shadow Catcher", PASS_SHADOW_CATCHER);
+ MAP_PASS("Shadow Catcher", PASS_SHADOW_CATCHER, false);
+ MAP_PASS("Noisy Shadow Catcher", PASS_SHADOW_CATCHER, true);
- MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER);
- MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT);
+ MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER, false);
+ MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT, false);
if (string_startswith(name, cryptomatte_prefix)) {
- return PASS_CRYPTOMATTE;
+ type = PASS_CRYPTOMATTE;
+ mode = PassMode::DENOISED;
+ return true;
}
#undef MAP_PASS
- return PASS_NONE;
+ return false;
}
static Pass *pass_add(Scene *scene,
@@ -655,8 +659,6 @@ static Pass *pass_add(Scene *scene,
void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer)
{
- PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
-
/* Delete all existing passes. */
set<Pass *> clear_passes(scene->passes.begin(), scene->passes.end());
scene->delete_nodes(clear_passes);
@@ -664,103 +666,23 @@ void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_v
/* Always add combined pass. */
pass_add(scene, PASS_COMBINED, "Combined");
- /* Blender built-in data and light passes. */
- for (BL::RenderPass &b_pass : b_rlay.passes) {
- const PassType pass_type = get_blender_pass_type(b_pass);
-
- if (pass_type == PASS_NONE) {
- LOG(ERROR) << "Unknown pass " << b_pass.name();
- continue;
- }
-
- if (pass_type == PASS_MOTION &&
- (b_view_layer.use_motion_blur() && b_scene.render().use_motion_blur())) {
- continue;
- }
-
- pass_add(scene, pass_type, b_pass.name().c_str());
- }
-
- PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
-
- /* Debug passes. */
- if (get_boolean(crl, "pass_debug_sample_count")) {
- b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str());
- pass_add(scene, PASS_SAMPLE_COUNT, "Debug Sample Count");
- }
-
- /* Cycles specific passes. */
- if (get_boolean(crl, "use_pass_volume_direct")) {
- b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str());
- pass_add(scene, PASS_VOLUME_DIRECT, "VolumeDir");
- }
- if (get_boolean(crl, "use_pass_volume_indirect")) {
- b_engine.add_pass("VolumeInd", 3, "RGB", b_view_layer.name().c_str());
- pass_add(scene, PASS_VOLUME_INDIRECT, "VolumeInd");
- }
- if (get_boolean(crl, "use_pass_shadow_catcher")) {
- b_engine.add_pass("Shadow Catcher", 3, "RGB", b_view_layer.name().c_str());
- pass_add(scene, PASS_SHADOW_CATCHER, "Shadow Catcher");
- }
-
/* Cryptomatte stores two ID/weight pairs per RGBA layer.
- * User facing parameter is the number of pairs.
- *
- * NOTE: Name channels lowercase RGBA so that compression rules check in OpenEXR DWA code uses
- * lossless compression. Reportedly this naming is the only one which works good from the
- * interoperability point of view. Using XYZW naming is not portable. */
+ * User facing parameter is the number of pairs. */
int crypto_depth = divide_up(min(16, b_view_layer.pass_cryptomatte_depth()), 2);
scene->film->set_cryptomatte_depth(crypto_depth);
CryptomatteType cryptomatte_passes = CRYPT_NONE;
if (b_view_layer.use_pass_cryptomatte_object()) {
- for (int i = 0; i < crypto_depth; i++) {
- string passname = cryptomatte_prefix + string_printf("Object%02d", i);
- b_engine.add_pass(passname.c_str(), 4, "rgba", b_view_layer.name().c_str());
- pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
- }
cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_OBJECT);
}
if (b_view_layer.use_pass_cryptomatte_material()) {
- for (int i = 0; i < crypto_depth; i++) {
- string passname = cryptomatte_prefix + string_printf("Material%02d", i);
- b_engine.add_pass(passname.c_str(), 4, "rgba", b_view_layer.name().c_str());
- pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
- }
cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_MATERIAL);
}
if (b_view_layer.use_pass_cryptomatte_asset()) {
- for (int i = 0; i < crypto_depth; i++) {
- string passname = cryptomatte_prefix + string_printf("Asset%02d", i);
- b_engine.add_pass(passname.c_str(), 4, "rgba", b_view_layer.name().c_str());
- pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
- }
cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_ASSET);
}
scene->film->set_cryptomatte_passes(cryptomatte_passes);
- /* Denoising passes. */
- const bool use_denoising = get_boolean(cscene, "use_denoising") &&
- get_boolean(crl, "use_denoising");
- const bool store_denoising_passes = get_boolean(crl, "denoising_store_passes");
- if (use_denoising) {
- b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str());
- pass_add(scene, PASS_COMBINED, "Noisy Image", PassMode::NOISY);
- if (get_boolean(crl, "use_pass_shadow_catcher")) {
- b_engine.add_pass("Noisy Shadow Catcher", 3, "RGB", b_view_layer.name().c_str());
- pass_add(scene, PASS_SHADOW_CATCHER, "Noisy Shadow Catcher", PassMode::NOISY);
- }
- }
- if (store_denoising_passes) {
- b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str());
- pass_add(scene, PASS_DENOISING_NORMAL, "Denoising Normal", PassMode::NOISY);
-
- b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str());
- pass_add(scene, PASS_DENOISING_ALBEDO, "Denoising Albedo", PassMode::NOISY);
-
- b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str());
- pass_add(scene, PASS_DENOISING_DEPTH, "Denoising Depth", PassMode::NOISY);
- }
-
+ /* Path guiding debug passes. */
#ifdef WITH_CYCLES_DEBUG
b_engine.add_pass("Guiding Color", 3, "RGB", b_view_layer.name().c_str());
pass_add(scene, PASS_GUIDING_COLOR, "Guiding Color", PassMode::NOISY);
@@ -772,6 +694,8 @@ void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_v
pass_add(scene, PASS_GUIDING_AVG_ROUGHNESS, "Guiding Average Roughness", PassMode::NOISY);
#endif
+ unordered_set<string> expected_passes;
+
/* Custom AOV passes. */
BL::ViewLayer::aovs_iterator b_aov_iter;
for (b_view_layer.aovs.begin(b_aov_iter); b_aov_iter != b_view_layer.aovs.end(); ++b_aov_iter) {
@@ -781,16 +705,10 @@ void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_v
}
string name = b_aov.name();
- bool is_color = b_aov.type() == BL::AOV::type_COLOR;
+ PassType type = (b_aov.type() == BL::AOV::type_COLOR) ? PASS_AOV_COLOR : PASS_AOV_VALUE;
- if (is_color) {
- b_engine.add_pass(name.c_str(), 4, "RGBA", b_view_layer.name().c_str());
- pass_add(scene, PASS_AOV_COLOR, name.c_str());
- }
- else {
- b_engine.add_pass(name.c_str(), 1, "X", b_view_layer.name().c_str());
- pass_add(scene, PASS_AOV_VALUE, name.c_str());
- }
+ pass_add(scene, type, name.c_str());
+ expected_passes.insert(name);
}
/* Light Group passes. */
@@ -802,9 +720,29 @@ void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_v
string name = string_printf("Combined_%s", b_lightgroup.name().c_str());
- b_engine.add_pass(name.c_str(), 3, "RGB", b_view_layer.name().c_str());
Pass *pass = pass_add(scene, PASS_COMBINED, name.c_str(), PassMode::NOISY);
pass->set_lightgroup(ustring(b_lightgroup.name()));
+ expected_passes.insert(name);
+ }
+
+ /* Sync the passes that were defined in engine.py. */
+ for (BL::RenderPass &b_pass : b_rlay.passes) {
+ PassType pass_type = PASS_NONE;
+ PassMode pass_mode = PassMode::DENOISED;
+
+ if (!get_known_pass_type(b_pass, pass_type, pass_mode)) {
+ if (!expected_passes.count(b_pass.name())) {
+ LOG(ERROR) << "Unknown pass " << b_pass.name();
+ }
+ continue;
+ }
+
+ if (pass_type == PASS_MOTION &&
+ (b_view_layer.use_motion_blur() && b_scene.render().use_motion_blur())) {
+ continue;
+ }
+
+ pass_add(scene, pass_type, b_pass.name().c_str(), pass_mode);
}
scene->film->set_pass_alpha_threshold(b_view_layer.pass_alpha_threshold());
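The restructured flow above replaces the PASS_NONE sentinel with an out-parameter lookup plus an expected_passes set: a pass is either a known built-in (mapped to a type and a noisy/denoised mode), an expected engine-defined pass (AOV or light group), or a genuine unknown worth logging. A simplified C++ sketch of that classification, with a reduced pass table standing in for the MAP_PASS expansions:

    #include <cstdio>
    #include <string>
    #include <unordered_set>

    enum PassType { PASS_NONE, PASS_COMBINED, PASS_DEPTH };
    enum class PassMode { NOISY, DENOISED };

    /* Simplified expansion of the MAP_PASS(passname, passtype, noisy) macro:
     * on a match, write both outputs and return true. */
    static bool get_known_pass_type(const std::string &name, PassType &type, PassMode &mode)
    {
      if (name == "Combined")    { type = PASS_COMBINED; mode = PassMode::DENOISED; return true; }
      if (name == "Noisy Image") { type = PASS_COMBINED; mode = PassMode::NOISY;    return true; }
      if (name == "Depth")       { type = PASS_DEPTH;    mode = PassMode::DENOISED; return true; }
      return false;
    }

    int main()
    {
      const std::unordered_set<std::string> expected_passes = {"MyColorAOV"};
      for (const std::string &name : {"Combined", "MyColorAOV", "Bogus"}) {
        PassType type = PASS_NONE;
        PassMode mode = PassMode::DENOISED;
        if (get_known_pass_type(name, type, mode)) {
          std::printf("known pass: %s\n", name.c_str());
        }
        else if (!expected_passes.count(name)) {
          std::printf("unknown pass: %s\n", name.c_str()); /* Would be LOG(ERROR). */
        }
      }
      return 0;
    }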
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index 9524cda54f5..44542a08156 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -289,8 +289,7 @@ if(CYCLES_STANDALONE_REPOSITORY AND WITH_CYCLES_PATH_GUIDING)
endif()
get_target_property(OPENPGL_INCLUDE_DIR openpgl::openpgl INTERFACE_INCLUDE_DIRECTORIES)
else()
- set(WITH_CYCLES_PATH_GUIDING OFF)
- message(STATUS "OpenPGL not found, disabling WITH_CYCLES_PATH_GUIDING")
+ set_and_warn_library_found("OpenPGL" openpgl_FOUND WITH_CYCLES_PATH_GUIDING)
endif()
endif()
@@ -588,16 +587,14 @@ if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
# We can't use the version from the Blender precompiled libraries because
# it does not include the video subsystem.
find_package(SDL2 REQUIRED)
+ set_and_warn_library_found("SDL" SDL2_FOUND WITH_CYCLES_STANDALONE_GUI)
- if(NOT SDL2_FOUND)
- set(WITH_CYCLES_STANDALONE_GUI OFF)
- message(STATUS "SDL not found, disabling Cycles standalone GUI")
+ if(SDL2_FOUND)
+ include_directories(
+ SYSTEM
+ ${SDL2_INCLUDE_DIRS}
+ )
endif()
-
- include_directories(
- SYSTEM
- ${SDL2_INCLUDE_DIRS}
- )
endif()
###########################################################################
@@ -606,11 +603,11 @@ endif()
if(WITH_CYCLES_DEVICE_CUDA AND (WITH_CYCLES_CUDA_BINARIES OR NOT WITH_CUDA_DYNLOAD))
find_package(CUDA) # Try to auto locate CUDA toolkit
+ set_and_warn_library_found("CUDA compiler" CUDA_FOUND WITH_CYCLES_CUDA_BINARIES)
+
if(CUDA_FOUND)
message(STATUS "Found CUDA ${CUDA_NVCC_EXECUTABLE} (${CUDA_VERSION})")
else()
- message(STATUS "CUDA compiler not found, disabling WITH_CYCLES_CUDA_BINARIES")
- set(WITH_CYCLES_CUDA_BINARIES OFF)
if(NOT WITH_CUDA_DYNLOAD)
message(STATUS "Additionally falling back to dynamic CUDA load")
set(WITH_CUDA_DYNLOAD ON)
@@ -624,11 +621,10 @@ endif()
if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
find_package(HIP)
+ set_and_warn_library_found("HIP compiler" HIP_FOUND WITH_CYCLES_HIP_BINARIES)
+
if(HIP_FOUND)
message(STATUS "Found HIP ${HIP_HIPCC_EXECUTABLE} (${HIP_VERSION})")
- else()
- message(STATUS "HIP compiler not found, disabling WITH_CYCLES_HIP_BINARIES")
- set(WITH_CYCLES_HIP_BINARIES OFF)
endif()
endif()
@@ -644,13 +640,17 @@ if(WITH_CYCLES_DEVICE_METAL)
find_library(METAL_LIBRARY Metal)
# This file was added in the 12.0 SDK, use it as a way to detect the version.
- if(METAL_LIBRARY AND NOT EXISTS "${METAL_LIBRARY}/Headers/MTLFunctionStitching.h")
- message(STATUS "Metal version too old, must be SDK 12.0 or newer, disabling WITH_CYCLES_DEVICE_METAL")
- set(WITH_CYCLES_DEVICE_METAL OFF)
- elseif(NOT METAL_LIBRARY)
- message(STATUS "Metal not found, disabling WITH_CYCLES_DEVICE_METAL")
- set(WITH_CYCLES_DEVICE_METAL OFF)
- else()
+ if(METAL_LIBRARY)
+ if(EXISTS "${METAL_LIBRARY}/Headers/MTLFunctionStitching.h")
+ set(METAL_FOUND ON)
+ else()
+ message(STATUS "Metal version too old, must be SDK 12.0 or newer")
+ set(METAL_FOUND OFF)
+ endif()
+ endif()
+
+ set_and_warn_library_found("Metal" METAL_FOUND WITH_CYCLES_DEVICE_METAL)
+ if(METAL_FOUND)
message(STATUS "Found Metal: ${METAL_LIBRARY}")
endif()
endif()
@@ -662,9 +662,10 @@ endif()
if(WITH_CYCLES_DEVICE_ONEAPI)
find_package(SYCL)
find_package(LevelZero)
+ set_and_warn_library_found("oneAPI" SYCL_FOUND WITH_CYCLES_DEVICE_ONEAPI)
+ set_and_warn_library_found("Level Zero" LEVEL_ZERO_FOUND WITH_CYCLES_DEVICE_ONEAPI)
- if(SYCL_FOUND AND LEVEL_ZERO_FOUND)
- message(STATUS "Found oneAPI: ${SYCL_LIBRARY}")
+ if(SYCL_FOUND AND SYCL_VERSION VERSION_GREATER_EQUAL 6.0 AND LEVEL_ZERO_FOUND)
message(STATUS "Found Level Zero: ${LEVEL_ZERO_LIBRARY}")
if(WITH_CYCLES_ONEAPI_BINARIES)
@@ -675,13 +676,14 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
endif()
if(NOT EXISTS ${OCLOC_INSTALL_DIR})
- message(STATUS "oneAPI ocloc not found in ${OCLOC_INSTALL_DIR}, disabling WITH_CYCLES_ONEAPI_BINARIES."
+ set(OCLOC_FOUND OFF)
+ message(STATUS "oneAPI ocloc not found in ${OCLOC_INSTALL_DIR}."
" A different ocloc directory can be set using OCLOC_INSTALL_DIR cmake variable.")
- set(WITH_CYCLES_ONEAPI_BINARIES OFF)
+ set_and_warn_library_found("ocloc" OCLOC_FOUND WITH_CYCLES_ONEAPI_BINARIES)
endif()
endif()
else()
- message(STATUS "oneAPI or Level Zero not found, disabling WITH_CYCLES_DEVICE_ONEAPI")
+ message(STATUS "SYCL 6.0+ or Level Zero not found, disabling WITH_CYCLES_DEVICE_ONEAPI")
set(WITH_CYCLES_DEVICE_ONEAPI OFF)
endif()
endif()
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 5516e97f34f..5296d819e42 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -187,18 +187,22 @@ if(WITH_CYCLES_DEVICE_METAL)
)
endif()
if (WITH_CYCLES_DEVICE_ONEAPI)
+ if(WITH_CYCLES_ONEAPI_BINARIES)
+ set(cycles_kernel_oneapi_lib_suffix "_aot")
+ else()
+ set(cycles_kernel_oneapi_lib_suffix "_jit")
+ endif()
if(WIN32)
- set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi.lib)
+ set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.lib)
else()
- set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi.so)
+ set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.so)
+ endif()
+ list(APPEND LIB ${cycles_kernel_oneapi_lib})
+ if(WIN32)
+ list(APPEND LIB debug ${SYCL_LIBRARY_DEBUG} optimized ${SYCL_LIBRARY})
+ else()
+ list(APPEND LIB ${SYCL_LIBRARY})
endif()
- list(APPEND LIB
- ${cycles_kernel_oneapi_lib}
- "$<$<CONFIG:Debug>:${SYCL_LIBRARY_DEBUG}>"
- "$<$<CONFIG:Release>:${SYCL_LIBRARY}>"
- "$<$<CONFIG:RelWithDebInfo>:${SYCL_LIBRARY}>"
- "$<$<CONFIG:MinSizeRel>:${SYCL_LIBRARY}>"
- )
add_definitions(-DWITH_ONEAPI)
list(APPEND SRC
${SRC_ONEAPI}
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 84b0a1e0dd6..69fae03e32c 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -49,7 +49,7 @@ int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
return num_states;
}
-int CUDADeviceQueue::num_concurrent_busy_states() const
+int CUDADeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
{
const int max_num_threads = cuda_device_->get_num_multiprocessors() *
cuda_device_->get_max_num_threads_per_multiprocessor();
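For context on the new parameter: num_concurrent_busy_states() is the occupancy threshold below which the renderer schedules more work. CUDA and HIP derive it from hardware thread counts and ignore the state size (hence the unused argument), while Metal below now derives it from the state count, which does depend on the state size. A rough C++ sketch of how such a threshold gets used; the x4 multiplier is an assumption for illustration:

    #include <cstddef>
    #include <cstdio>

    struct QueueModel {
      int num_multiprocessors;
      int max_threads_per_multiprocessor;

      /* Keep several path states in flight per hardware thread. */
      int num_concurrent_busy_states(size_t /*state_size*/) const
      {
        return num_multiprocessors * max_threads_per_multiprocessor * 4;
      }
    };

    int main()
    {
      const QueueModel queue = {64, 1024};
      const int num_active_paths = 100000;
      if (num_active_paths < queue.num_concurrent_busy_states(0)) {
        std::printf("device under-occupied: schedule more work tiles\n");
      }
      return 0;
    }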
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index b450f5b3592..7107afe70c9 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -23,7 +23,7 @@ class CUDADeviceQueue : public DeviceQueue {
~CUDADeviceQueue();
virtual int num_concurrent_states(const size_t state_size) const override;
- virtual int num_concurrent_busy_states() const override;
+ virtual int num_concurrent_busy_states(const size_t state_size) const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index 3f8b6267100..e93a9b4df3a 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -49,7 +49,7 @@ int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
return num_states;
}
-int HIPDeviceQueue::num_concurrent_busy_states() const
+int HIPDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
{
const int max_num_threads = hip_device_->get_num_multiprocessors() *
hip_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h
index 729d8a19acb..df0678108af 100644
--- a/intern/cycles/device/hip/queue.h
+++ b/intern/cycles/device/hip/queue.h
@@ -23,7 +23,7 @@ class HIPDeviceQueue : public DeviceQueue {
~HIPDeviceQueue();
virtual int num_concurrent_states(const size_t state_size) const override;
- virtual int num_concurrent_busy_states() const override;
+ virtual int num_concurrent_busy_states(const size_t state_size) const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index d1250b83d22..6f1042b1e55 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -254,6 +254,10 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
break;
}
+ NSProcessInfo *processInfo = [NSProcessInfo processInfo];
+ NSOperatingSystemVersion macos_ver = [processInfo operatingSystemVersion];
+ global_defines += "#define __KERNEL_METAL_MACOS__ " + to_string(macos_ver.majorVersion) + "\n";
+
string &source = this->source[pso_type];
source = "\n#include \"kernel/device/metal/kernel.metal\"\n";
source = path_source_replace_includes(source, path_get("source"));
@@ -292,9 +296,11 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
}
source = global_defines + source;
+# if 0
metal_printf("================\n%s================\n\%s================\n",
global_defines.c_str(),
baked_constants.c_str());
+# endif
/* Generate an MD5 from the source and include any baked constants. This is used when caching
* PSOs. */
@@ -335,6 +341,14 @@ bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
+# if defined(MAC_OS_VERSION_13_0)
+ if (@available(macos 13.0, *)) {
+ if (device_vendor == METAL_GPU_INTEL) {
+ [options setOptimizationLevel:MTLLibraryOptimizationLevelSize];
+ }
+ }
+# endif
+
options.fastMathEnabled = YES;
if (@available(macOS 12.0, *)) {
options.languageVersion = MTLLanguageVersion2_4;
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 5e0cb6d18f4..55938d1a03a 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -162,6 +162,13 @@ bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
}
}
+ if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
+ if ((device->kernel_features & KERNEL_FEATURE_MNEE) == 0) {
+ /* Skip shade_surface_mnee kernel if the scene doesn't require it. */
+ return false;
+ }
+ }
+
if (pso_type != PSO_GENERIC) {
/* Only specialize kernels where it can make an impact. */
if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
@@ -317,6 +324,12 @@ bool MetalKernelPipeline::should_use_binary_archive() const
}
}
+ /* Workaround for Intel GPUs having issues with Binary Archives. */
+ MetalGPUVendor gpu_vendor = MetalInfo::get_device_vendor(mtlDevice);
+ if (gpu_vendor == METAL_GPU_INTEL) {
+ return false;
+ }
+
if (pso_type == PSO_GENERIC) {
/* Archive the generic kernels. */
return true;
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index fc32740f3e1..2a6c12e2a60 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -23,7 +23,7 @@ class MetalDeviceQueue : public DeviceQueue {
~MetalDeviceQueue();
virtual int num_concurrent_states(const size_t) const override;
- virtual int num_concurrent_busy_states() const override;
+ virtual int num_concurrent_busy_states(const size_t) const override;
virtual int num_sort_partition_elements() const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 5ac63a16c61..c0df2c8553f 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -264,33 +264,46 @@ MetalDeviceQueue::~MetalDeviceQueue()
}
}
-int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const
{
- /* METAL_WIP */
- /* TODO: compute automatically. */
- /* TODO: must have at least num_threads_per_block. */
- int result = 1048576;
- if (metal_device_->device_vendor == METAL_GPU_AMD) {
- result *= 2;
+ static int result = 0;
+ if (result) {
+ return result;
}
- else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
+
+ result = 1048576;
+ if (metal_device_->device_vendor == METAL_GPU_APPLE) {
result *= 4;
+
+ if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) == APPLE_M2) {
+ size_t system_ram = system_physical_ram();
+ size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
+ size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];
+
+ /* Determine whether we can double the state count while leaving enough GPU-available memory
+ * (1/8 of the system RAM or 1GB - whichever is larger). Enlarging the state count allows us to
+ * keep dispatch sizes high and minimize work submission overheads. */
+ size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024));
+ size_t total_state_size = result * state_size;
+ if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) {
+ result *= 2;
+ metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result);
+ }
+ }
+ }
+ else if (metal_device_->device_vendor == METAL_GPU_AMD) {
+ /* METAL_WIP */
+ /* TODO: compute automatically. */
+ /* TODO: must have at least num_threads_per_block. */
+ result *= 2;
}
return result;
}
-int MetalDeviceQueue::num_concurrent_busy_states() const
+int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
{
- /* METAL_WIP */
- /* TODO: compute automatically. */
- int result = 65536;
- if (metal_device_->device_vendor == METAL_GPU_AMD) {
- result *= 2;
- }
- else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
- result *= 4;
- }
- return result;
+ /* A 1:4 busy:total ratio gives best rendering performance, independent of total state count. */
+ return num_concurrent_states(state_size) / 4;
}
int MetalDeviceQueue::num_sort_partition_elements() const
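The Apple Silicon branch above sizes the state pool against the limits the MTLDevice reports (currentAllocatedSize, recommendedMaxWorkingSetSize). A self-contained sketch of the same headroom arithmetic; every size here is a made-up assumption, not queried from a device:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
      const size_t GiB = size_t(1024) * 1024 * 1024;
      const size_t system_ram = 16 * GiB;                  /* Assumed. */
      const size_t allocated_so_far = 2 * GiB;             /* Assumed. */
      const size_t max_recommended_working_set = 12 * GiB; /* Assumed. */
      const size_t state_size = 1024;                      /* Bytes per path state, assumed. */

      size_t num_states = size_t(1048576) * 4; /* Apple GPU baseline from the patch. */

      /* Double only if at least max(ram / 8, 1 GiB) of the working set stays free. */
      const size_t min_headroom = std::max(system_ram / 8, GiB);
      const size_t total_state_size = num_states * state_size;
      if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) {
        num_states *= 2;
      }
      std::printf("state count: %zu\n", num_states);
      return 0;
    }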
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
index 65c67c400fe..f47638fac15 100644
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -110,6 +110,12 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
usable |= (vendor == METAL_GPU_AMD);
}
+# if defined(MAC_OS_VERSION_13_0)
+ if (@available(macos 13.0, *)) {
+ usable |= (vendor == METAL_GPU_INTEL);
+ }
+# endif
+
if (usable) {
metal_printf("- %s\n", device_name.c_str());
[device retain];
diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp
index f303ab41627..66d6f749e30 100644
--- a/intern/cycles/device/oneapi/device.cpp
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -39,7 +39,7 @@ bool device_oneapi_init()
_putenv_s("SYCL_CACHE_THRESHOLD", "0");
}
if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
- _putenv_s("SYCL_DEVICE_FILTER", "host,level_zero");
+ _putenv_s("SYCL_DEVICE_FILTER", "level_zero");
}
if (getenv("SYCL_ENABLE_PCI") == nullptr) {
_putenv_s("SYCL_ENABLE_PCI", "1");
@@ -50,7 +50,7 @@ bool device_oneapi_init()
# elif __linux__
setenv("SYCL_CACHE_PERSISTENT", "1", false);
setenv("SYCL_CACHE_THRESHOLD", "0", false);
- setenv("SYCL_DEVICE_FILTER", "host,level_zero", false);
+ setenv("SYCL_DEVICE_FILTER", "level_zero", false);
setenv("SYCL_ENABLE_PCI", "1", false);
setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
# endif
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index 2df605fa047..3588b75713b 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -43,7 +43,7 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi
}
size_t globals_segment_size;
- is_finished_ok = kernel_globals_size(device_queue_, globals_segment_size);
+ is_finished_ok = kernel_globals_size(globals_segment_size);
if (is_finished_ok == false) {
set_error("oneAPI constant memory initialization got runtime exception \"" +
oneapi_error_string_ + "\"");
@@ -88,18 +88,26 @@ BVHLayoutMask OneapiDevice::get_bvh_layout_mask() const
bool OneapiDevice::load_kernels(const uint requested_features)
{
assert(device_queue_);
- /* NOTE(@nsirgien): oneAPI can support compilation of kernel code with certain feature set
- * with specialization constants, but it hasn't been implemented yet. */
- (void)requested_features;
bool is_finished_ok = oneapi_run_test_kernel(device_queue_);
if (is_finished_ok == false) {
- set_error("oneAPI kernel load: got runtime exception \"" + oneapi_error_string_ + "\"");
+ set_error("oneAPI test kernel execution: got a runtime exception \"" + oneapi_error_string_ +
+ "\"");
+ return false;
}
else {
- VLOG_INFO << "Runtime compilation done for \"" << info.description << "\"";
+ VLOG_INFO << "Test kernel has been executed successfully for \"" << info.description << "\"";
assert(device_queue_);
}
+
+ is_finished_ok = oneapi_load_kernels(device_queue_, (const unsigned int)requested_features);
+ if (is_finished_ok == false) {
+ set_error("oneAPI kernels loading: got a runtime exception \"" + oneapi_error_string_ + "\"");
+ }
+ else {
+ VLOG_INFO << "Kernels loading (compilation) has been done for \"" << info.description << "\"";
+ }
+
return is_finished_ok;
}
@@ -422,9 +430,14 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
(void)usm_type;
assert(usm_type == sycl::usm::alloc::device ||
- ((device_type == sycl::info::device_type::host ||
- device_type == sycl::info::device_type::cpu || allow_host) &&
- usm_type == sycl::usm::alloc::host));
+ ((device_type == sycl::info::device_type::cpu || allow_host) &&
+ usm_type == sycl::usm::alloc::host ||
+ usm_type == sycl::usm::alloc::unknown));
+# else
+ /* Silence warning about unused arguments. */
+ (void)queue_;
+ (void)usm_ptr;
+ (void)allow_host;
# endif
}
@@ -552,7 +565,7 @@ bool OneapiDevice::queue_synchronize(SyclQueue *queue_)
}
}
-bool OneapiDevice::kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
+bool OneapiDevice::kernel_globals_size(size_t &kernel_global_size)
{
kernel_global_size = sizeof(KernelGlobalsGPU);
@@ -658,14 +671,6 @@ std::vector<sycl::device> OneapiDevice::available_devices()
if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
allow_all_devices = true;
- /* Host device is useful only for debugging at the moment
- * so we hide this device with default build settings. */
-# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
- bool allow_host = true;
-# else
- bool allow_host = false;
-# endif
-
const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
std::vector<sycl::device> available_devices;
@@ -677,17 +682,11 @@ std::vector<sycl::device> OneapiDevice::available_devices()
}
const std::vector<sycl::device> &oneapi_devices =
- (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
- platform.get_devices(sycl::info::device_type::gpu);
+ (allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) :
+ platform.get_devices(sycl::info::device_type::gpu);
for (const sycl::device &device : oneapi_devices) {
- if (allow_all_devices) {
- /* still filter out host device if build doesn't support it. */
- if (allow_host || !device.is_host()) {
- available_devices.push_back(device);
- }
- }
- else {
+ if (!allow_all_devices) {
bool filter_out = false;
/* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
@@ -699,11 +698,11 @@ std::vector<sycl::device> OneapiDevice::available_devices()
int number_of_eus = 96;
int threads_per_eu = 7;
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
- number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+ number_of_eus = device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
}
if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
threads_per_eu =
- device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+ device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
}
/* This filters out all Level-Zero supported GPUs from older generation than Arc. */
if (number_of_eus <= 96 && threads_per_eu == 7) {
@@ -719,9 +718,6 @@ std::vector<sycl::device> OneapiDevice::available_devices()
}
}
}
- else if (!allow_host && device.is_host()) {
- filter_out = true;
- }
else if (!allow_all_devices) {
filter_out = true;
}
@@ -784,9 +780,7 @@ char *OneapiDevice::device_capabilities()
GET_NUM_ATTR(native_vector_width_double)
GET_NUM_ATTR(native_vector_width_half)
- size_t max_clock_frequency =
- (size_t)(device.is_host() ? (size_t)0 :
- device.get_info<sycl::info::device::max_clock_frequency>());
+ size_t max_clock_frequency = device.get_info<sycl::info::device::max_clock_frequency>();
WRITE_ATTR("max_clock_frequency", max_clock_frequency)
GET_NUM_ATTR(address_bits)
@@ -824,7 +818,7 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p
std::string name = device.get_info<sycl::info::device::name>();
std::string id = "ONEAPI_" + platform_name + "_" + name;
if (device.has(sycl::aspect::ext_intel_pci_address)) {
- id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
+ id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
}
(cb)(id.c_str(), name.c_str(), num, user_ptr);
num++;
@@ -842,7 +836,7 @@ int OneapiDevice::get_num_multiprocessors()
{
const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
- return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+ return device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
}
else
return 0;
@@ -853,8 +847,8 @@ int OneapiDevice::get_max_num_threads_per_multiprocessor()
const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
- return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
- device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+ return device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() *
+ device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
}
else
return 0;
diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h
index 3589e881a6e..197cf03d60d 100644
--- a/intern/cycles/device/oneapi/device_impl.h
+++ b/intern/cycles/device/oneapi/device_impl.h
@@ -3,7 +3,7 @@
#ifdef WITH_ONEAPI
-# include <CL/sycl.hpp>
+# include <sycl/sycl.hpp>
# include "device/device.h"
# include "device/oneapi/device.h"
@@ -104,7 +104,7 @@ class OneapiDevice : public Device {
int get_num_multiprocessors();
int get_max_num_threads_per_multiprocessor();
bool queue_synchronize(SyclQueue *queue);
- bool kernel_globals_size(SyclQueue *queue, size_t &kernel_global_size);
+ bool kernel_globals_size(size_t &kernel_global_size);
void set_global_memory(SyclQueue *queue,
void *kernel_globals,
const char *memory_name,
diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp
index 9632b14d485..3d019661aa8 100644
--- a/intern/cycles/device/oneapi/queue.cpp
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -43,7 +43,7 @@ int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
return num_states;
}
-int OneapiDeviceQueue::num_concurrent_busy_states() const
+int OneapiDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
{
const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
oneapi_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h
index 32363bf2a6e..bbd947b49cb 100644
--- a/intern/cycles/device/oneapi/queue.h
+++ b/intern/cycles/device/oneapi/queue.h
@@ -25,7 +25,7 @@ class OneapiDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t state_size) const override;
- virtual int num_concurrent_busy_states() const override;
+ virtual int num_concurrent_busy_states(const size_t state_size) const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 1d6a8d736b7..e27e081a407 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -103,7 +103,7 @@ class DeviceQueue {
/* Number of states which keeps the device occupied with work without losing performance.
* The renderer will add more work (when available) when number of active paths falls below this
* value. */
- virtual int num_concurrent_busy_states() const = 0;
+ virtual int num_concurrent_busy_states(const size_t state_size) const = 0;
/* Number of elements in a partition of sorted shaders, that improves memory locality of
* integrator state fetch at the cost of decreased coherence for shader kernel execution. */
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 6b033cfd051..8e8fbd86be0 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -43,8 +43,11 @@ PathTrace::PathTrace(Device *device,
/* Create path tracing work in advance, so that it can be reused by incremental sampling as much
* as possible. */
device_->foreach_device([&](Device *path_trace_device) {
- path_trace_works_.emplace_back(PathTraceWork::create(
- path_trace_device, film, device_scene, &render_cancel_.is_requested));
+ unique_ptr<PathTraceWork> work = PathTraceWork::create(
+ path_trace_device, film, device_scene, &render_cancel_.is_requested);
+ if (work) {
+ path_trace_works_.emplace_back(std::move(work));
+ }
});
work_balance_infos_.resize(path_trace_works_.size());
@@ -1293,6 +1296,7 @@ void PathTrace::set_guiding_params(const GuidingParams &guiding_params, const bo
# if OPENPGL_VERSION_MINOR >= 4
field_args.deterministic = guiding_params.deterministic;
# endif
+ reinterpret_cast<PGLKDTreeArguments *>(field_args.spatialSturctureArguments)->maxDepth = 16;
openpgl::cpp::Device *guiding_device = static_cast<openpgl::cpp::Device *>(
device_->get_guiding_device());
if (guiding_device) {
diff --git a/intern/cycles/integrator/path_trace_work.cpp b/intern/cycles/integrator/path_trace_work.cpp
index bb5c6e1a61a..a5f98b5475a 100644
--- a/intern/cycles/integrator/path_trace_work.cpp
+++ b/intern/cycles/integrator/path_trace_work.cpp
@@ -23,6 +23,10 @@ unique_ptr<PathTraceWork> PathTraceWork::create(Device *device,
if (device->info.type == DEVICE_CPU) {
return make_unique<PathTraceWorkCPU>(device, film, device_scene, cancel_requested_flag);
}
+ if (device->info.type == DEVICE_DUMMY) {
+ /* Dummy devices can't perform any work. */
+ return nullptr;
+ }
return make_unique<PathTraceWorkGPU>(device, film, device_scene, cancel_requested_flag);
}
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index d5ac830db58..188ec28cf65 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -285,7 +285,7 @@ void PathTraceWorkCPU::cryptomatte_postproces()
}
#ifdef WITH_PATH_GUIDING
-/* Note: It seems that this is called before every rendering iteration/progression and not once per
+/* NOTE: It seems that this is called before every rendering iteration/progression and not once per
 * rendering. Maybe we can find a way to call it only once per rendering. */
void PathTraceWorkCPU::guiding_init_kernel_globals(void *guiding_field,
void *sample_data_storage,
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index ee250a6916b..48f6cf3c903 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -18,13 +18,15 @@
CCL_NAMESPACE_BEGIN
-static size_t estimate_single_state_size()
+static size_t estimate_single_state_size(const uint kernel_features)
{
size_t state_size = 0;
#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
-#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
-#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+ state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+ state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
#define KERNEL_STRUCT_END(name) \
break; \
}
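With the feature mask threaded through, struct members whose feature flag the scene does not request no longer inflate the per-path state estimate. A reduced C++ sketch of the gating; the flag names and member sizes are hypothetical:

    #include <cstddef>
    #include <cstdio>

    /* Hypothetical flags standing in for Cycles' KERNEL_FEATURE_* bits. */
    enum : unsigned { FEATURE_BASE = 1u << 0, FEATURE_VOLUME = 1u << 1 };

    static size_t estimate_single_state_size(unsigned kernel_features)
    {
      size_t state_size = 0;
      /* Each member only counts when its feature is requested. */
      state_size += (kernel_features & FEATURE_BASE) ? sizeof(float[4]) : 0;
      state_size += (kernel_features & FEATURE_VOLUME) ? sizeof(float[8]) : 0;
      return state_size;
    }

    int main()
    {
      std::printf("base: %zu bytes, with volumes: %zu bytes\n",
                  estimate_single_state_size(FEATURE_BASE),
                  estimate_single_state_size(FEATURE_BASE | FEATURE_VOLUME));
      return 0;
    }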
@@ -76,16 +78,11 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
work_tiles_(device, "work_tiles", MEM_READ_WRITE),
display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
- max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size())),
- min_num_active_main_paths_(queue_->num_concurrent_busy_states()),
+ max_num_paths_(0),
+ min_num_active_main_paths_(0),
max_active_main_path_index_(0)
{
memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
-
- /* Limit number of active paths to the half of the overall state. This is due to the logic in the
- * path compaction which relies on the fact that regeneration does not happen sooner than half of
- * the states are available again. */
- min_num_active_main_paths_ = min(min_num_active_main_paths_, max_num_paths_ / 2);
}
void PathTraceWorkGPU::alloc_integrator_soa()
@@ -103,6 +100,20 @@ void PathTraceWorkGPU::alloc_integrator_soa()
integrator_state_soa_volume_stack_size_ = max(integrator_state_soa_volume_stack_size_,
requested_volume_stack_size);
+ /* Determine the number of path states. Deferring this for as long as possible allows the backend
+ * to make better decisions about memory availability. */
+ if (max_num_paths_ == 0) {
+ size_t single_state_size = estimate_single_state_size(kernel_features);
+
+ max_num_paths_ = queue_->num_concurrent_states(single_state_size);
+ min_num_active_main_paths_ = queue_->num_concurrent_busy_states(single_state_size);
+
+ /* Limit number of active paths to the half of the overall state. This is due to the logic in
+ * the path compaction which relies on the fact that regeneration does not happen sooner than
+ * half of the states are available again. */
+ min_num_active_main_paths_ = min(min_num_active_main_paths_, max_num_paths_ / 2);
+ }
+
/* Allocate a device only memory buffer before for each struct member, and then
* write the pointers into a struct that resides in constant memory.
*
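Deferring the sizing into alloc_integrator_soa() means the queue backend sees the real per-state size before committing to a path count. The half clamp then preserves the compaction invariant: regeneration must not happen before half of the states are free. A minimal sketch of the two limits, assuming the queue methods shown earlier in this diff:

    #include <algorithm>
    #include <cstdio>

    static void compute_path_limits(int num_concurrent_states,
                                    int num_concurrent_busy_states,
                                    int &max_num_paths,
                                    int &min_num_active_main_paths)
    {
      max_num_paths = num_concurrent_states;
      /* Path compaction relies on regeneration happening only once half of
       * the states are available again, so clamp the busy threshold. */
      min_num_active_main_paths = std::min(num_concurrent_busy_states, max_num_paths / 2);
    }

    int main()
    {
      int max_num_paths = 0, min_active = 0;
      compute_path_limits(/*states*/ 1048576, /*busy*/ 600000, max_num_paths, min_active);
      std::printf("max=%d min_active=%d\n", max_num_paths, min_active); /* Clamped to 524288. */
      return 0;
    }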
diff --git a/intern/cycles/integrator/work_balancer.cpp b/intern/cycles/integrator/work_balancer.cpp
index 5f1c6c92b9d..0fe170b2791 100644
--- a/intern/cycles/integrator/work_balancer.cpp
+++ b/intern/cycles/integrator/work_balancer.cpp
@@ -17,6 +17,9 @@ void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos)
work_balance_infos[0].weight = 1.0;
return;
}
+ else if (num_infos == 0) {
+ return;
+ }
/* There are no statistics available, so start with an equal distribution. */
const double weight = 1.0 / num_infos;
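The new guard matters because, now that dummy devices yield no PathTraceWork, the list of balance infos can be empty, and the equal-weight division below would divide by zero. A sketch of the guarded flow:

    #include <vector>

    struct WorkBalanceInfo { double weight = 0.0; };

    static void work_balance_do_initial(std::vector<WorkBalanceInfo> &infos)
    {
      if (infos.size() == 1) {
        infos[0].weight = 1.0;
        return;
      }
      if (infos.empty()) {
        return; /* No devices produced work; nothing to balance. */
      }
      const double weight = 1.0 / double(infos.size());
      for (WorkBalanceInfo &info : infos) {
        info.weight = weight;
      }
    }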
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 8f50c7586b8..81c5f593974 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -713,10 +713,17 @@ endif()
# oneAPI module
if(WITH_CYCLES_DEVICE_ONEAPI)
+ if(WITH_CYCLES_ONEAPI_BINARIES)
+ set(cycles_kernel_oneapi_lib_suffix "_aot")
+ else()
+ set(cycles_kernel_oneapi_lib_suffix "_jit")
+ endif()
+
if(WIN32)
- set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.dll)
+ set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.dll)
+ set(cycles_kernel_oneapi_linker_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.lib)
else()
- set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/libcycles_kernel_oneapi.so)
+ set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/libcycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.so)
endif()
set(cycles_oneapi_kernel_sources
@@ -727,16 +734,9 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
${SRC_UTIL_HEADERS}
)
- set (ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS 1)
+ set (SYCL_OFFLINE_COMPILER_PARALLEL_JOBS 1 CACHE STRING "Number of parallel compiler instances to use for device binaries compilation (expect ~8GB peak memory usage per instance).")
if (WITH_CYCLES_ONEAPI_BINARIES)
- cmake_host_system_information(RESULT AVAILABLE_MEMORY_AMOUNT QUERY AVAILABLE_PHYSICAL_MEMORY)
- # Conservative value of peak consumption here, just to be fully sure that other backend compilers will have enough memory as well
- set(ONEAPI_GPU_COMPILER_MEMORY_AT_PEAK_MB 8150)
- math(EXPR ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS "${AVAILABLE_MEMORY_AMOUNT} / ${ONEAPI_GPU_COMPILER_MEMORY_AT_PEAK_MB}")
- if (ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS LESS 1)
- set(ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS 1)
- endif()
- message(STATUS "${ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS} instance(s) of oneAPI offline compiler will be used.")
+ message(STATUS "${SYCL_OFFLINE_COMPILER_PARALLEL_JOBS} instance(s) of oneAPI offline compiler will be used.")
endif()
# SYCL_CPP_FLAGS is a variable that the user can set to pass extra compiler options
set(sycl_compiler_flags
@@ -747,7 +747,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
-mllvm -inlinedefault-threshold=250
-mllvm -inlinehint-threshold=350
-fsycl-device-code-split=per_kernel
- -fsycl-max-parallel-link-jobs=${ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS}
+ -fsycl-max-parallel-link-jobs=${SYCL_OFFLINE_COMPILER_PARALLEL_JOBS}
-shared
-DWITH_ONEAPI
-ffast-math
@@ -758,10 +758,6 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
${SYCL_CPP_FLAGS}
)
- if (WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED)
- list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_ENABLED)
- endif()
-
# Set defaults for spir64 and spir64_gen options
if (NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64)
set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'")
@@ -774,6 +770,8 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "-device ${CYCLES_ONEAPI_SPIR64_GEN_DEVICES} ")
if (WITH_CYCLES_ONEAPI_BINARIES)
+ # AoT binaries aren't currently reused when calling sycl::build.
+ list (APPEND sycl_compiler_flags -DSYCL_SKIP_KERNELS_PRELOAD)
# Iterate over all targets and their options
list (JOIN CYCLES_ONEAPI_SYCL_TARGETS "," targets_string)
list (APPEND sycl_compiler_flags -fsycl-targets=${targets_string})
@@ -826,12 +824,17 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
-DONEAPI_EXPORT)
string(REPLACE /Redist/ /Tools/ MSVC_TOOLS_DIR ${MSVC_REDIST_DIR})
- if(NOT CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION) # case for Ninja on Windows
+ # The version folder between Redist and Tools can sometimes mismatch
+ if(NOT EXISTS ${MSVC_TOOLS_DIR})
+ get_filename_component(cmake_ar_dir ${CMAKE_AR} DIRECTORY)
+ get_filename_component(MSVC_TOOLS_DIR "${cmake_ar_dir}/../../../" ABSOLUTE)
+ endif()
+ if(CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION)
+ set(WINDOWS_KIT_DIR ${WINDOWS_KITS_DIR}/Lib/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION})
+ else() # case for Ninja on Windows
get_filename_component(cmake_mt_dir ${CMAKE_MT} DIRECTORY)
string(REPLACE /bin/ /Lib/ WINDOWS_KIT_DIR ${cmake_mt_dir})
get_filename_component(WINDOWS_KIT_DIR "${WINDOWS_KIT_DIR}/../" ABSOLUTE)
- else()
- set(WINDOWS_KIT_DIR ${WINDOWS_KITS_DIR}/Lib/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION})
endif()
list(APPEND sycl_compiler_flags
-L "${MSVC_TOOLS_DIR}/lib/x64"
@@ -843,15 +846,13 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
set(sycl_compiler_flags_RelWithDebInfo ${sycl_compiler_flags})
set(sycl_compiler_flags_MinSizeRel ${sycl_compiler_flags})
list(APPEND sycl_compiler_flags_RelWithDebInfo -g)
- get_filename_component(sycl_library_debug_name ${SYCL_LIBRARY_DEBUG} NAME_WE)
list(APPEND sycl_compiler_flags_Debug
-g
-D_DEBUG
- -nostdlib -Xclang --dependent-lib=msvcrtd
- -Xclang --dependent-lib=${sycl_library_debug_name})
+ -nostdlib -Xclang --dependent-lib=msvcrtd)
add_custom_command(
- OUTPUT ${cycles_kernel_oneapi_lib}
+ OUTPUT ${cycles_kernel_oneapi_lib} ${cycles_kernel_oneapi_linker_lib}
COMMAND ${CMAKE_COMMAND} -E env
"LIB=${sycl_compiler_root}/../lib" # for compiler to find sycl.lib
"PATH=${OCLOC_INSTALL_DIR}\;${sycl_compiler_root}"
diff --git a/intern/cycles/kernel/bvh/shadow_all.h b/intern/cycles/kernel/bvh/shadow_all.h
index 2ffe1496c72..b31ba479e4f 100644
--- a/intern/cycles/kernel/bvh/shadow_all.h
+++ b/intern/cycles/kernel/bvh/shadow_all.h
@@ -229,7 +229,7 @@ ccl_device_inline
/* Always use baked shadow transparency for curves. */
if (isect.type & PRIMITIVE_CURVE) {
*r_throughput *= intersection_curve_shadow_transparency(
- kg, isect.object, isect.prim, isect.u);
+ kg, isect.object, isect.prim, isect.type, isect.u);
if (*r_throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
return true;
diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h
index a57703a8b8c..9ba787550c5 100644
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -190,10 +190,8 @@ ccl_device_inline int intersection_find_attribute(KernelGlobals kg,
/* Cut-off value to stop transparent shadow tracing when practically opaque. */
#define CURVE_SHADOW_TRANSPARENCY_CUTOFF 0.001f
-ccl_device_inline float intersection_curve_shadow_transparency(KernelGlobals kg,
- const int object,
- const int prim,
- const float u)
+ccl_device_inline float intersection_curve_shadow_transparency(
+ KernelGlobals kg, const int object, const int prim, const int type, const float u)
{
/* Find attribute. */
const int offset = intersection_find_attribute(kg, object, ATTR_STD_SHADOW_TRANSPARENCY);
@@ -204,7 +202,7 @@ ccl_device_inline float intersection_curve_shadow_transparency(KernelGlobals kg,
/* Interpolate transparency between curve keys. */
const KernelCurve kcurve = kernel_data_fetch(curves, prim);
- const int k0 = kcurve.first_key + PRIMITIVE_UNPACK_SEGMENT(kcurve.type);
+ const int k0 = kcurve.first_key + PRIMITIVE_UNPACK_SEGMENT(type);
const int k1 = k0 + 1;
const float f0 = kernel_data_fetch(attributes_float, offset + k0);
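[Editor note] The fix above threads the intersection's own packed type down to the transparency lookup: the per-hit segment appears to be packed into the intersection type, not into the curve record, so unpacking from kcurve.type always selected the first segment. A standalone sketch of the idea (the bit layout and PRIMITIVE_NUM value are assumptions for illustration; Cycles defines the real layout via PRIMITIVE_UNPACK_SEGMENT):

    #include <cstdio>

    enum { PRIMITIVE_NUM = 16 };  /* assumed number of primitive-type bits */
    static int primitive_unpack_segment(int type) { return type >> PRIMITIVE_NUM; }

    int main() {
      const int curve_type = 0x2;                              /* stand-in for kcurve.type */
      const int hit_type = (3 << PRIMITIVE_NUM) | curve_type;  /* hit on segment 3 */
      const int first_key = 10;
      std::printf("old k0 = %d, new k0 = %d\n",
                  first_key + primitive_unpack_segment(curve_type), /* 10: wrong segment */
                  first_key + primitive_unpack_segment(hit_type));  /* 13: actual hit */
    }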
diff --git a/intern/cycles/kernel/device/cpu/bvh.h b/intern/cycles/kernel/device/cpu/bvh.h
index d9267e1cd6d..2d7d8c2d704 100644
--- a/intern/cycles/kernel/device/cpu/bvh.h
+++ b/intern/cycles/kernel/device/cpu/bvh.h
@@ -252,7 +252,7 @@ ccl_device void kernel_embree_filter_occluded_func(const RTCFilterFunctionNArgum
/* Always use baked shadow transparency for curves. */
if (current_isect.type & PRIMITIVE_CURVE) {
ctx->throughput *= intersection_curve_shadow_transparency(
- kg, current_isect.object, current_isect.prim, current_isect.u);
+ kg, current_isect.object, current_isect.prim, current_isect.type, current_isect.u);
if (ctx->throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
ctx->opaque_hit = true;
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index c1df49c4f49..38cdcb572eb 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -23,22 +23,6 @@ CCL_NAMESPACE_BEGIN
* and keep device specific code in compat.h */
#ifdef __KERNEL_ONEAPI__
-# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-template<typename IsActiveOp>
-void cpu_serial_active_index_array_impl(const uint num_states,
- ccl_global int *ccl_restrict indices,
- ccl_global int *ccl_restrict num_indices,
- IsActiveOp is_active_op)
-{
- int write_index = 0;
- for (int state_index = 0; state_index < num_states; state_index++) {
- if (is_active_op(state_index))
- indices[write_index++] = state_index;
- }
- *num_indices = write_index;
- return;
-}
-# endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */
template<typename IsActiveOp>
void gpu_parallel_active_index_array_impl(const uint num_states,
@@ -182,18 +166,11 @@ __device__
num_simd_groups, \
simdgroup_offset)
#elif defined(__KERNEL_ONEAPI__)
-# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-# define gpu_parallel_active_index_array( \
- blocksize, num_states, indices, num_indices, is_active_op) \
- if (ccl_gpu_global_size_x() == 1) \
- cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
- else \
- gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
-# else
-# define gpu_parallel_active_index_array( \
- blocksize, num_states, indices, num_indices, is_active_op) \
- gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
-# endif
+
+# define gpu_parallel_active_index_array( \
+ blocksize, num_states, indices, num_indices, is_active_op) \
+ gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
+
#else
# define gpu_parallel_active_index_array( \
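[Editor note] For reference, the host fallback removed above amounts to a plain serial stream compaction; an equivalent standalone C++ sketch of the same logic:

    #include <cstdio>
    #include <functional>

    /* Serial counterpart of gpu_parallel_active_index_array: collect the
     * indices of all states for which the predicate holds. */
    static void serial_active_index_array(unsigned num_states, int *indices,
                                          int *num_indices,
                                          const std::function<bool(int)> &is_active) {
      int write_index = 0;
      for (int state_index = 0; state_index < int(num_states); state_index++) {
        if (is_active(state_index))
          indices[write_index++] = state_index;
      }
      *num_indices = write_index;
    }

    int main() {
      int indices[8], num = 0;
      serial_active_index_array(8, indices, &num, [](int i) { return i % 3 == 0; });
      for (int j = 0; j < num; j++)
        std::printf("%d ", indices[j]); /* 0 3 6 */
    }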
diff --git a/intern/cycles/kernel/device/metal/context_begin.h b/intern/cycles/kernel/device/metal/context_begin.h
index 99cb1e3826e..e75ec9cadec 100644
--- a/intern/cycles/kernel/device/metal/context_begin.h
+++ b/intern/cycles/kernel/device/metal/context_begin.h
@@ -34,21 +34,48 @@ class MetalKernelContext {
kernel_assert(0);
return 0;
}
-
+
+#ifdef __KERNEL_METAL_INTEL__
+ template<typename TextureType, typename CoordsType>
+ inline __attribute__((__always_inline__))
+ auto ccl_gpu_tex_object_read_intel_workaround(TextureType texture_array,
+ const uint tid, const uint sid,
+ CoordsType coords) const
+ {
+ switch(sid) {
+ default:
+ case 0: return texture_array[tid].tex.sample(sampler(address::repeat, filter::nearest), coords);
+ case 1: return texture_array[tid].tex.sample(sampler(address::clamp_to_edge, filter::nearest), coords);
+ case 2: return texture_array[tid].tex.sample(sampler(address::clamp_to_zero, filter::nearest), coords);
+ case 3: return texture_array[tid].tex.sample(sampler(address::repeat, filter::linear), coords);
+ case 4: return texture_array[tid].tex.sample(sampler(address::clamp_to_edge, filter::linear), coords);
+ case 5: return texture_array[tid].tex.sample(sampler(address::clamp_to_zero, filter::linear), coords);
+ }
+ }
+#endif
+
// texture2d
template<>
inline __attribute__((__always_inline__))
float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, float x, float y) const {
const uint tid(tex);
const uint sid(tex >> 32);
+#ifndef __KERNEL_METAL_INTEL__
return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y));
+#else
+ return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_2d, tid, sid, float2(x, y));
+#endif
}
template<>
inline __attribute__((__always_inline__))
float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, float x, float y) const {
const uint tid(tex);
const uint sid(tex >> 32);
+#ifndef __KERNEL_METAL_INTEL__
return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
+#else
+ return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_2d, tid, sid, float2(x, y)).x;
+#endif
}
// texture3d
@@ -57,14 +84,22 @@ class MetalKernelContext {
float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object_3D tex, float x, float y, float z) const {
const uint tid(tex);
const uint sid(tex >> 32);
+#ifndef __KERNEL_METAL_INTEL__
return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z));
+#else
+ return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_3d, tid, sid, float3(x, y, z));
+#endif
}
template<>
inline __attribute__((__always_inline__))
float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object_3D tex, float x, float y, float z) const {
const uint tid(tex);
const uint sid(tex >> 32);
+#ifndef __KERNEL_METAL_INTEL__
return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x;
+#else
+ return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_3d, tid, sid, float3(x, y, z)).x;
+#endif
}
# include "kernel/device/gpu/image.h"
diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal
index 5646c7446db..8b69ee025cd 100644
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -228,7 +228,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
/* Always use baked shadow transparency for curves. */
if (type & PRIMITIVE_CURVE) {
float throughput = payload.throughput;
- throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, u);
+ throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, type, u);
payload.throughput = throughput;
payload.num_hits += 1;
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h
index 8ae40b0612e..dfaec65130c 100644
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -55,18 +55,6 @@
#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
#define ccl_gpu_kernel_threads(block_num_threads)
-#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-# define KG_ND_ITEMS \
- kg->nd_item_local_id_0 = item.get_local_id(0); \
- kg->nd_item_local_range_0 = item.get_local_range(0); \
- kg->nd_item_group_0 = item.get_group(0); \
- kg->nd_item_group_range_0 = item.get_group_range(0); \
- kg->nd_item_global_id_0 = item.get_global_id(0); \
- kg->nd_item_global_range_0 = item.get_global_range(0);
-#else
-# define KG_ND_ITEMS
-#endif
-
#define ccl_gpu_kernel_signature(name, ...) \
void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
size_t kernel_global_size, \
@@ -76,8 +64,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
(kg); \
cgh.parallel_for<class kernel_##name>( \
sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
- [=](sycl::nd_item<1> item) { \
- KG_ND_ITEMS
+ [=](sycl::nd_item<1> item) {
#define ccl_gpu_kernel_postfix \
}); \
@@ -95,31 +82,17 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
} ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
/* GPU thread, block, grid size and index */
-#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED
-# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
-# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
-# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
-# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
-# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
-# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
-#else
-# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
-# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
-# define ccl_gpu_block_idx_x (kg->nd_item_group_0)
-# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
-# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
-# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
-#endif
+#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
+#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
+#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
+#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
+#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
+#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
+#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
/* GPU warp synchronization */
-
#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
#ifdef __SYCL_DEVICE_ONLY__
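[Editor note] With the host path gone, the index macros above map one-to-one onto SYCL nd_item queries. A minimal portable sketch of the same queries, taken from the parallel_for item rather than the this_nd_item extension (default queue assumed):

    #include <sycl/sycl.hpp>
    #include <cstdio>

    int main() {
      sycl::queue q;
      const size_t N = 64, local = 16;
      int *group_of = sycl::malloc_shared<int>(N, q);
      q.parallel_for(sycl::nd_range<1>(N, local), [=](sycl::nd_item<1> it) {
        /* ccl_gpu_global_id_x() == it.get_global_id(0), ccl_gpu_block_idx_x ==
         * it.get_group(0), ccl_gpu_block_dim_x == it.get_local_range(0), etc. */
        group_of[it.get_global_id(0)] = int(it.get_group(0));
      }).wait();
      std::printf("item 17 is in group %d\n", group_of[17]); /* 1 */
      sycl::free(group_of, q);
    }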
diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h
index d60f4f135ba..116620eb725 100644
--- a/intern/cycles/kernel/device/oneapi/globals.h
+++ b/intern/cycles/kernel/device/oneapi/globals.h
@@ -23,15 +23,6 @@ typedef struct KernelGlobalsGPU {
#undef KERNEL_DATA_ARRAY
IntegratorStateGPU *integrator_state;
const KernelData *__data;
-#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
- size_t nd_item_local_id_0;
- size_t nd_item_local_range_0;
- size_t nd_item_group_0;
- size_t nd_item_group_range_0;
-
- size_t nd_item_global_id_0;
- size_t nd_item_global_range_0;
-#endif
} KernelGlobalsGPU;
typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;
diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp
index 1d1700f036d..525ae288f0c 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -8,7 +8,7 @@
# include <map>
# include <set>
-# include <CL/sycl.hpp>
+# include <sycl/sycl.hpp>
# include "kernel/device/oneapi/compat.h"
# include "kernel/device/oneapi/globals.h"
@@ -25,38 +25,57 @@ void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
s_error_user_ptr = user_ptr;
}
-/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and
- * also trigger runtime compilation of all existing oneAPI kernels */
+/* NOTE(@nsirgien): Execution of this simple kernel checks basic functionality such as
+ * memory allocation, memory transfers and kernel execution with USM memory. */
bool oneapi_run_test_kernel(SyclQueue *queue_)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
- size_t N = 8;
- sycl::buffer<float, 1> A(N);
- sycl::buffer<float, 1> B(N);
-
- {
- sycl::host_accessor A_host_acc(A, sycl::write_only);
- for (size_t i = (size_t)0; i < N; i++)
- A_host_acc[i] = rand() % 32;
- }
+ const size_t N = 8;
+ const size_t memory_byte_size = sizeof(int) * N;
+ bool is_computation_correct = true;
try {
- queue->submit([&](sycl::handler &cgh) {
- sycl::accessor A_acc(A, cgh, sycl::read_only);
- sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init);
+ int *A_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue);
+
+ for (size_t i = (size_t)0; i < N; i++) {
+ A_host[i] = rand() % 32;
+ }
+
+ int *A_device = (int *)sycl::malloc_device(memory_byte_size, *queue);
+ int *B_device = (int *)sycl::malloc_device(memory_byte_size, *queue);
- cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); });
+ queue->memcpy(A_device, A_host, memory_byte_size);
+ queue->wait_and_throw();
+
+ queue->submit([&](sycl::handler &cgh) {
+ cgh.parallel_for(N, [=](sycl::id<1> idx) { B_device[idx] = A_device[idx] + idx.get(0); });
});
queue->wait_and_throw();
- sycl::host_accessor A_host_acc(A, sycl::read_only);
- sycl::host_accessor B_host_acc(B, sycl::read_only);
+ int *B_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue);
+
+ queue->memcpy(B_host, B_device, memory_byte_size);
+ queue->wait_and_throw();
for (size_t i = (size_t)0; i < N; i++) {
- float result = A_host_acc[i] + B_host_acc[i];
- (void)result;
+ const int expected_result = i + A_host[i];
+ if (B_host[i] != expected_result) {
+ is_computation_correct = false;
+ if (s_error_cb) {
+ s_error_cb(("Incorrect result in test kernel execution - expected " +
+ std::to_string(expected_result) + ", got " + std::to_string(B_host[i]))
+ .c_str(),
+ s_error_user_ptr);
+ }
+ }
}
+
+ sycl::free(A_host, *queue);
+ sycl::free(B_host, *queue);
+ sycl::free(A_device, *queue);
+ sycl::free(B_device, *queue);
+ queue->wait_and_throw();
}
catch (sycl::exception const &e) {
if (s_error_cb) {
@@ -65,7 +84,7 @@ bool oneapi_run_test_kernel(SyclQueue *queue_)
return false;
}
- return true;
+ return is_computation_correct;
}
/* TODO: Move device information to OneapiDevice initialized on creation and use it. */
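[Editor note] A standalone sketch of the same USM round trip outside the Cycles wrappers, under the assumption of a default queue and no error callback: host alloc, device alloc, memcpy, kernel, memcpy back, verify.

    #include <sycl/sycl.hpp>
    #include <cstdio>

    int main() {
      sycl::queue q;
      const size_t N = 8;
      int *a_host = sycl::malloc_host<int>(N, q);
      int *b_host = sycl::malloc_host<int>(N, q);
      int *a_dev = sycl::malloc_device<int>(N, q);
      int *b_dev = sycl::malloc_device<int>(N, q);
      for (size_t i = 0; i < N; i++)
        a_host[i] = int(i) * 7;
      q.memcpy(a_dev, a_host, N * sizeof(int)).wait();
      q.parallel_for(N, [=](sycl::id<1> i) { b_dev[i] = a_dev[i] + int(i[0]); }).wait();
      q.memcpy(b_host, b_dev, N * sizeof(int)).wait();
      bool ok = true;
      for (size_t i = 0; i < N; i++)
        ok &= (b_host[i] == a_host[i] + int(i));
      std::printf("%s\n", ok ? "ok" : "mismatch");
      sycl::free(a_host, q); sycl::free(b_host, q);
      sycl::free(a_dev, q); sycl::free(b_dev, q);
    }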
@@ -123,6 +142,56 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
return std::min(limit_work_group_size, preferred_work_group_size);
}
+bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features)
+{
+# ifdef SYCL_SKIP_KERNELS_PRELOAD
+ (void)queue_;
+ (void)requested_features;
+# else
+ assert(queue_);
+ sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+
+ try {
+ sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle =
+ sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(),
+ {queue->get_device()});
+
+ for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) {
+ const std::string &kernel_name = kernel_id.get_name();
+
+ /* NOTE(@nsirgien): Names in the conditions below should match the names used in the
+ * oneapi_call macro in oneapi_enqueue_kernel below. */
+ if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) &&
+ kernel_name.find("oneapi_kernel_integrator_shade_volume") != std::string::npos) {
+ continue;
+ }
+
+ if (((requested_features & KERNEL_FEATURE_MNEE) == 0) &&
+ kernel_name.find("oneapi_kernel_integrator_shade_surface_mnee") != std::string::npos) {
+ continue;
+ }
+
+ if (((requested_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) &&
+ kernel_name.find("oneapi_kernel_integrator_shade_surface_raytrace") !=
+ std::string::npos) {
+ continue;
+ }
+
+ sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle =
+ sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id});
+ sycl::build(one_kernel_bundle);
+ }
+ }
+ catch (sycl::exception const &e) {
+ if (s_error_cb) {
+ s_error_cb(e.what(), s_error_user_ptr);
+ }
+ return false;
+ }
+# endif
+ return true;
+}
+
bool oneapi_enqueue_kernel(KernelContext *kernel_context,
int kernel,
size_t global_size,
@@ -161,13 +230,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
/* NOTE(@nsirgien): For now, non-uniform work-groups don't work on most oneAPI devices,
* so we extend the work size to fit uniformity requirements. */
global_size = groups_count * local_size;
-
-# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
- if (queue->get_device().is_host()) {
- global_size = 1;
- local_size = 1;
- }
-# endif
}
/* Let the compiler throw an error if there are any kernels missing in this implementation. */
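[Editor note] Caller-side, the new preload boils down to building each kernel bundle ahead of time. A hedged sketch using the standard SYCL 2020 bundle API, minus the feature-based filtering shown above (preload_all_kernels is a hypothetical helper name):

    #include <sycl/sycl.hpp>

    /* Build every kernel in the program for one device up front, so the
     * first render call does not pay the JIT cost. */
    void preload_all_kernels(sycl::queue &q) {
      auto all = sycl::get_kernel_bundle<sycl::bundle_state::input>(
          q.get_context(), {q.get_device()});
      for (const sycl::kernel_id &id : all.get_kernel_ids()) {
        auto one = sycl::get_kernel_bundle<sycl::bundle_state::input>(
            q.get_context(), {id});
        sycl::build(one); /* JIT compile; the runtime caches the result */
      }
    }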
diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h
index 7456d0e4902..2bfc0b89c87 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.h
+++ b/intern/cycles/kernel/device/oneapi/kernel.h
@@ -48,6 +48,8 @@ CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,
int kernel,
size_t global_size,
void **args);
+CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_load_kernels(SyclQueue *queue,
+ const unsigned int requested_features);
# ifdef __cplusplus
}
# endif
diff --git a/intern/cycles/kernel/device/optix/bvh.h b/intern/cycles/kernel/device/optix/bvh.h
index fb9907709ce..6d81b44660c 100644
--- a/intern/cycles/kernel/device/optix/bvh.h
+++ b/intern/cycles/kernel/device/optix/bvh.h
@@ -202,7 +202,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
/* Always use baked shadow transparency for curves. */
if (type & PRIMITIVE_CURVE) {
float throughput = __uint_as_float(optixGetPayload_1());
- throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u);
+ throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, type, u);
optixSetPayload_1(__float_as_uint(throughput));
optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1));
diff --git a/intern/cycles/kernel/integrator/mnee.h b/intern/cycles/kernel/integrator/mnee.h
index 038f0379bbc..23885306885 100644
--- a/intern/cycles/kernel/integrator/mnee.h
+++ b/intern/cycles/kernel/integrator/mnee.h
@@ -279,7 +279,15 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,
}
/* Compute constraint derivatives. */
-ccl_device_forceinline bool mnee_compute_constraint_derivatives(
+
+# if defined(__KERNEL_METAL__)
+/* Temporary workaround for a front-end compilation bug (incorrect MNEE rendering when this is
+ * inlined). */
+__attribute__((noinline))
+# else
+ccl_device_forceinline
+# endif
+bool mnee_compute_constraint_derivatives(
int vertex_count,
ccl_private ManifoldVertex *vertices,
ccl_private const float3 &surface_sample_pos,
diff --git a/intern/cycles/kernel/integrator/state_flow.h b/intern/cycles/kernel/integrator/state_flow.h
index 4b03c665e17..40961b1c5fb 100644
--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -76,6 +76,9 @@ ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
&kernel_integrator_state.next_shadow_path_index[0], 1);
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
+# ifdef __PATH_GUIDING__
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, path_segment) = nullptr;
+# endif
return shadow_state;
}
@@ -181,6 +184,9 @@ ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
{
IntegratorShadowState shadow_state = (is_ao) ? &state->ao : &state->shadow;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
+# ifdef __PATH_GUIDING__
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, path_segment) = nullptr;
+# endif
return shadow_state;
}
diff --git a/intern/cycles/kernel/sample/pattern.h b/intern/cycles/kernel/sample/pattern.h
index ebdecc1bff9..e12f333b3a5 100644
--- a/intern/cycles/kernel/sample/pattern.h
+++ b/intern/cycles/kernel/sample/pattern.h
@@ -100,7 +100,7 @@ ccl_device_inline bool sample_is_class_A(int pattern, int sample)
if (!(pattern == SAMPLING_PATTERN_PMJ || pattern == SAMPLING_PATTERN_SOBOL_BURLEY)) {
/* Fallback: assign samples randomly.
* This is guaranteed to work "okay" for any sampler, but isn't good.
- * (Note: the seed constant is just a random number to guard against
+ * (NOTE: the seed constant is just a random number to guard against
* possible interactions with other uses of the hash. There's nothing
* special about it.)
*/
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index 1469d915d15..8f7cfd19169 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -85,9 +85,9 @@ CCL_NAMESPACE_BEGIN
# define __VOLUME_RECORD_ALL__
#endif /* !__KERNEL_GPU__ */
-/* MNEE currently causes "Compute function exceeds available temporary registers"
- * on Metal, disabled for now. */
-#ifndef __KERNEL_METAL__
+/* MNEE caused "Compute function exceeds available temporary registers" in macOS < 13 due to a bug
+ * in spill buffer allocation sizing. */
+#if !defined(__KERNEL_METAL__) || (__KERNEL_METAL_MACOS__ >= 13)
# define __MNEE__
#endif
diff --git a/intern/cycles/scene/image_oiio.cpp b/intern/cycles/scene/image_oiio.cpp
index 8792393e5a1..7bcf1ccb073 100644
--- a/intern/cycles/scene/image_oiio.cpp
+++ b/intern/cycles/scene/image_oiio.cpp
@@ -192,8 +192,22 @@ bool OIIOImageLoader::load_pixels(const ImageMetaData &metadata,
return false;
}
- const bool do_associate_alpha = associate_alpha &&
- spec.get_int_attribute("oiio:UnassociatedAlpha", 0);
+ bool do_associate_alpha = false;
+ if (associate_alpha) {
+ do_associate_alpha = spec.get_int_attribute("oiio:UnassociatedAlpha", 0);
+
+ if (!do_associate_alpha && spec.alpha_channel != -1) {
+ /* Work around OIIO not detecting TGA file alpha the same way as Blender (since #3019).
+ * We want anything not marked as premultiplied alpha to get associated. */
+ if (strcmp(in->format_name(), "targa") == 0) {
+ do_associate_alpha = spec.get_int_attribute("targa:alpha_type", -1) != 4;
+ }
+ /* OIIO DDS reader never sets UnassociatedAlpha attribute. */
+ if (strcmp(in->format_name(), "dds") == 0) {
+ do_associate_alpha = true;
+ }
+ }
+ }
switch (metadata.type) {
case IMAGE_DATA_TYPE_BYTE:
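[Editor note] For context, "associating" alpha means premultiplying the color channels by alpha at load time. A minimal sketch of that operation on a hypothetical RGBA pixel struct:

    #include <cstdio>

    struct Pixel { float r, g, b, a; }; /* hypothetical layout */

    /* Associated (premultiplied) alpha: color scaled by coverage. */
    static Pixel associate_alpha(Pixel p) {
      p.r *= p.a; p.g *= p.a; p.b *= p.a;
      return p;
    }

    int main() {
      const Pixel p = associate_alpha({1.0f, 0.5f, 0.25f, 0.5f});
      std::printf("%g %g %g %g\n", p.r, p.g, p.b, p.a); /* 0.5 0.25 0.125 0.5 */
    }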
diff --git a/intern/cycles/session/session.cpp b/intern/cycles/session/session.cpp
index a0eb3196a34..acaa55f4990 100644
--- a/intern/cycles/session/session.cpp
+++ b/intern/cycles/session/session.cpp
@@ -43,6 +43,10 @@ Session::Session(const SessionParams &params_, const SceneParams &scene_params)
device = Device::create(params.device, stats, profiler);
+ if (device->have_error()) {
+ progress.set_error(device->error_message());
+ }
+
scene = new Scene(scene_params, device);
/* Configure path tracer. */
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index 0905b3ec5c9..3a2e0e074a2 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -417,15 +417,11 @@ ccl_device_inline int floor_to_int(float f)
return float_to_int(floorf(f));
}
-ccl_device_inline int quick_floor_to_int(float x)
-{
- return float_to_int(x) - ((x < 0) ? 1 : 0);
-}
-
ccl_device_inline float floorfrac(float x, ccl_private int *i)
{
- *i = quick_floor_to_int(x);
- return x - *i;
+ float f = floorf(x);
+ *i = float_to_int(f);
+ return x - f;
}
ccl_device_inline int ceil_to_int(float f)
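[Editor note] The new floorfrac() keeps the invariant that *i is the true floor and the returned fraction lies in [0, 1), now also for negative inputs; the removed truncate-then-subtract variant was off by one for exact negative integers. A quick scalar check (plain C++ stand-ins for the kernel qualifiers):

    #include <cmath>
    #include <cstdio>

    /* Mirrors the new floorfrac(): exact floor, fraction in [0, 1). */
    static float floorfrac(float x, int *i) {
      const float f = std::floor(x);
      *i = int(f);
      return x - f;
    }

    int main() {
      int i;
      const float f = floorfrac(-2.25f, &i);
      std::printf("i=%d f=%g\n", i, f); /* i=-3 f=0.75 */
      floorfrac(-2.0f, &i);
      std::printf("i=%d\n", i); /* i=-2; truncate-then-subtract would give -3 */
    }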
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index c408eadf195..eec7122b9dc 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -535,18 +535,6 @@ ccl_device_inline float3 pow(float3 v, float e)
return make_float3(powf(v.x, e), powf(v.y, e), powf(v.z, e));
}
-ccl_device_inline int3 quick_floor_to_int3(const float3 a)
-{
-#ifdef __KERNEL_SSE__
- int3 b = int3(_mm_cvttps_epi32(a.m128));
- int3 isneg = int3(_mm_castps_si128(_mm_cmplt_ps(a.m128, _mm_set_ps1(0.0f))));
- /* Unsaturated add 0xffffffff is the same as subtract -1. */
- return b + isneg;
-#else
- return make_int3(quick_floor_to_int(a.x), quick_floor_to_int(a.y), quick_floor_to_int(a.z));
-#endif
-}
-
ccl_device_inline bool isfinite_safe(float3 v)
{
return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z);
diff --git a/intern/cycles/util/ssef.h b/intern/cycles/util/ssef.h
index a2fff94303e..1e2bfa90354 100644
--- a/intern/cycles/util/ssef.h
+++ b/intern/cycles/util/ssef.h
@@ -5,6 +5,8 @@
#ifndef __UTIL_SSEF_H__
#define __UTIL_SSEF_H__
+#include <math.h>
+
#include "util/ssei.h"
CCL_NAMESPACE_BEGIN
@@ -521,7 +523,7 @@ __forceinline const ssef round_zero(const ssef &a)
__forceinline const ssef floor(const ssef &a)
{
# ifdef __KERNEL_NEON__
- return vrndnq_f32(a);
+ return vrndmq_f32(a);
# else
return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
# endif
@@ -534,6 +536,12 @@ __forceinline const ssef ceil(const ssef &a)
return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
# endif
}
+# else
+/* Non-SSE4.1 fallback, needed for floorfrac. */
+__forceinline const ssef floor(const ssef &a)
+{
+ return _mm_set_ps(floorf(a.f[3]), floorf(a.f[2]), floorf(a.f[1]), floorf(a.f[0]));
+}
# endif
__forceinline ssei truncatei(const ssef &a)
@@ -541,20 +549,11 @@ __forceinline ssei truncatei(const ssef &a)
return _mm_cvttps_epi32(a.m128);
}
-/* This is about 25% faster than straightforward floor to integer conversion
- * due to better pipelining.
- *
- * Unsaturated add 0xffffffff (a < 0) is the same as subtract -1.
- */
-__forceinline ssei floori(const ssef &a)
-{
- return truncatei(a) + cast((a < 0.0f).m128);
-}
-
__forceinline ssef floorfrac(const ssef &x, ssei *i)
{
- *i = floori(x);
- return x - ssef(*i);
+ ssef f = floor(x);
+ *i = truncatei(f);
+ return x - f;
}
////////////////////////////////////////////////////////////////////////////////
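[Editor note] The ssef changes follow the same pattern as util/math.h: floorfrac() is now floor-based, and pre-SSE4.1 targets get a scalar-per-lane floor. A hedged standalone equivalent of that fallback (SSE intrinsics only, no ssef wrapper):

    #include <cmath>
    #include <cstdio>
    #include <xmmintrin.h>

    /* Scalar-per-lane floor, as in the new non-SSE4.1 path. */
    static __m128 floor_sse2(__m128 a) {
      alignas(16) float f[4];
      _mm_store_ps(f, a);
      return _mm_set_ps(std::floor(f[3]), std::floor(f[2]),
                        std::floor(f[1]), std::floor(f[0]));
    }

    int main() {
      alignas(16) float out[4];
      _mm_store_ps(out, floor_sse2(_mm_set_ps(2.5f, -0.5f, 1.9f, -2.25f)));
      std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* -3 1 -1 2 */
    }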