Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/blender/addon/__init__.py2
-rw-r--r--intern/cycles/blender/addon/engine.py5
-rw-r--r--intern/cycles/blender/addon/operators.py2
-rw-r--r--intern/cycles/blender/addon/properties.py2
-rw-r--r--intern/cycles/blender/addon/ui.py5
-rw-r--r--intern/cycles/device/cuda/device_impl.cpp6
-rw-r--r--intern/cycles/device/cuda/device_impl.h4
-rw-r--r--intern/cycles/device/device.h5
-rw-r--r--intern/cycles/device/hip/device_impl.h2
-rw-r--r--intern/cycles/device/kernel.cpp24
-rw-r--r--intern/cycles/device/kernel.h3
-rw-r--r--intern/cycles/device/metal/kernel.mm68
-rw-r--r--intern/cycles/device/multi/device.cpp15
-rw-r--r--intern/cycles/device/optix/device.cpp7
-rw-r--r--intern/cycles/device/optix/device_impl.cpp525
-rw-r--r--intern/cycles/device/optix/device_impl.h32
-rw-r--r--intern/cycles/device/optix/queue.cpp90
-rw-r--r--intern/cycles/kernel/CMakeLists.txt34
-rw-r--r--intern/cycles/kernel/closure/bsdf.h2
-rw-r--r--intern/cycles/kernel/device/cpu/kernel.cpp9
-rw-r--r--intern/cycles/kernel/device/cuda/compat.h5
-rw-r--r--intern/cycles/kernel/device/hip/compat.h1
-rw-r--r--intern/cycles/kernel/device/metal/compat.h1
-rw-r--r--intern/cycles/kernel/device/oneapi/compat.h1
-rw-r--r--intern/cycles/kernel/device/optix/compat.h31
-rw-r--r--intern/cycles/kernel/device/optix/globals.h7
-rw-r--r--intern/cycles/kernel/device/optix/kernel_osl.cu83
-rw-r--r--intern/cycles/kernel/integrator/displacement_shader.h4
-rw-r--r--intern/cycles/kernel/integrator/init_from_bake.h7
-rw-r--r--intern/cycles/kernel/integrator/surface_shader.h9
-rw-r--r--intern/cycles/kernel/integrator/volume_shader.h4
-rw-r--r--intern/cycles/kernel/osl/closures.cpp282
-rw-r--r--intern/cycles/kernel/osl/closures_setup.h23
-rw-r--r--intern/cycles/kernel/osl/closures_template.h4
-rw-r--r--intern/cycles/kernel/osl/osl.h183
-rw-r--r--intern/cycles/kernel/osl/services.cpp62
-rw-r--r--intern/cycles/kernel/osl/services.h10
-rw-r--r--intern/cycles/kernel/osl/services_gpu.h2176
-rw-r--r--intern/cycles/kernel/osl/services_optix.cu17
-rw-r--r--intern/cycles/kernel/osl/shaders/node_geometry.osl5
-rw-r--r--intern/cycles/kernel/osl/shaders/node_normal_map.osl7
-rw-r--r--intern/cycles/kernel/osl/shaders/node_tangent.osl5
-rw-r--r--intern/cycles/kernel/osl/shaders/node_texture_coordinate.osl7
-rw-r--r--intern/cycles/kernel/osl/types.h102
-rw-r--r--intern/cycles/kernel/svm/noise.h268
-rw-r--r--intern/cycles/kernel/types.h17
-rw-r--r--intern/cycles/scene/osl.cpp284
-rw-r--r--intern/cycles/scene/osl.h18
-rw-r--r--intern/cycles/scene/scene.cpp7
-rw-r--r--intern/cycles/scene/shader.cpp9
-rw-r--r--intern/cycles/scene/shader.h2
-rw-r--r--intern/cycles/scene/shader_nodes.cpp10
-rw-r--r--intern/cycles/scene/shader_nodes.h9
-rw-r--r--intern/cycles/session/merge.h2
-rw-r--r--intern/cycles/test/CMakeLists.txt15
-rw-r--r--intern/cycles/test/util_avxf_test.h211
-rw-r--r--intern/cycles/test/util_float8_avx2_test.cpp (renamed from intern/cycles/test/util_avxf_avx2_test.cpp)4
-rw-r--r--intern/cycles/test/util_float8_avx_test.cpp (renamed from intern/cycles/test/util_avxf_avx_test.cpp)3
-rw-r--r--intern/cycles/test/util_float8_sse2_test.cpp12
-rw-r--r--intern/cycles/test/util_float8_test.h103
-rw-r--r--intern/cycles/util/CMakeLists.txt9
-rw-r--r--intern/cycles/util/avxb.h230
-rw-r--r--intern/cycles/util/avxf.h379
-rw-r--r--intern/cycles/util/avxi.h732
-rw-r--r--intern/cycles/util/color.h48
-rw-r--r--intern/cycles/util/defines.h1
-rw-r--r--intern/cycles/util/half.h16
-rw-r--r--intern/cycles/util/hash.h52
-rw-r--r--intern/cycles/util/math.h4
-rw-r--r--intern/cycles/util/math_float2.h133
-rw-r--r--intern/cycles/util/math_float3.h218
-rw-r--r--intern/cycles/util/math_float4.h475
-rw-r--r--intern/cycles/util/math_float8.h483
-rw-r--r--intern/cycles/util/math_int2.h17
-rw-r--r--intern/cycles/util/math_int3.h29
-rw-r--r--intern/cycles/util/math_int4.h216
-rw-r--r--intern/cycles/util/math_int8.h355
-rw-r--r--intern/cycles/util/math_intersect.h11
-rw-r--r--intern/cycles/util/path.cpp56
-rw-r--r--intern/cycles/util/path.h11
-rw-r--r--intern/cycles/util/sseb.h345
-rw-r--r--intern/cycles/util/ssef.h1090
-rw-r--r--intern/cycles/util/ssei.h633
-rw-r--r--intern/cycles/util/transform.cpp2
-rw-r--r--intern/cycles/util/transform.h49
-rw-r--r--intern/cycles/util/transform_inverse.h27
-rw-r--r--intern/cycles/util/types.h14
-rw-r--r--intern/cycles/util/types_float8.h29
-rw-r--r--intern/cycles/util/types_float8_impl.h63
-rw-r--r--intern/cycles/util/types_int8.h51
-rw-r--r--intern/cycles/util/types_int8_impl.h95
-rw-r--r--intern/ffmpeg/CMakeLists.txt1
-rw-r--r--intern/ffmpeg/ffmpeg_compat.h8
-rw-r--r--intern/ffmpeg/tests/ffmpeg_codecs.cc6
-rw-r--r--intern/ghost/intern/GHOST_SystemWayland.cpp66
-rw-r--r--intern/ghost/intern/GHOST_WindowWayland.cpp12
-rw-r--r--intern/mantaflow/intern/MANTA_main.cpp2
-rw-r--r--intern/wayland_dynload/extern/wayland_dynload_client.h3
98 files changed, 5543 insertions, 5280 deletions
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index 05f27bdbd4d..354c9c23a53 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -58,7 +58,7 @@ class CyclesRender(bpy.types.RenderEngine):
if not self.session:
if self.is_preview:
cscene = bpy.context.scene.cycles
- use_osl = cscene.shading_system and cscene.device == 'CPU'
+ use_osl = cscene.shading_system
engine.create(self, data, preview_osl=use_osl)
else:
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index e33891fa7a2..4ac078ed8a5 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -156,6 +156,11 @@ def with_osl():
return _cycles.with_osl
+def osl_version():
+ import _cycles
+ return _cycles.osl_version
+
+
def with_path_guiding():
import _cycles
return _cycles.with_path_guiding
diff --git a/intern/cycles/blender/addon/operators.py b/intern/cycles/blender/addon/operators.py
index ab474cda0ab..3680d11359e 100644
--- a/intern/cycles/blender/addon/operators.py
+++ b/intern/cycles/blender/addon/operators.py
@@ -114,7 +114,7 @@ class CYCLES_OT_denoise_animation(Operator):
class CYCLES_OT_merge_images(Operator):
- "Combine OpenEXR multilayer images rendered with different sample " \
+ "Combine OpenEXR multi-layer images rendered with different sample " \
"ranges into one image with reduced noise"
bl_idname = "cycles.merge_images"
bl_label = "Merge Images"
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index f5cd88f6b6a..9d7c71417f2 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -290,7 +290,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
)
shading_system: BoolProperty(
name="Open Shading Language",
- description="Use Open Shading Language (CPU rendering only)",
+ description="Use Open Shading Language",
)
preview_pause: BoolProperty(
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 305accc8f1a..10a37688f45 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -2305,7 +2305,10 @@ def draw_device(self, context):
col.prop(cscene, "device")
from . import engine
- if engine.with_osl() and use_cpu(context):
+ if engine.with_osl() and (
+ use_cpu(context) or
+ (use_optix(context) and (engine.osl_version()[1] >= 13 or engine.osl_version()[0] > 1))
+ ):
col.prop(cscene, "shading_system")
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index 01c021551f3..c9764d1c21b 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -232,7 +232,7 @@ string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
return cflags;
}
-string CUDADevice::compile_kernel(const uint kernel_features,
+string CUDADevice::compile_kernel(const string &common_cflags,
const char *name,
const char *base,
bool force_ptx)
@@ -281,7 +281,6 @@ string CUDADevice::compile_kernel(const uint kernel_features,
/* We include cflags into md5 so changing cuda toolkit or changing other
* compiler command line arguments makes sure cubin gets re-built.
*/
- string common_cflags = compile_kernel_get_common_cflags(kernel_features);
const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
@@ -417,7 +416,8 @@ bool CUDADevice::load_kernels(const uint kernel_features)
/* get kernel */
const char *kernel_name = "kernel";
- string cubin = compile_kernel(kernel_features, kernel_name);
+ string cflags = compile_kernel_get_common_cflags(kernel_features);
+ string cubin = compile_kernel(cflags, kernel_name);
if (cubin.empty())
return false;
diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h
index a754c33f79d..c18f2811161 100644
--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -77,9 +77,9 @@ class CUDADevice : public Device {
bool use_adaptive_compilation();
- virtual string compile_kernel_get_common_cflags(const uint kernel_features);
+ string compile_kernel_get_common_cflags(const uint kernel_features);
- string compile_kernel(const uint kernel_features,
+ string compile_kernel(const string &cflags,
const char *name,
const char *base = "cuda",
bool force_ptx = false);
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 2e4d18241cf..06a2f5c7b01 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -160,6 +160,11 @@ class Device {
return true;
}
+ virtual bool load_osl_kernels()
+ {
+ return true;
+ }
+
/* GPU device only functions.
* These may not be used on CPU or multi-devices. */
diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h
index 9afef3789af..efdc15dca79 100644
--- a/intern/cycles/device/hip/device_impl.h
+++ b/intern/cycles/device/hip/device_impl.h
@@ -74,7 +74,7 @@ class HIPDevice : public Device {
bool use_adaptive_compilation();
- virtual string compile_kernel_get_common_cflags(const uint kernel_features);
+ string compile_kernel_get_common_cflags(const uint kernel_features);
string compile_kernel(const uint kernel_features, const char *name, const char *base = "hip");
diff --git a/intern/cycles/device/kernel.cpp b/intern/cycles/device/kernel.cpp
index 96a99cd62cd..27ca0d81817 100644
--- a/intern/cycles/device/kernel.cpp
+++ b/intern/cycles/device/kernel.cpp
@@ -7,6 +7,30 @@
CCL_NAMESPACE_BEGIN
+bool device_kernel_has_shading(DeviceKernel kernel)
+{
+ return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW ||
+ kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE ||
+ kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND ||
+ kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY);
+}
+
+bool device_kernel_has_intersection(DeviceKernel kernel)
+{
+ return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE);
+}
+
const char *device_kernel_as_string(DeviceKernel kernel)
{
switch (kernel) {
diff --git a/intern/cycles/device/kernel.h b/intern/cycles/device/kernel.h
index 4ae461f1f67..b829a891260 100644
--- a/intern/cycles/device/kernel.h
+++ b/intern/cycles/device/kernel.h
@@ -11,6 +11,9 @@
CCL_NAMESPACE_BEGIN
+bool device_kernel_has_shading(DeviceKernel kernel);
+bool device_kernel_has_intersection(DeviceKernel kernel);
+
const char *device_kernel_as_string(DeviceKernel kernel);
std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 55938d1a03a..35cf832c537 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -45,6 +45,36 @@ bool kernel_has_intersection(DeviceKernel device_kernel)
struct ShaderCache {
ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
{
+ /* Initialize occupancy tuning LUT. */
+ if (MetalInfo::get_device_vendor(mtlDevice) == METAL_GPU_APPLE) {
+ switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) {
+ default:
+ case APPLE_M2:
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024};
+ break;
+ case APPLE_M1:
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384};
+ occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832};
+ break;
+ }
+ }
}
~ShaderCache();
@@ -73,6 +103,11 @@ struct ShaderCache {
std::function<void(MetalKernelPipeline *)> completionHandler;
};
+ struct OccupancyTuningParameters {
+ int threads_per_threadgroup = 0;
+ int num_threads_per_block = 0;
+ } occupancy_tuning[DEVICE_KERNEL_NUM];
+
std::mutex cache_mutex;
PipelineCollection pipelines[DEVICE_KERNEL_NUM];
@@ -230,6 +265,13 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
request.pipeline->device_kernel = device_kernel;
request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
+ if (occupancy_tuning[device_kernel].threads_per_threadgroup) {
+ request.pipeline->threads_per_threadgroup =
+ occupancy_tuning[device_kernel].threads_per_threadgroup;
+ request.pipeline->num_threads_per_block =
+ occupancy_tuning[device_kernel].num_threads_per_block;
+ }
+
/* metalrt options */
request.pipeline->use_metalrt = device->use_metalrt;
request.pipeline->metalrt_hair = device->use_metalrt &&
@@ -374,13 +416,6 @@ void MetalKernelPipeline::compile()
const std::string function_name = std::string("cycles_metal_") +
device_kernel_as_string(device_kernel);
- int threads_per_threadgroup = this->threads_per_threadgroup;
- if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL &&
- device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) {
- /* Always use 512 for the sorting kernels */
- threads_per_threadgroup = 512;
- }
-
NSString *entryPoint = [@(function_name.c_str()) copy];
NSError *error = NULL;
@@ -583,7 +618,9 @@ void MetalKernelPipeline::compile()
metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
path_create_directories(metalbin_path);
- if (path_exists(metalbin_path) && use_binary_archive) {
+ /* Retrieve shader binary from disk, and update the file timestamp for LRU purging to work as
+ * intended. */
+ if (use_binary_archive && path_cache_kernel_exists_and_mark_used(metalbin_path)) {
if (@available(macOS 11.0, *)) {
MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
@@ -644,12 +681,14 @@ void MetalKernelPipeline::compile()
return;
}
- int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
- computePipelineState.threadExecutionWidth);
- num_threads_per_block = std::max(num_threads_per_block,
- (int)computePipelineState.threadExecutionWidth);
+ if (!num_threads_per_block) {
+ num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
+ computePipelineState.threadExecutionWidth);
+ num_threads_per_block = std::max(num_threads_per_block,
+ (int)computePipelineState.threadExecutionWidth);
+ }
+
this->pipeline = computePipelineState;
- this->num_threads_per_block = num_threads_per_block;
if (@available(macOS 11.0, *)) {
if (creating_new_archive || recreate_archive) {
@@ -658,6 +697,9 @@ void MetalKernelPipeline::compile()
metal_printf("Failed to save binary archive, error:\n%s\n",
[[error localizedDescription] UTF8String]);
}
+ else {
+ path_cache_kernel_mark_added_and_clear_old(metalbin_path);
+ }
}
}
};
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
index 6904d2c2dc6..9605c6a7538 100644
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -138,6 +138,15 @@ class MultiDevice : public Device {
return true;
}
+ bool load_osl_kernels() override
+ {
+ foreach (SubDevice &sub, devices)
+ if (!sub.device->load_osl_kernels())
+ return false;
+
+ return true;
+ }
+
void build_bvh(BVH *bvh, Progress &progress, bool refit) override
{
/* Try to build and share a single acceleration structure, if possible */
@@ -204,10 +213,12 @@ class MultiDevice : public Device {
virtual void *get_cpu_osl_memory() override
{
- if (devices.size() > 1) {
+ /* Always return the OSL memory of the CPU device (this works since the constructor above
+ * guarantees that CPU devices are always added to the back). */
+ if (devices.size() > 1 && devices.back().device->info.type != DEVICE_CPU) {
return NULL;
}
- return devices.front().device->get_cpu_osl_memory();
+ return devices.back().device->get_cpu_osl_memory();
}
bool is_resident(device_ptr key, Device *sub_device) override
diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp
index 68ca21374fd..58b72374a7d 100644
--- a/intern/cycles/device/optix/device.cpp
+++ b/intern/cycles/device/optix/device.cpp
@@ -9,6 +9,10 @@
#include "util/log.h"
+#ifdef WITH_OSL
+# include <OSL/oslversion.h>
+#endif
+
#ifdef WITH_OPTIX
# include <optix_function_table_definition.h>
#endif
@@ -65,6 +69,9 @@ void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo
info.type = DEVICE_OPTIX;
info.id += "_OptiX";
+# if defined(WITH_OSL) && (OSL_VERSION_MINOR >= 13 || OSL_VERSION_MAJOR > 1)
+ info.has_osl = true;
+# endif
info.denoisers |= DENOISER_OPTIX;
devices.push_back(info);
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index 6c64e7106d5..02f34bf3bd0 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -312,16 +312,34 @@ OptiXDevice::~OptiXDevice()
if (optix_module != NULL) {
optixModuleDestroy(optix_module);
}
- for (unsigned int i = 0; i < 2; ++i) {
+ for (int i = 0; i < 2; ++i) {
if (builtin_modules[i] != NULL) {
optixModuleDestroy(builtin_modules[i]);
}
}
- for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ for (int i = 0; i < NUM_PIPELINES; ++i) {
if (pipelines[i] != NULL) {
optixPipelineDestroy(pipelines[i]);
}
}
+ for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ if (groups[i] != NULL) {
+ optixProgramGroupDestroy(groups[i]);
+ }
+ }
+
+# ifdef WITH_OSL
+ for (const OptixModule &module : osl_modules) {
+ if (module != NULL) {
+ optixModuleDestroy(module);
+ }
+ }
+ for (const OptixProgramGroup &group : osl_groups) {
+ if (group != NULL) {
+ optixProgramGroupDestroy(group);
+ }
+ }
+# endif
/* Make sure denoiser is destroyed before device context! */
if (denoiser_.optix_denoiser != nullptr) {
@@ -381,13 +399,51 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
return false;
}
+# ifdef WITH_OSL
+ const bool use_osl = (kernel_features & KERNEL_FEATURE_OSL);
+# else
+ const bool use_osl = false;
+# endif
+
+ /* Skip creating OptiX module if only doing denoising. */
+ const bool need_optix_kernels = (kernel_features &
+ (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING));
+
+ /* Detect existence of OptiX kernel and SDK here early. So we can error out
+ * before compiling the CUDA kernels, to avoid failing right after when
+ * compiling the OptiX kernel. */
+ string suffix = use_osl ? "_osl" :
+ (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
+ "_shader_raytrace" :
+ "";
+ string ptx_filename;
+ if (need_optix_kernels) {
+ ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx");
+ if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
+ std::string optix_include_dir = get_optix_include_dir();
+ if (optix_include_dir.empty()) {
+ set_error(
+ "Unable to compile OptiX kernels at runtime. Set OPTIX_ROOT_DIR environment variable "
+ "to a directory containing the OptiX SDK.");
+ return false;
+ }
+ else if (!path_is_directory(optix_include_dir)) {
+ set_error(string_printf(
+ "OptiX headers not found at %s, unable to compile OptiX kernels at runtime. Install "
+ "OptiX SDK in the specified location, or set OPTIX_ROOT_DIR environment variable to a "
+ "directory containing the OptiX SDK.",
+ optix_include_dir.c_str()));
+ return false;
+ }
+ }
+ }
+
/* Load CUDA modules because we need some of the utility kernels. */
if (!CUDADevice::load_kernels(kernel_features)) {
return false;
}
- /* Skip creating OptiX module if only doing denoising. */
- if (!(kernel_features & (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING))) {
+ if (!need_optix_kernels) {
return true;
}
@@ -398,18 +454,41 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
optixModuleDestroy(optix_module);
optix_module = NULL;
}
- for (unsigned int i = 0; i < 2; ++i) {
+ for (int i = 0; i < 2; ++i) {
if (builtin_modules[i] != NULL) {
optixModuleDestroy(builtin_modules[i]);
builtin_modules[i] = NULL;
}
}
- for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ for (int i = 0; i < NUM_PIPELINES; ++i) {
if (pipelines[i] != NULL) {
optixPipelineDestroy(pipelines[i]);
pipelines[i] = NULL;
}
}
+ for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ if (groups[i] != NULL) {
+ optixProgramGroupDestroy(groups[i]);
+ groups[i] = NULL;
+ }
+ }
+
+# ifdef WITH_OSL
+ /* Recreating base OptiX module invalidates all OSL modules too, since they link against it. */
+ for (const OptixModule &module : osl_modules) {
+ if (module != NULL) {
+ optixModuleDestroy(module);
+ }
+ }
+ osl_modules.clear();
+
+ for (const OptixProgramGroup &group : osl_groups) {
+ if (group != NULL) {
+ optixProgramGroupDestroy(group);
+ }
+ }
+ osl_groups.clear();
+# endif
OptixModuleCompileOptions module_options = {};
module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */
@@ -430,7 +509,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
module_options.numPayloadTypes = 0;
# endif
- OptixPipelineCompileOptions pipeline_options = {};
/* Default to no motion blur and two-level graph, since it is the fastest option. */
pipeline_options.usesMotionBlur = false;
pipeline_options.traversableGraphFlags =
@@ -459,9 +537,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
/* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
* This is necessary since objects may be reported to have motion if the Vector pass is
* active, but may still need to be rendered without motion blur if that isn't active as well. */
- motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0;
-
- if (motion_blur) {
+ if (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) {
pipeline_options.usesMotionBlur = true;
/* Motion blur can insert motion transforms into the traversal graph.
* It is no longer a two-level graph then, so need to set flags to allow any configuration. */
@@ -469,33 +545,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
}
{ /* Load and compile PTX module with OptiX kernels. */
- string ptx_data, ptx_filename = path_get(
- (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
- "lib/kernel_optix_shader_raytrace.ptx" :
- "lib/kernel_optix.ptx");
+ string ptx_data;
if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
- std::string optix_include_dir = get_optix_include_dir();
- if (optix_include_dir.empty()) {
- set_error(
- "Unable to compile OptiX kernels at runtime. Set OPTIX_ROOT_DIR environment variable "
- "to a directory containing the OptiX SDK.");
- return false;
- }
- else if (!path_is_directory(optix_include_dir)) {
- set_error(string_printf(
- "OptiX headers not found at %s, unable to compile OptiX kernels at runtime. Install "
- "OptiX SDK in the specified location, or set OPTIX_ROOT_DIR environment variable to a "
- "directory containing the OptiX SDK.",
- optix_include_dir.c_str()));
- return false;
- }
- ptx_filename = compile_kernel(
- kernel_features,
- (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
- "kernel_shader_raytrace" :
- "kernel",
- "optix",
- true);
+ string cflags = compile_kernel_get_common_cflags(kernel_features);
+ ptx_filename = compile_kernel(cflags, ("kernel" + suffix).c_str(), "optix", true);
}
if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
@@ -537,7 +590,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
}
/* Create program groups. */
- OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
@@ -595,7 +647,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
- if (motion_blur) {
+ if (pipeline_options.usesMotionBlur) {
builtin_options.usesMotionBlur = true;
optix_assert(optixBuiltinISModuleGet(
@@ -616,7 +668,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
}
}
- /* Pointclouds */
if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
group_descs[PG_HITD_POINTCLOUD] = group_descs[PG_HITD];
group_descs[PG_HITD_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
@@ -628,8 +679,8 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
group_descs[PG_HITS_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
}
+ /* Add hit group for local intersections. */
if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) {
- /* Add hit group for local intersections. */
group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
@@ -641,16 +692,19 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module;
group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName =
"__raygen__kernel_optix_integrator_shade_surface_raytrace";
- group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
- group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
- group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
- group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
- "__direct_callable__svm_node_bevel";
+
+ /* Kernels with OSL support are built without SVM, so can skip those direct callables there. */
+ if (!use_osl) {
+ group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
+ group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
+ "__direct_callable__svm_node_bevel";
+ }
}
- /* MNEE. */
if (kernel_features & KERNEL_FEATURE_MNEE) {
group_descs[PG_RGEN_SHADE_SURFACE_MNEE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.module = optix_module;
@@ -658,6 +712,42 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
"__raygen__kernel_optix_integrator_shade_surface_mnee";
}
+ /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
+ if (use_osl) {
+ group_descs[PG_RGEN_SHADE_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.module = optix_module;
+ group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_shade_background";
+ group_descs[PG_RGEN_SHADE_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_SHADE_LIGHT].raygen.module = optix_module;
+ group_descs[PG_RGEN_SHADE_LIGHT].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_shade_light";
+ group_descs[PG_RGEN_SHADE_SURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_SHADE_SURFACE].raygen.module = optix_module;
+ group_descs[PG_RGEN_SHADE_SURFACE].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_shade_surface";
+ group_descs[PG_RGEN_SHADE_VOLUME].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_SHADE_VOLUME].raygen.module = optix_module;
+ group_descs[PG_RGEN_SHADE_VOLUME].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_shade_volume";
+ group_descs[PG_RGEN_SHADE_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_SHADE_SHADOW].raygen.module = optix_module;
+ group_descs[PG_RGEN_SHADE_SHADOW].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_shade_shadow";
+ group_descs[PG_RGEN_EVAL_DISPLACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_EVAL_DISPLACE].raygen.module = optix_module;
+ group_descs[PG_RGEN_EVAL_DISPLACE].raygen.entryFunctionName =
+ "__raygen__kernel_optix_shader_eval_displace";
+ group_descs[PG_RGEN_EVAL_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.module = optix_module;
+ group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.entryFunctionName =
+ "__raygen__kernel_optix_shader_eval_background";
+ group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.module = optix_module;
+ group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.entryFunctionName =
+ "__raygen__kernel_optix_shader_eval_curve_shadow_transparency";
+ }
+
optix_assert(optixProgramGroupCreate(
context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
@@ -666,7 +756,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
/* Set up SBT, which in this case is used only to select between different programs. */
sbt_data.alloc(NUM_PROGRAM_GROUPS);
memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
- for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
}
@@ -690,25 +780,26 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
OptixPipelineLinkOptions link_options = {};
link_options.maxTraceDepth = 1;
+ link_options.debugLevel = module_options.debugLevel;
- if (DebugFlags().optix.use_debug) {
- link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
- }
- else {
- link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
- }
-
- if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
- /* Create shader raytracing pipeline. */
+ if (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE) && !use_osl) {
+ /* Create shader raytracing and MNEE pipeline. */
vector<OptixProgramGroup> pipeline_groups;
pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
- pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+ pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
+ pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
+ }
+ if (kernel_features & KERNEL_FEATURE_MNEE) {
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
+ }
pipeline_groups.push_back(groups[PG_MISS]);
pipeline_groups.push_back(groups[PG_HITD]);
pipeline_groups.push_back(groups[PG_HITS]);
pipeline_groups.push_back(groups[PG_HITL]);
pipeline_groups.push_back(groups[PG_HITV]);
- if (motion_blur) {
+ if (pipeline_options.usesMotionBlur) {
pipeline_groups.push_back(groups[PG_HITD_MOTION]);
pipeline_groups.push_back(groups[PG_HITS_MOTION]);
}
@@ -716,8 +807,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
}
- pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
- pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
optix_assert(optixPipelineCreate(context,
&pipeline_options,
@@ -726,30 +815,33 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
pipeline_groups.size(),
nullptr,
0,
- &pipelines[PIP_SHADE_RAYTRACE]));
+ &pipelines[PIP_SHADE]));
/* Combine ray generation and trace continuation stack size. */
- const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG +
+ const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
+ stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG) +
link_options.maxTraceDepth * trace_css;
const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
stack_size[PG_CALL_SVM_BEVEL].dssDC);
/* Set stack size depending on pipeline options. */
optix_assert(optixPipelineSetStackSize(
- pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2));
+ pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
}
- if (kernel_features & KERNEL_FEATURE_MNEE) {
- /* Create MNEE pipeline. */
+ { /* Create intersection-only pipeline. */
vector<OptixProgramGroup> pipeline_groups;
pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
- pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
pipeline_groups.push_back(groups[PG_MISS]);
pipeline_groups.push_back(groups[PG_HITD]);
pipeline_groups.push_back(groups[PG_HITS]);
pipeline_groups.push_back(groups[PG_HITL]);
pipeline_groups.push_back(groups[PG_HITV]);
- if (motion_blur) {
+ if (pipeline_options.usesMotionBlur) {
pipeline_groups.push_back(groups[PG_HITD_MOTION]);
pipeline_groups.push_back(groups[PG_HITS_MOTION]);
}
@@ -757,8 +849,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
}
- pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
- pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
optix_assert(optixPipelineCreate(context,
&pipeline_options,
@@ -767,37 +857,234 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
pipeline_groups.size(),
nullptr,
0,
- &pipelines[PIP_SHADE_MNEE]));
+ &pipelines[PIP_INTERSECT]));
- /* Combine ray generation and trace continuation stack size. */
- const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG +
- link_options.maxTraceDepth * trace_css;
- const unsigned int dss = 0;
+ /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
+ const unsigned int css =
+ std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
+ std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
+ std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
+ stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
+ link_options.maxTraceDepth * trace_css;
- /* Set stack size depending on pipeline options. */
- optix_assert(
- optixPipelineSetStackSize(pipelines[PIP_SHADE_MNEE], 0, dss, css, motion_blur ? 3 : 2));
+ optix_assert(optixPipelineSetStackSize(
+ pipelines[PIP_INTERSECT], 0, 0, css, pipeline_options.usesMotionBlur ? 3 : 2));
}
- { /* Create intersection-only pipeline. */
+ return !have_error();
+}
+
+bool OptiXDevice::load_osl_kernels()
+{
+# ifdef WITH_OSL
+ if (have_error()) {
+ return false;
+ }
+
+ struct OSLKernel {
+ string ptx;
+ string init_entry;
+ string exec_entry;
+ };
+
+ /* This has to be in the same order as the ShaderType enum, so that the index calculation in
+ * osl_eval_nodes checks out */
+ vector<OSLKernel> osl_kernels;
+
+ for (ShaderType type = SHADER_TYPE_SURFACE; type <= SHADER_TYPE_BUMP;
+ type = static_cast<ShaderType>(type + 1)) {
+ const vector<OSL::ShaderGroupRef> &groups = (type == SHADER_TYPE_SURFACE ?
+ osl_globals.surface_state :
+ type == SHADER_TYPE_VOLUME ?
+ osl_globals.volume_state :
+ type == SHADER_TYPE_DISPLACEMENT ?
+ osl_globals.displacement_state :
+ osl_globals.bump_state);
+ for (const OSL::ShaderGroupRef &group : groups) {
+ if (group) {
+ string osl_ptx, init_name, entry_name;
+ osl_globals.ss->getattribute(group.get(), "group_init_name", init_name);
+ osl_globals.ss->getattribute(group.get(), "group_entry_name", entry_name);
+ osl_globals.ss->getattribute(
+ group.get(), "ptx_compiled_version", OSL::TypeDesc::PTR, &osl_ptx);
+
+ int groupdata_size = 0;
+ osl_globals.ss->getattribute(group.get(), "groupdata_size", groupdata_size);
+ if (groupdata_size > 2048) { /* See 'group_data' array in kernel/osl/osl.h */
+ set_error(
+ string_printf("Requested OSL group data size (%d) is greater than the maximum "
+ "supported with OptiX (2048)",
+ groupdata_size));
+ return false;
+ }
+
+ osl_kernels.push_back({std::move(osl_ptx), std::move(init_name), std::move(entry_name)});
+ }
+ else {
+ /* Add empty entry for non-existent shader groups, so that the index stays stable. */
+ osl_kernels.emplace_back();
+ }
+ }
+ }
+
+ const CUDAContextScope scope(this);
+
+ if (pipelines[PIP_SHADE]) {
+ optixPipelineDestroy(pipelines[PIP_SHADE]);
+ }
+
+ for (OptixModule &module : osl_modules) {
+ if (module != NULL) {
+ optixModuleDestroy(module);
+ module = NULL;
+ }
+ }
+ for (OptixProgramGroup &group : osl_groups) {
+ if (group != NULL) {
+ optixProgramGroupDestroy(group);
+ group = NULL;
+ }
+ }
+
+ OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
+ OptixModuleCompileOptions module_options = {};
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
+
+ osl_groups.resize(osl_kernels.size() * 2 + 1);
+ osl_modules.resize(osl_kernels.size() + 1);
+
+ { /* Load and compile PTX module with OSL services. */
+ string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx");
+ if (!path_read_text(ptx_filename, ptx_data)) {
+ set_error(string_printf("Failed to load OptiX OSL services kernel from '%s'",
+ ptx_filename.c_str()));
+ return false;
+ }
+
+ const OptixResult result = optixModuleCreateFromPTX(context,
+ &module_options,
+ &pipeline_options,
+ ptx_data.data(),
+ ptx_data.size(),
+ nullptr,
+ 0,
+ &osl_modules.back());
+ if (result != OPTIX_SUCCESS) {
+ set_error(string_printf("Failed to load OptiX OSL services kernel from '%s' (%s)",
+ ptx_filename.c_str(),
+ optixGetErrorName(result)));
+ return false;
+ }
+
+ OptixProgramGroupDesc group_desc = {};
+ group_desc.kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_desc.callables.entryFunctionNameDC = "__direct_callable__dummy_services";
+ group_desc.callables.moduleDC = osl_modules.back();
+
+ optix_assert(optixProgramGroupCreate(
+ context, &group_desc, 1, &group_options, nullptr, 0, &osl_groups.back()));
+ }
+
+ TaskPool pool;
+ vector<OptixResult> results(osl_kernels.size(), OPTIX_SUCCESS);
+
+ for (size_t i = 0; i < osl_kernels.size(); ++i) {
+ if (osl_kernels[i].ptx.empty()) {
+ continue;
+ }
+
+# if OPTIX_ABI_VERSION >= 55
+ OptixTask task = nullptr;
+ results[i] = optixModuleCreateFromPTXWithTasks(context,
+ &module_options,
+ &pipeline_options,
+ osl_kernels[i].ptx.data(),
+ osl_kernels[i].ptx.size(),
+ nullptr,
+ nullptr,
+ &osl_modules[i],
+ &task);
+ if (results[i] == OPTIX_SUCCESS) {
+ execute_optix_task(pool, task, results[i]);
+ }
+# else
+ pool.push([this, &results, i, &module_options, &osl_kernels]() {
+ results[i] = optixModuleCreateFromPTX(context,
+ &module_options,
+ &pipeline_options,
+ osl_kernels[i].ptx.data(),
+ osl_kernels[i].ptx.size(),
+ nullptr,
+ 0,
+ &osl_modules[i]);
+ });
+# endif
+ }
+
+ pool.wait_work();
+
+ for (size_t i = 0; i < osl_kernels.size(); ++i) {
+ if (osl_kernels[i].ptx.empty()) {
+ continue;
+ }
+
+ if (results[i] != OPTIX_SUCCESS) {
+ set_error(string_printf("Failed to load OptiX OSL kernel for %s (%s)",
+ osl_kernels[i].init_entry.c_str(),
+ optixGetErrorName(results[i])));
+ return false;
+ }
+
+ OptixProgramGroupDesc group_descs[2] = {};
+ group_descs[0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[0].callables.entryFunctionNameDC = osl_kernels[i].init_entry.c_str();
+ group_descs[0].callables.moduleDC = osl_modules[i];
+ group_descs[1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[1].callables.entryFunctionNameDC = osl_kernels[i].exec_entry.c_str();
+ group_descs[1].callables.moduleDC = osl_modules[i];
+
+ optix_assert(optixProgramGroupCreate(
+ context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
+ }
+
+ vector<OptixStackSizes> osl_stack_size(osl_groups.size());
+
+ /* Update SBT with new entries. */
+ sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
+ for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+ }
+ for (size_t i = 0; i < osl_groups.size(); ++i) {
+ if (osl_groups[i] != NULL) {
+ optix_assert(optixSbtRecordPackHeader(osl_groups[i], &sbt_data[NUM_PROGRAM_GROUPS + i]));
+ optix_assert(optixProgramGroupGetStackSize(osl_groups[i], &osl_stack_size[i]));
+ }
+ }
+ sbt_data.copy_to_device(); /* Upload updated SBT to device. */
+
+ OptixPipelineLinkOptions link_options = {};
+ link_options.maxTraceDepth = 0;
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
+
+ {
vector<OptixProgramGroup> pipeline_groups;
pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
- pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
- pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
- pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
- pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
- pipeline_groups.push_back(groups[PG_MISS]);
- pipeline_groups.push_back(groups[PG_HITD]);
- pipeline_groups.push_back(groups[PG_HITS]);
- pipeline_groups.push_back(groups[PG_HITL]);
- pipeline_groups.push_back(groups[PG_HITV]);
- if (motion_blur) {
- pipeline_groups.push_back(groups[PG_HITD_MOTION]);
- pipeline_groups.push_back(groups[PG_HITS_MOTION]);
- }
- if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
- pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
- pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_BACKGROUND]);
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_LIGHT]);
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE]);
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_VOLUME]);
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_SHADOW]);
+ pipeline_groups.push_back(groups[PG_RGEN_EVAL_DISPLACE]);
+ pipeline_groups.push_back(groups[PG_RGEN_EVAL_BACKGROUND]);
+ pipeline_groups.push_back(groups[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY]);
+
+ for (const OptixProgramGroup &group : osl_groups) {
+ if (group != NULL) {
+ pipeline_groups.push_back(group);
+ }
}
optix_assert(optixPipelineCreate(context,
@@ -807,26 +1094,30 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
pipeline_groups.size(),
nullptr,
0,
- &pipelines[PIP_INTERSECT]));
+ &pipelines[PIP_SHADE]));
- /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
- const unsigned int css =
- std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
- std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
- std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
- stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
- link_options.maxTraceDepth * trace_css;
+ unsigned int dss = 0;
+ for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
+ dss = std::max(dss, osl_stack_size[i].dssDC);
+ }
- optix_assert(
- optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2));
+ optix_assert(optixPipelineSetStackSize(
+ pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2));
}
- /* Clean up program group objects. */
- for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
- optixProgramGroupDestroy(groups[i]);
- }
+ return !have_error();
+# else
+ return false;
+# endif
+}
- return true;
+void *OptiXDevice::get_cpu_osl_memory()
+{
+# ifdef WITH_OSL
+ return &osl_globals;
+# else
+ return NULL;
+# endif
}
/* --------------------------------------------------------------------
@@ -1553,7 +1844,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
size_t num_motion_steps = 1;
Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
+ if (pipeline_options.usesMotionBlur && hair->get_use_motion_blur() && motion_keys) {
num_motion_steps = hair->get_motion_steps();
}
@@ -1707,7 +1998,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
size_t num_motion_steps = 1;
Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
+ if (pipeline_options.usesMotionBlur && mesh->get_use_motion_blur() && motion_keys) {
num_motion_steps = mesh->get_motion_steps();
}
@@ -1774,7 +2065,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
size_t num_motion_steps = 1;
Attribute *motion_points = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- if (motion_blur && pointcloud->get_use_motion_blur() && motion_points) {
+ if (pipeline_options.usesMotionBlur && pointcloud->get_use_motion_blur() && motion_points) {
num_motion_steps = pointcloud->get_motion_steps();
}
@@ -1871,7 +2162,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
/* Calculate total motion transform size and allocate memory for them. */
size_t motion_transform_offset = 0;
- if (motion_blur) {
+ if (pipeline_options.usesMotionBlur) {
size_t total_motion_transform_size = 0;
for (Object *const ob : bvh->objects) {
if (ob->is_traceable() && ob->use_motion()) {
@@ -1922,7 +2213,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
if (ob->get_geometry()->geometry_type == Geometry::HAIR &&
static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
- if (motion_blur && ob->get_geometry()->has_motion_blur()) {
+ if (pipeline_options.usesMotionBlur && ob->get_geometry()->has_motion_blur()) {
/* Select between motion blur and non-motion blur built-in intersection module. */
instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
}
@@ -1950,7 +2241,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
}
/* Insert motion traversable if object has motion. */
- if (motion_blur && ob->use_motion()) {
+ if (pipeline_options.usesMotionBlur && ob->use_motion()) {
size_t motion_keys = max(ob->get_motion().size(), (size_t)2) - 2;
size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
motion_keys * sizeof(OptixSRTData);
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
index 817afdc8384..ad0e7b93454 100644
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -9,6 +9,7 @@
# include "device/cuda/device_impl.h"
# include "device/optix/queue.h"
# include "device/optix/util.h"
+# include "kernel/osl/globals.h"
# include "kernel/types.h"
# include "util/unique_ptr.h"
@@ -23,8 +24,16 @@ enum {
PG_RGEN_INTERSECT_SHADOW,
PG_RGEN_INTERSECT_SUBSURFACE,
PG_RGEN_INTERSECT_VOLUME_STACK,
+ PG_RGEN_SHADE_BACKGROUND,
+ PG_RGEN_SHADE_LIGHT,
+ PG_RGEN_SHADE_SURFACE,
PG_RGEN_SHADE_SURFACE_RAYTRACE,
PG_RGEN_SHADE_SURFACE_MNEE,
+ PG_RGEN_SHADE_VOLUME,
+ PG_RGEN_SHADE_SHADOW,
+ PG_RGEN_EVAL_DISPLACE,
+ PG_RGEN_EVAL_BACKGROUND,
+ PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY,
PG_MISS,
PG_HITD, /* Default hit group. */
PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
@@ -40,14 +49,14 @@ enum {
};
static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
-static const int NUM_MIS_PROGRAM_GROUPS = 1;
+static const int NUM_MISS_PROGRAM_GROUPS = 1;
static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD;
static const int NUM_HIT_PROGRAM_GROUPS = 8;
static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
static const int NUM_CALLABLE_PROGRAM_GROUPS = 2;
/* List of OptiX pipelines. */
-enum { PIP_SHADE_RAYTRACE, PIP_SHADE_MNEE, PIP_INTERSECT, NUM_PIPELINES };
+enum { PIP_SHADE, PIP_INTERSECT, NUM_PIPELINES };
/* A single shader binding table entry. */
struct SbtRecord {
@@ -61,12 +70,20 @@ class OptiXDevice : public CUDADevice {
OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */
OptixModule builtin_modules[2] = {};
OptixPipeline pipelines[NUM_PIPELINES] = {};
+ OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
+ OptixPipelineCompileOptions pipeline_options = {};
- bool motion_blur = false;
device_vector<SbtRecord> sbt_data;
device_only_memory<KernelParamsOptiX> launch_params;
- OptixTraversableHandle tlas_handle = 0;
+# ifdef WITH_OSL
+ OSLGlobals osl_globals;
+ vector<OptixModule> osl_modules;
+ vector<OptixProgramGroup> osl_groups;
+# endif
+
+ private:
+ OptixTraversableHandle tlas_handle = 0;
vector<unique_ptr<device_only_memory<char>>> delayed_free_bvh_memory;
thread_mutex delayed_free_bvh_mutex;
@@ -100,13 +117,14 @@ class OptiXDevice : public CUDADevice {
OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
~OptiXDevice();
- private:
BVHLayoutMask get_bvh_layout_mask() const override;
- string compile_kernel_get_common_cflags(const uint kernel_features) override;
+ string compile_kernel_get_common_cflags(const uint kernel_features);
bool load_kernels(const uint kernel_features) override;
+ bool load_osl_kernels() override;
+
bool build_optix_bvh(BVHOptiX *bvh,
OptixBuildOperation operation,
const OptixBuildInput &build_input,
@@ -123,6 +141,8 @@ class OptiXDevice : public CUDADevice {
virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+ void *get_cpu_osl_memory() override;
+
/* --------------------------------------------------------------------
* Denoising.
*/
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index 3bc547ed11d..1bfd154d449 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -24,21 +24,33 @@ void OptiXDeviceQueue::init_execution()
CUDADeviceQueue::init_execution();
}
-static bool is_optix_specific_kernel(DeviceKernel kernel)
+static bool is_optix_specific_kernel(DeviceKernel kernel, bool use_osl)
{
- return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
- kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
- kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
- kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
- kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
- kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+# ifdef WITH_OSL
+ /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
+ if (use_osl && device_kernel_has_shading(kernel)) {
+ return true;
+ }
+# else
+ (void)use_osl;
+# endif
+
+ return device_kernel_has_intersection(kernel);
}
bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
const int work_size,
DeviceKernelArguments const &args)
{
- if (!is_optix_specific_kernel(kernel)) {
+ OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
+
+# ifdef WITH_OSL
+ const bool use_osl = static_cast<OSLGlobals *>(optix_device->get_cpu_osl_memory())->use;
+# else
+ const bool use_osl = false;
+# endif
+
+ if (!is_optix_specific_kernel(kernel, use_osl)) {
return CUDADeviceQueue::enqueue(kernel, work_size, args);
}
@@ -50,8 +62,6 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
const CUDAContextScope scope(cuda_device_);
- OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
-
const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer;
const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer;
@@ -62,9 +72,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
sizeof(device_ptr),
cuda_stream_));
- if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
- kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
- kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || device_kernel_has_shading(kernel)) {
cuda_device_assert(
cuda_device_,
cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
@@ -72,6 +80,15 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
sizeof(device_ptr),
cuda_stream_));
}
+ if (kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE ||
+ kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND ||
+ kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY) {
+ cuda_device_assert(cuda_device_,
+ cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, offset),
+ args.values[2], // &d_offset
+ sizeof(int32_t),
+ cuda_stream_));
+ }
cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
@@ -79,14 +96,35 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
OptixShaderBindingTable sbt_params = {};
switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ pipeline = optix_device->pipelines[PIP_SHADE];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_BACKGROUND * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ pipeline = optix_device->pipelines[PIP_SHADE];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_LIGHT * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ pipeline = optix_device->pipelines[PIP_SHADE];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE * sizeof(SbtRecord);
+ break;
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
- pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE];
+ pipeline = optix_device->pipelines[PIP_SHADE];
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord);
break;
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
- pipeline = optix_device->pipelines[PIP_SHADE_MNEE];
+ pipeline = optix_device->pipelines[PIP_SHADE];
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_MNEE * sizeof(SbtRecord);
break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+ pipeline = optix_device->pipelines[PIP_SHADE];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_VOLUME * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ pipeline = optix_device->pipelines[PIP_SHADE];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SHADOW * sizeof(SbtRecord);
+ break;
+
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
pipeline = optix_device->pipelines[PIP_INTERSECT];
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
@@ -104,6 +142,20 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord);
break;
+ case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
+ pipeline = optix_device->pipelines[PIP_SHADE];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_DISPLACE * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
+ pipeline = optix_device->pipelines[PIP_SHADE];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_BACKGROUND * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
+ pipeline = optix_device->pipelines[PIP_SHADE];
+ sbt_params.raygenRecord = sbt_data_ptr +
+ PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY * sizeof(SbtRecord);
+ break;
+
default:
LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel)
<< " is attempted to be enqueued.";
@@ -112,7 +164,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord);
sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
- sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS;
+ sbt_params.missRecordCount = NUM_MISS_PROGRAM_GROUPS;
sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord);
sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS;
@@ -120,6 +172,12 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS;
sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
+# ifdef WITH_OSL
+ if (use_osl) {
+ sbt_params.callablesRecordCount += static_cast<unsigned int>(optix_device->osl_groups.size());
+ }
+# endif
+
/* Launch the ray generation program. */
optix_device_assert(optix_device,
optixLaunch(pipeline,
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 3779fdc697a..99f9e536977 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -37,6 +37,14 @@ set(SRC_KERNEL_DEVICE_OPTIX
device/optix/kernel_shader_raytrace.cu
)
+if(WITH_CYCLES_OSL AND (OSL_LIBRARY_VERSION_MINOR GREATER_EQUAL 13 OR OSL_LIBRARY_VERSION_MAJOR GREATER 1))
+ set(SRC_KERNEL_DEVICE_OPTIX
+ ${SRC_KERNEL_DEVICE_OPTIX}
+ osl/services_optix.cu
+ device/optix/kernel_osl.cu
+ )
+endif()
+
set(SRC_KERNEL_DEVICE_ONEAPI
device/oneapi/kernel.cpp
)
@@ -181,6 +189,16 @@ set(SRC_KERNEL_SVM_HEADERS
svm/vertex_color.h
)
+if(WITH_CYCLES_OSL)
+ set(SRC_KERNEL_OSL_HEADERS
+ osl/osl.h
+ osl/closures_setup.h
+ osl/closures_template.h
+ osl/services_gpu.h
+ osl/types.h
+ )
+endif()
+
set(SRC_KERNEL_GEOM_HEADERS
geom/geom.h
geom/attribute.h
@@ -306,6 +324,7 @@ set(SRC_KERNEL_HEADERS
${SRC_KERNEL_GEOM_HEADERS}
${SRC_KERNEL_INTEGRATOR_HEADERS}
${SRC_KERNEL_LIGHT_HEADERS}
+ ${SRC_KERNEL_OSL_HEADERS}
${SRC_KERNEL_SAMPLE_HEADERS}
${SRC_KERNEL_SVM_HEADERS}
${SRC_KERNEL_TYPES_HEADERS}
@@ -328,6 +347,7 @@ set(SRC_UTIL_HEADERS
../util/math_int2.h
../util/math_int3.h
../util/math_int4.h
+ ../util/math_int8.h
../util/math_matrix.h
../util/projection.h
../util/rect.h
@@ -350,6 +370,8 @@ set(SRC_UTIL_HEADERS
../util/types_int3_impl.h
../util/types_int4.h
../util/types_int4_impl.h
+ ../util/types_int8.h
+ ../util/types_int8_impl.h
../util/types_spectrum.h
../util/types_uchar2.h
../util/types_uchar2_impl.h
@@ -705,6 +727,16 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
kernel_optix_shader_raytrace
"device/optix/kernel_shader_raytrace.cu"
"--keep-device-functions")
+ if(WITH_CYCLES_OSL AND (OSL_LIBRARY_VERSION_MINOR GREATER_EQUAL 13 OR OSL_LIBRARY_VERSION_MAJOR GREATER 1))
+ CYCLES_OPTIX_KERNEL_ADD(
+ kernel_optix_osl
+ "device/optix/kernel_osl.cu"
+ "--relocatable-device-code=true")
+ CYCLES_OPTIX_KERNEL_ADD(
+ kernel_optix_osl_services
+ "osl/services_optix.cu"
+ "--relocatable-device-code=true")
+ endif()
add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx})
cycles_set_solution_folder(cycles_kernel_optix)
@@ -992,6 +1024,7 @@ source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
source_group("kernel" FILES ${SRC_KERNEL_TYPES_HEADERS})
source_group("light" FILES ${SRC_KERNEL_LIGHT_HEADERS})
+source_group("osl" FILES ${SRC_KERNEL_OSL_HEADERS})
source_group("sample" FILES ${SRC_KERNEL_SAMPLE_HEADERS})
source_group("svm" FILES ${SRC_KERNEL_SVM_HEADERS})
source_group("util" FILES ${SRC_KERNEL_UTIL_HEADERS})
@@ -1028,6 +1061,7 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLE
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_LIGHT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/light)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_OSL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/osl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_SAMPLE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/sample)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_TYPES_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 71af68aa80e..2f5c5d7bd0c 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -297,8 +297,10 @@ ccl_device_inline void bsdf_roughness_eta(const KernelGlobals kg,
ccl_private float2 *roughness,
ccl_private float *eta)
{
+#ifdef __SVM__
bool refractive = false;
float alpha = 1.0f;
+#endif
switch (sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:
*roughness = one_float2();
diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
index 01087c96dd6..558431961ab 100644
--- a/intern/cycles/kernel/device/cpu/kernel.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -7,6 +7,7 @@
* one with SSE2 intrinsics.
*/
#if defined(__x86_64__) || defined(_M_X64)
+# define __KERNEL_SSE__
# define __KERNEL_SSE2__
#endif
@@ -29,11 +30,15 @@
# define __KERNEL_SSE41__
# endif
# ifdef __AVX__
-# define __KERNEL_SSE__
+# ifndef __KERNEL_SSE__
+# define __KERNEL_SSE__
+# endif
# define __KERNEL_AVX__
# endif
# ifdef __AVX2__
-# define __KERNEL_SSE__
+# ifndef __KERNEL_SSE__
+# define __KERNEL_SSE__
+# endif
# define __KERNEL_AVX2__
# endif
#endif
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
index 51e1381d552..3a950779c11 100644
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -30,6 +30,7 @@ typedef unsigned long long uint64_t;
/* Qualifiers */
#define ccl_device __device__ __inline__
+#define ccl_device_extern extern "C" __device__
#if __CUDA_ARCH__ < 500
# define ccl_device_inline __device__ __forceinline__
# define ccl_device_forceinline __device__ __forceinline__
@@ -109,14 +110,14 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D
typedef unsigned short half;
-__device__ half __float2half(const float f)
+ccl_device_forceinline half __float2half(const float f)
{
half val;
asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
return val;
}
-__device__ float __half2float(const half h)
+ccl_device_forceinline float __half2float(const half h)
{
float val;
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
index 648988c31b6..8755395c82c 100644
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -28,6 +28,7 @@ typedef unsigned long long uint64_t;
/* Qualifiers */
#define ccl_device __device__ __inline__
+#define ccl_device_extern extern "C" __device__
#define ccl_device_inline __device__ __inline__
#define ccl_device_forceinline __device__ __forceinline__
#define ccl_device_noinline __device__ __noinline__
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index f689e93e5a2..2dd6cc98b59 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -38,6 +38,7 @@ using namespace metal::raytracing;
# define ccl_device_noinline ccl_device __attribute__((noinline))
#endif
+#define ccl_device_extern extern "C"
#define ccl_device_noinline_cpu ccl_device
#define ccl_device_inline_method ccl_device
#define ccl_global device
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h
index dfaec65130c..b83512180d7 100644
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -28,6 +28,7 @@
/* Qualifier wrappers for different names on different devices */
#define ccl_device
+#define ccl_device_extern extern "C"
#define ccl_global
#define ccl_always_inline __attribute__((always_inline))
#define ccl_device_inline inline
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
index 1a11a533b7e..e13101f57b8 100644
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -33,14 +33,16 @@ typedef unsigned long long uint64_t;
#endif
#define ccl_device \
- __device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything
+ static __device__ \
+ __forceinline__ // Function calls are bad for OptiX performance, so inline everything
+#define ccl_device_extern extern "C" __device__
#define ccl_device_inline ccl_device
#define ccl_device_forceinline ccl_device
-#define ccl_device_inline_method ccl_device
-#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_inline_method __device__ __forceinline__
+#define ccl_device_noinline static __device__ __noinline__
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
-#define ccl_inline_constant __constant__
+#define ccl_inline_constant static __constant__
#define ccl_device_constant __constant__ __device__
#define ccl_constant const
#define ccl_gpu_shared __shared__
@@ -57,23 +59,6 @@ typedef unsigned long long uint64_t;
#define kernel_assert(cond)
-/* GPU thread, block, grid size and index */
-
-#define ccl_gpu_thread_idx_x (threadIdx.x)
-#define ccl_gpu_block_dim_x (blockDim.x)
-#define ccl_gpu_block_idx_x (blockIdx.x)
-#define ccl_gpu_grid_dim_x (gridDim.x)
-#define ccl_gpu_warp_size (warpSize)
-#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
-#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
-
-/* GPU warp synchronization. */
-
-#define ccl_gpu_syncthreads() __syncthreads()
-#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
-
/* GPU texture objects */
typedef unsigned long long CUtexObject;
@@ -101,14 +86,14 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D
typedef unsigned short half;
-__device__ half __float2half(const float f)
+ccl_device_forceinline half __float2half(const float f)
{
half val;
asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
return val;
}
-__device__ float __half2float(const half h)
+ccl_device_forceinline float __half2float(const half h)
{
float val;
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h
index 7af2e421378..126df74bc8c 100644
--- a/intern/cycles/kernel/device/optix/globals.h
+++ b/intern/cycles/kernel/device/optix/globals.h
@@ -25,6 +25,7 @@ struct KernelParamsOptiX {
/* Kernel arguments */
const int *path_index_array;
float *render_buffer;
+ int offset;
/* Global scene data and textures */
KernelData data;
@@ -36,7 +37,11 @@ struct KernelParamsOptiX {
};
#ifdef __NVCC__
-extern "C" static __constant__ KernelParamsOptiX kernel_params;
+extern "C"
+# ifndef __CUDACC_RDC__
+ static
+# endif
+ __constant__ KernelParamsOptiX kernel_params;
#endif
/* Abstraction macros */
diff --git a/intern/cycles/kernel/device/optix/kernel_osl.cu b/intern/cycles/kernel/device/optix/kernel_osl.cu
new file mode 100644
index 00000000000..0f3f477935b
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/kernel_osl.cu
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#define WITH_OSL
+
+/* Copy of the regular OptiX kernels with additional OSL support. */
+
+#include "kernel/device/optix/kernel_shader_raytrace.cu"
+
+#include "kernel/bake/bake.h"
+#include "kernel/integrator/shade_background.h"
+#include "kernel/integrator/shade_light.h"
+#include "kernel/integrator/shade_shadow.h"
+#include "kernel/integrator/shade_volume.h"
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_background()
+{
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (kernel_params.path_index_array) ?
+ kernel_params.path_index_array[global_index] :
+ global_index;
+ integrator_shade_background(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_light()
+{
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (kernel_params.path_index_array) ?
+ kernel_params.path_index_array[global_index] :
+ global_index;
+ integrator_shade_light(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface()
+{
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (kernel_params.path_index_array) ?
+ kernel_params.path_index_array[global_index] :
+ global_index;
+ integrator_shade_surface(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_volume()
+{
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (kernel_params.path_index_array) ?
+ kernel_params.path_index_array[global_index] :
+ global_index;
+ integrator_shade_volume(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_shadow()
+{
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (kernel_params.path_index_array) ?
+ kernel_params.path_index_array[global_index] :
+ global_index;
+ integrator_shade_shadow(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_displace()
+{
+ KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+ float *const output = kernel_params.render_buffer;
+ const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+ kernel_displace_evaluate(nullptr, input, output, global_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_background()
+{
+ KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+ float *const output = kernel_params.render_buffer;
+ const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+ kernel_background_evaluate(nullptr, input, output, global_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_curve_shadow_transparency()
+{
+ KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+ float *const output = kernel_params.render_buffer;
+ const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+ kernel_curve_shadow_transparency_evaluate(nullptr, input, output, global_index);
+}
diff --git a/intern/cycles/kernel/integrator/displacement_shader.h b/intern/cycles/kernel/integrator/displacement_shader.h
index 839dfe244ac..a6e9d674396 100644
--- a/intern/cycles/kernel/integrator/displacement_shader.h
+++ b/intern/cycles/kernel/integrator/displacement_shader.h
@@ -24,8 +24,8 @@ ccl_device void displacement_shader_eval(KernelGlobals kg,
/* this will modify sd->P */
#ifdef __OSL__
- if (kg->osl) {
- OSLShader::eval_displacement(kg, state, sd);
+ if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+ osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(kg, state, sd, 0);
}
else
#endif
diff --git a/intern/cycles/kernel/integrator/init_from_bake.h b/intern/cycles/kernel/integrator/init_from_bake.h
index 667ba949760..cc3fbe3fe39 100644
--- a/intern/cycles/kernel/integrator/init_from_bake.h
+++ b/intern/cycles/kernel/integrator/init_from_bake.h
@@ -156,6 +156,13 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
u = v;
v = 1.0f - tmp - v;
+ const float tmpdx = dudx;
+ const float tmpdy = dudy;
+ dudx = dvdx;
+ dudy = dvdy;
+ dvdx = -tmpdx - dvdx;
+ dvdy = -tmpdy - dvdy;
+
/* Position and normal on triangle. */
const int object = kernel_data.bake.object_index;
float3 P, Ng;
diff --git a/intern/cycles/kernel/integrator/surface_shader.h b/intern/cycles/kernel/integrator/surface_shader.h
index 6c0097b11bd..5e47a34f77e 100644
--- a/intern/cycles/kernel/integrator/surface_shader.h
+++ b/intern/cycles/kernel/integrator/surface_shader.h
@@ -827,13 +827,8 @@ ccl_device void surface_shader_eval(KernelGlobals kg,
sd->num_closure_left = max_closures;
#ifdef __OSL__
- if (kg->osl) {
- if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
- OSLShader::eval_background(kg, state, sd, path_flag);
- }
- else {
- OSLShader::eval_surface(kg, state, sd, path_flag);
- }
+ if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+ osl_eval_nodes<SHADER_TYPE_SURFACE>(kg, state, sd, path_flag);
}
else
#endif
diff --git a/intern/cycles/kernel/integrator/volume_shader.h b/intern/cycles/kernel/integrator/volume_shader.h
index 0ff968723a1..f9050647c6d 100644
--- a/intern/cycles/kernel/integrator/volume_shader.h
+++ b/intern/cycles/kernel/integrator/volume_shader.h
@@ -493,8 +493,8 @@ ccl_device_inline void volume_shader_eval(KernelGlobals kg,
/* evaluate shader */
# ifdef __OSL__
- if (kg->osl) {
- OSLShader::eval_volume(kg, state, sd, path_flag);
+ if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+ osl_eval_nodes<SHADER_TYPE_VOLUME>(kg, state, sd, path_flag);
}
else
# endif
diff --git a/intern/cycles/kernel/osl/closures.cpp b/intern/cycles/kernel/osl/closures.cpp
index d56e0551a91..6800c765345 100644
--- a/intern/cycles/kernel/osl/closures.cpp
+++ b/intern/cycles/kernel/osl/closures.cpp
@@ -25,13 +25,18 @@
#include "kernel/osl/osl.h"
-#include "kernel/osl/closures_setup.h"
-
#define TO_VEC3(v) OSL::Vec3(v.x, v.y, v.z)
#define TO_FLOAT3(v) make_float3(v[0], v[1], v[2])
CCL_NAMESPACE_BEGIN
+static_assert(sizeof(OSLClosure) == sizeof(OSL::ClosureColor) &&
+ sizeof(OSLClosureAdd) == sizeof(OSL::ClosureAdd) &&
+ sizeof(OSLClosureMul) == sizeof(OSL::ClosureMul) &&
+ sizeof(OSLClosureComponent) == sizeof(OSL::ClosureComponent));
+static_assert(sizeof(ShaderGlobals) == sizeof(OSL::ShaderGlobals) &&
+ offsetof(ShaderGlobals, Ci) == offsetof(OSL::ShaderGlobals, Ci));
+
/* Registration */
#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
@@ -60,53 +65,18 @@ void OSLRenderServices::register_closures(OSL::ShadingSystem *ss)
#include "closures_template.h"
}
-/* Globals */
+/* Surface & Background */
-static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
- ShaderData *sd,
- const void *state,
- uint32_t path_flag,
- OSLThreadData *tdata)
+template<>
+void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
+ const void *state,
+ ShaderData *sd,
+ uint32_t path_flag)
{
- OSL::ShaderGlobals *globals = &tdata->globals;
-
- const differential3 dP = differential_from_compact(sd->Ng, sd->dP);
- const differential3 dI = differential_from_compact(sd->I, sd->dI);
-
- /* copy from shader data to shader globals */
- globals->P = TO_VEC3(sd->P);
- globals->dPdx = TO_VEC3(dP.dx);
- globals->dPdy = TO_VEC3(dP.dy);
- globals->I = TO_VEC3(sd->I);
- globals->dIdx = TO_VEC3(dI.dx);
- globals->dIdy = TO_VEC3(dI.dy);
- globals->N = TO_VEC3(sd->N);
- globals->Ng = TO_VEC3(sd->Ng);
- globals->u = sd->u;
- globals->dudx = sd->du.dx;
- globals->dudy = sd->du.dy;
- globals->v = sd->v;
- globals->dvdx = sd->dv.dx;
- globals->dvdy = sd->dv.dy;
- globals->dPdu = TO_VEC3(sd->dPdu);
- globals->dPdv = TO_VEC3(sd->dPdv);
- globals->surfacearea = 1.0f;
- globals->time = sd->time;
-
- /* booleans */
- globals->raytype = path_flag;
- globals->flipHandedness = 0;
- globals->backfacing = (sd->flag & SD_BACKFACING);
-
- /* shader data to be used in services callbacks */
- globals->renderstate = sd;
-
- /* hacky, we leave it to services to fetch actual object matrix */
- globals->shader2common = sd;
- globals->object2common = sd;
-
- /* must be set to NULL before execute */
- globals->Ci = NULL;
+ /* setup shader globals from shader data */
+ OSLThreadData *tdata = kg->osl_tdata;
+ shaderdata_to_shaderglobals(
+ kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
/* clear trace data */
tdata->tracedata.init = false;
@@ -121,53 +91,6 @@ static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
sd->osl_path_state = (const IntegratorStateCPU *)state;
sd->osl_shadow_path_state = nullptr;
}
-}
-
-static void flatten_closure_tree(const KernelGlobalsCPU *kg,
- ShaderData *sd,
- uint32_t path_flag,
- const OSL::ClosureColor *closure,
- float3 weight = make_float3(1.0f, 1.0f, 1.0f))
-{
- /* OSL gives us a closure tree, we flatten it into arrays per
- * closure type, for evaluation, sampling, etc later on. */
-
- switch (closure->id) {
- case OSL::ClosureColor::MUL: {
- OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
- flatten_closure_tree(kg, sd, path_flag, mul->closure, TO_FLOAT3(mul->weight) * weight);
- break;
- }
- case OSL::ClosureColor::ADD: {
- OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
- flatten_closure_tree(kg, sd, path_flag, add->closureA, weight);
- flatten_closure_tree(kg, sd, path_flag, add->closureB, weight);
- break;
- }
-#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
- case OSL_CLOSURE_##Upper##_ID: { \
- const OSL::ClosureComponent *comp = reinterpret_cast<const OSL::ClosureComponent *>(closure); \
- weight *= TO_FLOAT3(comp->w); \
- osl_closure_##lower##_setup( \
- kg, sd, path_flag, weight, reinterpret_cast<const Upper##Closure *>(comp + 1)); \
- break; \
- }
-#include "closures_template.h"
- default:
- break;
- }
-}
-
-/* Surface */
-
-void OSLShader::eval_surface(const KernelGlobalsCPU *kg,
- const void *state,
- ShaderData *sd,
- uint32_t path_flag)
-{
- /* setup shader globals from shader data */
- OSLThreadData *tdata = kg->osl_tdata;
- shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
/* execute shader for this point */
OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
@@ -175,101 +98,99 @@ void OSLShader::eval_surface(const KernelGlobalsCPU *kg,
OSL::ShadingContext *octx = tdata->context;
int shader = sd->shader & SHADER_MASK;
- /* automatic bump shader */
- if (kg->osl->bump_state[shader]) {
- /* save state */
- const float3 P = sd->P;
- const float dP = sd->dP;
- const OSL::Vec3 dPdx = globals->dPdx;
- const OSL::Vec3 dPdy = globals->dPdy;
-
- /* set state as if undisplaced */
- if (sd->flag & SD_HAS_DISPLACEMENT) {
- float data[9];
- bool found = kg->osl->services->get_attribute(sd,
- true,
- OSLRenderServices::u_empty,
- TypeDesc::TypeVector,
- OSLRenderServices::u_geom_undisplaced,
- data);
- (void)found;
- assert(found);
-
- differential3 tmp_dP;
- memcpy(&sd->P, data, sizeof(float) * 3);
- memcpy(&tmp_dP.dx, data + 3, sizeof(float) * 3);
- memcpy(&tmp_dP.dy, data + 6, sizeof(float) * 3);
-
- object_position_transform(kg, sd, &sd->P);
- object_dir_transform(kg, sd, &tmp_dP.dx);
- object_dir_transform(kg, sd, &tmp_dP.dy);
-
- sd->dP = differential_make_compact(tmp_dP);
-
- globals->P = TO_VEC3(sd->P);
- globals->dPdx = TO_VEC3(tmp_dP.dx);
- globals->dPdy = TO_VEC3(tmp_dP.dy);
+ if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
+ /* background */
+ if (kg->osl->background_state) {
+ ss->execute(octx, *(kg->osl->background_state), *globals);
}
-
- /* execute bump shader */
- ss->execute(octx, *(kg->osl->bump_state[shader]), *globals);
-
- /* reset state */
- sd->P = P;
- sd->dP = dP;
-
- globals->P = TO_VEC3(P);
- globals->dPdx = TO_VEC3(dPdx);
- globals->dPdy = TO_VEC3(dPdy);
}
+ else {
+ /* automatic bump shader */
+ if (kg->osl->bump_state[shader]) {
+ /* save state */
+ const float3 P = sd->P;
+ const float dP = sd->dP;
+ const OSL::Vec3 dPdx = globals->dPdx;
+ const OSL::Vec3 dPdy = globals->dPdy;
+
+ /* set state as if undisplaced */
+ if (sd->flag & SD_HAS_DISPLACEMENT) {
+ float data[9];
+ bool found = kg->osl->services->get_attribute(sd,
+ true,
+ OSLRenderServices::u_empty,
+ TypeDesc::TypeVector,
+ OSLRenderServices::u_geom_undisplaced,
+ data);
+ (void)found;
+ assert(found);
+
+ differential3 tmp_dP;
+ memcpy(&sd->P, data, sizeof(float) * 3);
+ memcpy(&tmp_dP.dx, data + 3, sizeof(float) * 3);
+ memcpy(&tmp_dP.dy, data + 6, sizeof(float) * 3);
+
+ object_position_transform(kg, sd, &sd->P);
+ object_dir_transform(kg, sd, &tmp_dP.dx);
+ object_dir_transform(kg, sd, &tmp_dP.dy);
+
+ sd->dP = differential_make_compact(tmp_dP);
+
+ globals->P = TO_VEC3(sd->P);
+ globals->dPdx = TO_VEC3(tmp_dP.dx);
+ globals->dPdy = TO_VEC3(tmp_dP.dy);
+ }
+
+ /* execute bump shader */
+ ss->execute(octx, *(kg->osl->bump_state[shader]), *globals);
+
+ /* reset state */
+ sd->P = P;
+ sd->dP = dP;
+
+ globals->P = TO_VEC3(P);
+ globals->dPdx = TO_VEC3(dPdx);
+ globals->dPdy = TO_VEC3(dPdy);
+ }
- /* surface shader */
- if (kg->osl->surface_state[shader]) {
- ss->execute(octx, *(kg->osl->surface_state[shader]), *globals);
+ /* surface shader */
+ if (kg->osl->surface_state[shader]) {
+ ss->execute(octx, *(kg->osl->surface_state[shader]), *globals);
+ }
}
/* flatten closure tree */
if (globals->Ci) {
- flatten_closure_tree(kg, sd, path_flag, globals->Ci);
+ flatten_closure_tree(kg, sd, path_flag, reinterpret_cast<OSLClosure *>(globals->Ci));
}
}
-/* Background */
+/* Volume */
-void OSLShader::eval_background(const KernelGlobalsCPU *kg,
- const void *state,
- ShaderData *sd,
- uint32_t path_flag)
+template<>
+void osl_eval_nodes<SHADER_TYPE_VOLUME>(const KernelGlobalsCPU *kg,
+ const void *state,
+ ShaderData *sd,
+ uint32_t path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
- shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
+ shaderdata_to_shaderglobals(
+ kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
- /* execute shader for this point */
- OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
- OSL::ShaderGlobals *globals = &tdata->globals;
- OSL::ShadingContext *octx = tdata->context;
+ /* clear trace data */
+ tdata->tracedata.init = false;
- if (kg->osl->background_state) {
- ss->execute(octx, *(kg->osl->background_state), *globals);
+ /* Used by render-services. */
+ sd->osl_globals = kg;
+ if (path_flag & PATH_RAY_SHADOW) {
+ sd->osl_path_state = nullptr;
+ sd->osl_shadow_path_state = (const IntegratorShadowStateCPU *)state;
}
-
- /* return background color immediately */
- if (globals->Ci) {
- flatten_closure_tree(kg, sd, path_flag, globals->Ci);
+ else {
+ sd->osl_path_state = (const IntegratorStateCPU *)state;
+ sd->osl_shadow_path_state = nullptr;
}
-}
-
-/* Volume */
-
-void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
- const void *state,
- ShaderData *sd,
- uint32_t path_flag)
-{
- /* setup shader globals from shader data */
- OSLThreadData *tdata = kg->osl_tdata;
- shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
/* execute shader */
OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
@@ -283,17 +204,30 @@ void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
/* flatten closure tree */
if (globals->Ci) {
- flatten_closure_tree(kg, sd, path_flag, globals->Ci);
+ flatten_closure_tree(kg, sd, path_flag, reinterpret_cast<OSLClosure *>(globals->Ci));
}
}
/* Displacement */
-void OSLShader::eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd)
+template<>
+void osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(const KernelGlobalsCPU *kg,
+ const void *state,
+ ShaderData *sd,
+ uint32_t path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
- shaderdata_to_shaderglobals(kg, sd, state, 0, tdata);
+ shaderdata_to_shaderglobals(
+ kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
+
+ /* clear trace data */
+ tdata->tracedata.init = false;
+
+ /* Used by render-services. */
+ sd->osl_globals = kg;
+ sd->osl_path_state = (const IntegratorStateCPU *)state;
+ sd->osl_shadow_path_state = nullptr;
/* execute shader */
OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
diff --git a/intern/cycles/kernel/osl/closures_setup.h b/intern/cycles/kernel/osl/closures_setup.h
index 96c551b9951..ceaf56ccba6 100644
--- a/intern/cycles/kernel/osl/closures_setup.h
+++ b/intern/cycles/kernel/osl/closures_setup.h
@@ -40,12 +40,7 @@ CCL_NAMESPACE_BEGIN
const char *label;
#define OSL_CLOSURE_STRUCT_END(Upper, lower) \
} \
- ; \
- ccl_device void osl_closure_##lower##_setup(KernelGlobals kg, \
- ccl_private ShaderData *sd, \
- uint32_t path_flag, \
- float3 weight, \
- ccl_private Upper##Closure *closure);
+ ;
#define OSL_CLOSURE_STRUCT_MEMBER(Upper, TYPE, type, name, key) type name;
#define OSL_CLOSURE_STRUCT_ARRAY_MEMBER(Upper, TYPE, type, name, key, size) type name[size];
@@ -210,11 +205,9 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
bsdf->ior = closure->ior;
bsdf->T = closure->T;
- static OSL::ustring u_ggx("ggx");
- static OSL::ustring u_default("default");
-
/* GGX */
- if (closure->distribution == u_ggx || closure->distribution == u_default) {
+ if (closure->distribution == make_string("ggx", 11253504724482777663ull) ||
+ closure->distribution == make_string("default", 4430693559278735917ull)) {
if (!closure->refract) {
if (closure->alpha_x == closure->alpha_y) {
/* Isotropic */
@@ -1000,18 +993,14 @@ ccl_device void osl_closure_bssrdf_setup(KernelGlobals kg,
float3 weight,
ccl_private const BSSRDFClosure *closure)
{
- static ustring u_burley("burley");
- static ustring u_random_walk_fixed_radius("random_walk_fixed_radius");
- static ustring u_random_walk("random_walk");
-
ClosureType type;
- if (closure->method == u_burley) {
+ if (closure->method == make_string("burley", 186330084368958868ull)) {
type = CLOSURE_BSSRDF_BURLEY_ID;
}
- else if (closure->method == u_random_walk_fixed_radius) {
+ else if (closure->method == make_string("random_walk_fixed_radius", 5695810351010063150ull)) {
type = CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID;
}
- else if (closure->method == u_random_walk) {
+ else if (closure->method == make_string("random_walk", 11360609267673527222ull)) {
type = CLOSURE_BSSRDF_RANDOM_WALK_ID;
}
else {
diff --git a/intern/cycles/kernel/osl/closures_template.h b/intern/cycles/kernel/osl/closures_template.h
index c808b275966..b9e9b52dcf8 100644
--- a/intern/cycles/kernel/osl/closures_template.h
+++ b/intern/cycles/kernel/osl/closures_template.h
@@ -40,7 +40,7 @@ OSL_CLOSURE_STRUCT_BEGIN(Transparent, transparent)
OSL_CLOSURE_STRUCT_END(Transparent, transparent)
OSL_CLOSURE_STRUCT_BEGIN(Microfacet, microfacet)
- OSL_CLOSURE_STRUCT_MEMBER(Microfacet, STRING, ustring, distribution, NULL)
+ OSL_CLOSURE_STRUCT_MEMBER(Microfacet, STRING, DeviceString, distribution, NULL)
OSL_CLOSURE_STRUCT_MEMBER(Microfacet, VECTOR, packed_float3, N, NULL)
OSL_CLOSURE_STRUCT_MEMBER(Microfacet, VECTOR, packed_float3, T, NULL)
OSL_CLOSURE_STRUCT_MEMBER(Microfacet, FLOAT, float, alpha_x, NULL)
@@ -210,7 +210,7 @@ OSL_CLOSURE_STRUCT_BEGIN(PhongRamp, phong_ramp)
OSL_CLOSURE_STRUCT_END(PhongRamp, phong_ramp)
OSL_CLOSURE_STRUCT_BEGIN(BSSRDF, bssrdf)
- OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, STRING, ustring, method, NULL)
+ OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, STRING, DeviceString, method, NULL)
OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, N, NULL)
OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, radius, NULL)
OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, albedo, NULL)
diff --git a/intern/cycles/kernel/osl/osl.h b/intern/cycles/kernel/osl/osl.h
index bef23f3eea1..cc5c81ad027 100644
--- a/intern/cycles/kernel/osl/osl.h
+++ b/intern/cycles/kernel/osl/osl.h
@@ -1,38 +1,171 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Adapted from Open Shading Language
+ * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
+ * All Rights Reserved.
+ *
+ * Modifications Copyright 2011-2022 Blender Foundation. */
#pragma once
/* OSL Shader Engine
*
- * Holds all variables to execute and use OSL shaders from the kernel. These
- * are initialized externally by OSLShaderManager before rendering starts.
- *
- * Before/after a thread starts rendering, thread_init/thread_free must be
- * called, which will store any per thread OSL state in thread local storage.
- * This means no thread state must be passed along in the kernel itself.
+ * Holds all variables to execute and use OSL shaders from the kernel.
*/
#include "kernel/osl/types.h"
+#include "kernel/osl/closures_setup.h"
+
CCL_NAMESPACE_BEGIN
-class OSLShader {
- public:
- /* eval */
- static void eval_surface(const KernelGlobalsCPU *kg,
- const void *state,
- ShaderData *sd,
- uint32_t path_flag);
- static void eval_background(const KernelGlobalsCPU *kg,
- const void *state,
- ShaderData *sd,
- uint32_t path_flag);
- static void eval_volume(const KernelGlobalsCPU *kg,
- const void *state,
- ShaderData *sd,
- uint32_t path_flag);
- static void eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd);
-};
+ccl_device_inline void shaderdata_to_shaderglobals(KernelGlobals kg,
+ ccl_private ShaderData *sd,
+ uint32_t path_flag,
+ ccl_private ShaderGlobals *globals)
+{
+ const differential3 dP = differential_from_compact(sd->Ng, sd->dP);
+ const differential3 dI = differential_from_compact(sd->I, sd->dI);
+
+ /* copy from shader data to shader globals */
+ globals->P = sd->P;
+ globals->dPdx = dP.dx;
+ globals->dPdy = dP.dy;
+ globals->I = sd->I;
+ globals->dIdx = dI.dx;
+ globals->dIdy = dI.dy;
+ globals->N = sd->N;
+ globals->Ng = sd->Ng;
+ globals->u = sd->u;
+ globals->dudx = sd->du.dx;
+ globals->dudy = sd->du.dy;
+ globals->v = sd->v;
+ globals->dvdx = sd->dv.dx;
+ globals->dvdy = sd->dv.dy;
+ globals->dPdu = sd->dPdu;
+ globals->dPdv = sd->dPdv;
+ globals->time = sd->time;
+ globals->dtime = 1.0f;
+ globals->surfacearea = 1.0f;
+ globals->raytype = path_flag;
+ globals->flipHandedness = 0;
+ globals->backfacing = (sd->flag & SD_BACKFACING);
+
+ /* shader data to be used in services callbacks */
+ globals->renderstate = sd;
+
+ /* hacky, we leave it to services to fetch actual object matrix */
+ globals->shader2common = sd;
+ globals->object2common = sd;
+
+ /* must be set to NULL before execute */
+ globals->Ci = nullptr;
+}
+
+ccl_device void flatten_closure_tree(KernelGlobals kg,
+ ccl_private ShaderData *sd,
+ uint32_t path_flag,
+ ccl_private const OSLClosure *closure)
+{
+ int stack_size = 0;
+ float3 weight = one_float3();
+ float3 weight_stack[16];
+ ccl_private const OSLClosure *closure_stack[16];
+
+ while (closure) {
+ switch (closure->id) {
+ case OSL_CLOSURE_MUL_ID: {
+ ccl_private const OSLClosureMul *mul = static_cast<ccl_private const OSLClosureMul *>(
+ closure);
+ weight *= mul->weight;
+ closure = mul->closure;
+ continue;
+ }
+ case OSL_CLOSURE_ADD_ID: {
+ if (stack_size >= 16) {
+ kernel_assert(!"Exhausted OSL closure stack");
+ break;
+ }
+ ccl_private const OSLClosureAdd *add = static_cast<ccl_private const OSLClosureAdd *>(
+ closure);
+ closure = add->closureA;
+ weight_stack[stack_size] = weight;
+ closure_stack[stack_size++] = add->closureB;
+ continue;
+ }
+#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
+ case OSL_CLOSURE_##Upper##_ID: { \
+ ccl_private const OSLClosureComponent *comp = \
+ static_cast<ccl_private const OSLClosureComponent *>(closure); \
+ osl_closure_##lower##_setup(kg, \
+ sd, \
+ path_flag, \
+ weight * comp->weight, \
+ reinterpret_cast<ccl_private const Upper##Closure *>(comp + 1)); \
+ break; \
+ }
+#include "closures_template.h"
+ default:
+ break;
+ }
+
+ if (stack_size > 0) {
+ weight = weight_stack[--stack_size];
+ closure = closure_stack[stack_size];
+ }
+ else {
+ closure = nullptr;
+ }
+ }
+}
+
+#ifndef __KERNEL_GPU__
+
+template<ShaderType type>
+void osl_eval_nodes(const KernelGlobalsCPU *kg,
+ const void *state,
+ ShaderData *sd,
+ uint32_t path_flag);
+
+#else
+
+template<ShaderType type, typename ConstIntegratorGenericState>
+ccl_device_inline void osl_eval_nodes(KernelGlobals kg,
+ ConstIntegratorGenericState state,
+ ccl_private ShaderData *sd,
+ uint32_t path_flag)
+{
+ ShaderGlobals globals;
+ shaderdata_to_shaderglobals(kg, sd, path_flag, &globals);
+
+ const int shader = sd->shader & SHADER_MASK;
+
+# ifdef __KERNEL_OPTIX__
+ uint8_t group_data[2048];
+ uint8_t closure_pool[1024];
+ sd->osl_closure_pool = closure_pool;
+
+ unsigned int optix_dc_index = 2 /* NUM_CALLABLE_PROGRAM_GROUPS */ +
+ (shader + type * kernel_data.max_shaders) * 2;
+ optixDirectCall<void>(optix_dc_index + 0,
+ /* shaderglobals_ptr = */ &globals,
+ /* groupdata_ptr = */ (void *)group_data,
+ /* userdata_base_ptr = */ (void *)nullptr,
+ /* output_base_ptr = */ (void *)nullptr,
+ /* shadeindex = */ 0);
+ optixDirectCall<void>(optix_dc_index + 1,
+ /* shaderglobals_ptr = */ &globals,
+ /* groupdata_ptr = */ (void *)group_data,
+ /* userdata_base_ptr = */ (void *)nullptr,
+ /* output_base_ptr = */ (void *)nullptr,
+ /* shadeindex = */ 0);
+# endif
+
+ if (globals.Ci) {
+ flatten_closure_tree(kg, sd, path_flag, globals.Ci);
+ }
+}
+
+#endif
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/services.cpp b/intern/cycles/kernel/osl/services.cpp
index b744422ee78..3fd098de4bb 100644
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -119,8 +119,8 @@ ustring OSLRenderServices::u_u("u");
ustring OSLRenderServices::u_v("v");
ustring OSLRenderServices::u_empty;
-OSLRenderServices::OSLRenderServices(OSL::TextureSystem *texture_system)
- : OSL::RendererServices(texture_system)
+OSLRenderServices::OSLRenderServices(OSL::TextureSystem *texture_system, int device_type)
+ : OSL::RendererServices(texture_system), device_type_(device_type)
{
}
@@ -131,6 +131,17 @@ OSLRenderServices::~OSLRenderServices()
}
}
+int OSLRenderServices::supports(string_view feature) const
+{
+#ifdef WITH_OPTIX
+ if (feature == "OptiX") {
+ return device_type_ == DEVICE_OPTIX;
+ }
+#endif
+
+ return false;
+}
+
bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
OSL::Matrix44 &result,
OSL::TransformationPtr xform,
@@ -1139,29 +1150,40 @@ TextureSystem::TextureHandle *OSLRenderServices::get_texture_handle(ustring file
{
OSLTextureHandleMap::iterator it = textures.find(filename);
- /* For non-OIIO textures, just return a pointer to our own OSLTextureHandle. */
- if (it != textures.end()) {
- if (it->second->type != OSLTextureHandle::OIIO) {
- return (TextureSystem::TextureHandle *)it->second.get();
+ if (device_type_ == DEVICE_CPU) {
+ /* For non-OIIO textures, just return a pointer to our own OSLTextureHandle. */
+ if (it != textures.end()) {
+ if (it->second->type != OSLTextureHandle::OIIO) {
+ return (TextureSystem::TextureHandle *)it->second.get();
+ }
}
- }
- /* Get handle from OpenImageIO. */
- OSL::TextureSystem *ts = m_texturesys;
- TextureSystem::TextureHandle *handle = ts->get_texture_handle(filename);
- if (handle == NULL) {
- return NULL;
- }
+ /* Get handle from OpenImageIO. */
+ OSL::TextureSystem *ts = m_texturesys;
+ TextureSystem::TextureHandle *handle = ts->get_texture_handle(filename);
+ if (handle == NULL) {
+ return NULL;
+ }
+
+ /* Insert new OSLTextureHandle if needed. */
+ if (it == textures.end()) {
+ textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::OIIO));
+ it = textures.find(filename);
+ }
- /* Insert new OSLTextureHandle if needed. */
- if (it == textures.end()) {
- textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::OIIO));
- it = textures.find(filename);
+ /* Assign OIIO texture handle and return. */
+ it->second->oiio_handle = handle;
+ return (TextureSystem::TextureHandle *)it->second.get();
}
+ else {
+ if (it != textures.end() && it->second->type == OSLTextureHandle::SVM &&
+ it->second->svm_slots[0].w == -1) {
+ return reinterpret_cast<TextureSystem::TextureHandle *>(
+ static_cast<uintptr_t>(it->second->svm_slots[0].y + 1));
+ }
- /* Assign OIIO texture handle and return. */
- it->second->oiio_handle = handle;
- return (TextureSystem::TextureHandle *)it->second.get();
+ return NULL;
+ }
}
bool OSLRenderServices::good(TextureSystem::TextureHandle *texture_handle)
diff --git a/intern/cycles/kernel/osl/services.h b/intern/cycles/kernel/osl/services.h
index 334b6682e34..9d875ae8e94 100644
--- a/intern/cycles/kernel/osl/services.h
+++ b/intern/cycles/kernel/osl/services.h
@@ -22,11 +22,8 @@ class PtexCache;
CCL_NAMESPACE_BEGIN
-class Object;
class Scene;
-class Shader;
struct ShaderData;
-struct float3;
struct KernelGlobalsCPU;
/* OSL Texture Handle
@@ -73,11 +70,13 @@ typedef OIIO::unordered_map_concurrent<ustring, OSLTextureHandleRef, ustringHash
class OSLRenderServices : public OSL::RendererServices {
public:
- OSLRenderServices(OSL::TextureSystem *texture_system);
+ OSLRenderServices(OSL::TextureSystem *texture_system, int device_type);
~OSLRenderServices();
static void register_closures(OSL::ShadingSystem *ss);
+ int supports(string_view feature) const override;
+
bool get_matrix(OSL::ShaderGlobals *sg,
OSL::Matrix44 &result,
OSL::TransformationPtr xform,
@@ -324,6 +323,9 @@ class OSLRenderServices : public OSL::RendererServices {
* and is required because texture handles are cached as part of the shared
* shading system. */
OSLTextureHandleMap textures;
+
+ private:
+ int device_type_;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/services_gpu.h b/intern/cycles/kernel/osl/services_gpu.h
new file mode 100644
index 00000000000..75cf39919a0
--- /dev/null
+++ b/intern/cycles/kernel/osl/services_gpu.h
@@ -0,0 +1,2176 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Adapted from Open Shading Language
+ * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
+ * All Rights Reserved.
+ *
+ * Modifications Copyright 2011-2022 Blender Foundation. */
+
+#include "kernel/tables.h"
+#include "kernel/util/differential.h"
+
+#include "kernel/osl/osl.h"
+
+namespace DeviceStrings {
+
+/* "" */
+ccl_device_constant DeviceString _emptystring_ = {0ull};
+/* "common" */
+ccl_device_constant DeviceString u_common = {14645198576927606093ull};
+/* "world" */
+ccl_device_constant DeviceString u_world = {16436542438370751598ull};
+/* "shader" */
+ccl_device_constant DeviceString u_shader = {4279676006089868ull};
+/* "object" */
+ccl_device_constant DeviceString u_object = {973692718279674627ull};
+/* "NDC" */
+ccl_device_constant DeviceString u_ndc = {5148305047403260775ull};
+/* "screen" */
+ccl_device_constant DeviceString u_screen = {14159088609039777114ull};
+/* "camera" */
+ccl_device_constant DeviceString u_camera = {2159505832145726196ull};
+/* "raster" */
+ccl_device_constant DeviceString u_raster = {7759263238610201778ull};
+/* "hsv" */
+ccl_device_constant DeviceString u_hsv = {2177035556331879497ull};
+/* "hsl" */
+ccl_device_constant DeviceString u_hsl = {7749766809258288148ull};
+/* "XYZ" */
+ccl_device_constant DeviceString u_xyz = {4957977063494975483ull};
+/* "xyY" */
+ccl_device_constant DeviceString u_xyy = {5138822319725660255ull};
+/* "sRGB" */
+ccl_device_constant DeviceString u_srgb = {15368599878474175032ull};
+/* "object:location" */
+ccl_device_constant DeviceString u_object_location = {7846190347358762897ull};
+/* "object:color" */
+ccl_device_constant DeviceString u_object_color = {12695623857059169556ull};
+/* "object:alpha" */
+ccl_device_constant DeviceString u_object_alpha = {11165053919428293151ull};
+/* "object:index" */
+ccl_device_constant DeviceString u_object_index = {6588325838217472556ull};
+/* "geom:dupli_generated" */
+ccl_device_constant DeviceString u_geom_dupli_generated = {6715607178003388908ull};
+/* "geom:dupli_uv" */
+ccl_device_constant DeviceString u_geom_dupli_uv = {1294253317490155849ull};
+/* "material:index" */
+ccl_device_constant DeviceString u_material_index = {741770758159634623ull};
+/* "object:random" */
+ccl_device_constant DeviceString u_object_random = {15789063994977955884ull};
+/* "particle:index" */
+ccl_device_constant DeviceString u_particle_index = {9489711748229903784ull};
+/* "particle:random" */
+ccl_device_constant DeviceString u_particle_random = {17993722202766855761ull};
+/* "particle:age" */
+ccl_device_constant DeviceString u_particle_age = {7380730644710951109ull};
+/* "particle:lifetime" */
+ccl_device_constant DeviceString u_particle_lifetime = {16576828923156200061ull};
+/* "particle:location" */
+ccl_device_constant DeviceString u_particle_location = {10309536211423573010ull};
+/* "particle:rotation" */
+ccl_device_constant DeviceString u_particle_rotation = {17858543768041168459ull};
+/* "particle:size" */
+ccl_device_constant DeviceString u_particle_size = {16461524249715420389ull};
+/* "particle:velocity" */
+ccl_device_constant DeviceString u_particle_velocity = {13199101248768308863ull};
+/* "particle:angular_velocity" */
+ccl_device_constant DeviceString u_particle_angular_velocity = {16327930120486517910ull};
+/* "geom:numpolyvertices" */
+ccl_device_constant DeviceString u_geom_numpolyvertices = {382043551489988826ull};
+/* "geom:trianglevertices" */
+ccl_device_constant DeviceString u_geom_trianglevertices = {17839267571524187074ull};
+/* "geom:polyvertices" */
+ccl_device_constant DeviceString u_geom_polyvertices = {1345577201967881769ull};
+/* "geom:name" */
+ccl_device_constant DeviceString u_geom_name = {13606338128269760050ull};
+/* "geom:undisplaced" */
+ccl_device_constant DeviceString u_geom_undisplaced = {12431586303019276305ull};
+/* "geom:is_smooth" */
+ccl_device_constant DeviceString u_is_smooth = {857544214094480123ull};
+/* "geom:is_curve" */
+ccl_device_constant DeviceString u_is_curve = {129742495633653138ull};
+/* "geom:curve_thickness" */
+ccl_device_constant DeviceString u_curve_thickness = {10605802038397633852ull};
+/* "geom:curve_length" */
+ccl_device_constant DeviceString u_curve_length = {11423459517663715453ull};
+/* "geom:curve_tangent_normal" */
+ccl_device_constant DeviceString u_curve_tangent_normal = {12301397394034985633ull};
+/* "geom:curve_random" */
+ccl_device_constant DeviceString u_curve_random = {15293085049960492358ull};
+/* "geom:is_point" */
+ccl_device_constant DeviceString u_is_point = {2511357849436175953ull};
+/* "geom:point_radius" */
+ccl_device_constant DeviceString u_point_radius = {9956381140398668479ull};
+/* "geom:point_position" */
+ccl_device_constant DeviceString u_point_position = {15684484280742966916ull};
+/* "geom:point_random" */
+ccl_device_constant DeviceString u_point_random = {5632627207092325544ull};
+/* "geom:normal_map_normal" */
+ccl_device_constant DeviceString u_normal_map_normal = {10718948685686827073};
+/* "path:ray_length" */
+ccl_device_constant DeviceString u_path_ray_length = {16391985802412544524ull};
+/* "path:ray_depth" */
+ccl_device_constant DeviceString u_path_ray_depth = {16643933224879500399ull};
+/* "path:diffuse_depth" */
+ccl_device_constant DeviceString u_path_diffuse_depth = {13191651286699118408ull};
+/* "path:glossy_depth" */
+ccl_device_constant DeviceString u_path_glossy_depth = {15717768399057252940ull};
+/* "path:transparent_depth" */
+ccl_device_constant DeviceString u_path_transparent_depth = {7821650266475578543ull};
+/* "path:transmission_depth" */
+ccl_device_constant DeviceString u_path_transmission_depth = {15113408892323917624ull};
+
+} // namespace DeviceStrings
+
+/* Closure */
+
+ccl_device_extern ccl_private OSLClosure *osl_mul_closure_color(ccl_private ShaderGlobals *sg,
+ ccl_private OSLClosure *a,
+ ccl_private const float3 *weight)
+{
+ if (*weight == zero_float3() || !a) {
+ return nullptr;
+ }
+ else if (*weight == one_float3()) {
+ return a;
+ }
+
+ ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+ ccl_private uint8_t *closure_pool = sd->osl_closure_pool;
+ /* Align pointer to closure struct requirement */
+ closure_pool = reinterpret_cast<uint8_t *>(
+ (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureMul) - 1) &
+ (-alignof(OSLClosureMul)));
+ sd->osl_closure_pool = closure_pool + sizeof(OSLClosureMul);
+
+ ccl_private OSLClosureMul *const closure = reinterpret_cast<ccl_private OSLClosureMul *>(
+ closure_pool);
+ closure->id = OSL_CLOSURE_MUL_ID;
+ closure->weight = *weight;
+ closure->closure = a;
+
+ return closure;
+}
+
+ccl_device_extern ccl_private OSLClosure *osl_mul_closure_float(ccl_private ShaderGlobals *sg,
+ ccl_private OSLClosure *a,
+ float weight)
+{
+ if (weight == 0.0f || !a) {
+ return nullptr;
+ }
+ else if (weight == 1.0f) {
+ return a;
+ }
+
+ ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+ uint8_t *closure_pool = sd->osl_closure_pool;
+ /* Align pointer to closure struct requirement */
+ closure_pool = reinterpret_cast<uint8_t *>(
+ (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureMul) - 1) &
+ (-alignof(OSLClosureMul)));
+ sd->osl_closure_pool = closure_pool + sizeof(OSLClosureMul);
+
+ ccl_private OSLClosureMul *const closure = reinterpret_cast<ccl_private OSLClosureMul *>(
+ closure_pool);
+ closure->id = OSL_CLOSURE_MUL_ID;
+ closure->weight = make_float3(weight, weight, weight);
+ closure->closure = a;
+
+ return closure;
+}
+
+ccl_device_extern ccl_private OSLClosure *osl_add_closure_closure(ccl_private ShaderGlobals *sg,
+ ccl_private OSLClosure *a,
+ ccl_private OSLClosure *b)
+{
+ if (!a) {
+ return b;
+ }
+ if (!b) {
+ return a;
+ }
+
+ ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+ ccl_private uint8_t *closure_pool = sd->osl_closure_pool;
+ /* Align pointer to closure struct requirement */
+ closure_pool = reinterpret_cast<uint8_t *>(
+ (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureAdd) - 1) &
+ (-alignof(OSLClosureAdd)));
+ sd->osl_closure_pool = closure_pool + sizeof(OSLClosureAdd);
+
+ ccl_private OSLClosureAdd *const closure = reinterpret_cast<ccl_private OSLClosureAdd *>(
+ closure_pool);
+ closure->id = OSL_CLOSURE_ADD_ID;
+ closure->closureA = a;
+ closure->closureB = b;
+
+ return closure;
+}
+
+ccl_device_extern ccl_private OSLClosure *osl_allocate_closure_component(
+ ccl_private ShaderGlobals *sg, int id, int size)
+{
+ ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+ ccl_private uint8_t *closure_pool = sd->osl_closure_pool;
+ /* Align pointer to closure struct requirement */
+ closure_pool = reinterpret_cast<uint8_t *>(
+ (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureComponent) - 1) &
+ (-alignof(OSLClosureComponent)));
+ sd->osl_closure_pool = closure_pool + sizeof(OSLClosureComponent) + size;
+
+ ccl_private OSLClosureComponent *const closure =
+ reinterpret_cast<ccl_private OSLClosureComponent *>(closure_pool);
+ closure->id = static_cast<OSLClosureType>(id);
+ closure->weight = one_float3();
+
+ return closure;
+}
+
+ccl_device_extern ccl_private OSLClosure *osl_allocate_weighted_closure_component(
+ ccl_private ShaderGlobals *sg, int id, int size, ccl_private const float3 *weight)
+{
+ ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+ ccl_private uint8_t *closure_pool = sd->osl_closure_pool;
+ /* Align pointer to closure struct requirement */
+ closure_pool = reinterpret_cast<uint8_t *>(
+ (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureComponent) - 1) &
+ (-alignof(OSLClosureComponent)));
+ sd->osl_closure_pool = closure_pool + sizeof(OSLClosureComponent) + size;
+
+ ccl_private OSLClosureComponent *const closure =
+ reinterpret_cast<ccl_private OSLClosureComponent *>(closure_pool);
+ closure->id = static_cast<OSLClosureType>(id);
+ closure->weight = *weight;
+
+ return closure;
+}
+
+/* Utilities */
+
+#include "kernel/svm/math_util.h"
+#include "kernel/util/color.h"
+
+ccl_device_extern void osl_error(ccl_private ShaderGlobals *sg, const char *format, void *args)
+{
+}
+
+ccl_device_extern void osl_printf(ccl_private ShaderGlobals *sg, const char *format, void *args)
+{
+}
+
+ccl_device_extern void osl_warning(ccl_private ShaderGlobals *sg, const char *format, void *args)
+{
+}
+
+ccl_device_extern uint osl_range_check(int indexvalue,
+ int length,
+ DeviceString symname,
+ ccl_private ShaderGlobals *sg,
+ DeviceString sourcefile,
+ int sourceline,
+ DeviceString groupname,
+ int layer,
+ DeviceString layername,
+ DeviceString shadername)
+{
+ const int result = indexvalue < 0 ? 0 : indexvalue >= length ? length - 1 : indexvalue;
+#if 0
+ if (result != indexvalue) {
+ printf("Index [%d] out of range\n", indexvalue);
+ }
+#endif
+ return result;
+}
+
+ccl_device_extern uint osl_range_check_err(int indexvalue,
+ int length,
+ DeviceString symname,
+ ccl_private ShaderGlobals *sg,
+ DeviceString sourcefile,
+ int sourceline,
+ DeviceString groupname,
+ int layer,
+ DeviceString layername,
+ DeviceString shadername)
+{
+ return osl_range_check(indexvalue,
+ length,
+ symname,
+ sg,
+ sourcefile,
+ sourceline,
+ groupname,
+ layer,
+ layername,
+ shadername);
+}
+
+/* Color Utilities */
+
+ccl_device_extern void osl_blackbody_vf(ccl_private ShaderGlobals *sg,
+ ccl_private float3 *result,
+ float temperature)
+{
+ float3 color_rgb = rec709_to_rgb(nullptr, svm_math_blackbody_color_rec709(temperature));
+ color_rgb = max(color_rgb, zero_float3());
+ *result = color_rgb;
+}
+
+#if 0
+ccl_device_extern void osl_wavelength_color_vf(ccl_private ShaderGlobals *sg,
+ ccl_private float3 *result,
+ float wavelength)
+{
+}
+#endif
+
+ccl_device_extern void osl_luminance_fv(ccl_private ShaderGlobals *sg,
+ ccl_private float *result,
+ ccl_private float3 *color)
+{
+ *result = linear_rgb_to_gray(nullptr, *color);
+}
+
+ccl_device_extern void osl_luminance_dfdv(ccl_private ShaderGlobals *sg,
+ ccl_private float *result,
+ ccl_private float3 *color)
+{
+ for (int i = 0; i < 3; ++i) {
+ osl_luminance_fv(sg, result + i, color + i);
+ }
+}
+
+ccl_device_extern void osl_prepend_color_from(ccl_private ShaderGlobals *sg,
+ ccl_private float3 *res,
+ DeviceString from)
+{
+ if (from == DeviceStrings::u_hsv) {
+ *res = hsv_to_rgb(*res);
+ }
+ else if (from == DeviceStrings::u_hsl) {
+ *res = hsl_to_rgb(*res);
+ }
+ else if (from == DeviceStrings::u_xyz) {
+ *res = xyz_to_rgb(nullptr, *res);
+ }
+ else if (from == DeviceStrings::u_xyy) {
+ *res = xyz_to_rgb(nullptr, xyY_to_xyz(res->x, res->y, res->z));
+ }
+}
+
+ccl_device_extern bool osl_transformc(ccl_private ShaderGlobals *sg,
+ ccl_private float3 *c_in,
+ int c_in_derivs,
+ ccl_private float3 *c_out,
+ int c_out_derivs,
+ DeviceString from,
+ DeviceString to)
+{
+ if (!c_out_derivs) {
+ c_in_derivs = false;
+ }
+ else if (!c_in_derivs) {
+ c_out[1] = zero_float3();
+ c_out[2] = zero_float3();
+ }
+
+ float3 rgb;
+
+ for (int i = 0; i < (c_in_derivs ? 3 : 1); ++i) {
+ if (from == DeviceStrings::u_hsv) {
+ rgb = hsv_to_rgb(c_in[i]);
+ }
+ else if (from == DeviceStrings::u_hsl) {
+ rgb = hsl_to_rgb(c_in[i]);
+ }
+ else if (from == DeviceStrings::u_xyz) {
+ rgb = xyz_to_rgb(nullptr, c_in[i]);
+ }
+ else if (from == DeviceStrings::u_xyy) {
+ rgb = xyz_to_rgb(nullptr, xyY_to_xyz(c_in[i].x, c_in[i].y, c_in[i].z));
+ }
+ else if (from == DeviceStrings::u_srgb) {
+ rgb = color_srgb_to_linear_v3(c_in[i]);
+ }
+ else {
+ rgb = c_in[i];
+ }
+
+ if (to == DeviceStrings::u_hsv) {
+ c_out[i] = rgb_to_hsv(rgb);
+ }
+ else if (to == DeviceStrings::u_hsl) {
+ c_out[i] = rgb_to_hsl(rgb);
+ }
+#if 0
+ else if (to == DeviceStrings::u_xyz) {
+ c_out[i] = rgb_to_xyz(nullptr, rgb);
+ }
+ else if (to == DeviceStrings::u_xyy) {
+ c_out[i] = xyz_to_xyY(rgb_to_xyz(nullptr, rgb));
+ }
+#endif
+ else if (to == DeviceStrings::u_srgb) {
+ c_out[i] = color_linear_to_srgb_v3(rgb);
+ }
+ else {
+ c_out[i] = rgb;
+ }
+ }
+
+ return true;
+}
+
+/* Matrix Utilities */
+
+#include "kernel/geom/object.h"
+#include "util/transform.h"
+
+ccl_device_forceinline void copy_matrix(ccl_private float *res, const Transform &tfm)
+{
+ res[0] = tfm.x.x;
+ res[1] = tfm.y.x;
+ res[2] = tfm.z.x;
+ res[3] = 0.0f;
+ res[4] = tfm.x.y;
+ res[5] = tfm.y.y;
+ res[6] = tfm.z.y;
+ res[7] = 0.0f;
+ res[8] = tfm.x.z;
+ res[9] = tfm.y.z;
+ res[10] = tfm.z.z;
+ res[11] = 0.0f;
+ res[12] = tfm.x.w;
+ res[13] = tfm.y.w;
+ res[14] = tfm.z.w;
+ res[15] = 1.0f;
+}
+ccl_device_forceinline void copy_matrix(ccl_private float *res, const ProjectionTransform &tfm)
+{
+ res[0] = tfm.x.x;
+ res[1] = tfm.y.x;
+ res[2] = tfm.z.x;
+ res[3] = tfm.w.x;
+ res[4] = tfm.x.y;
+ res[5] = tfm.y.y;
+ res[6] = tfm.z.y;
+ res[7] = tfm.w.y;
+ res[8] = tfm.x.z;
+ res[9] = tfm.y.z;
+ res[10] = tfm.z.z;
+ res[11] = tfm.w.z;
+ res[12] = tfm.x.w;
+ res[13] = tfm.y.w;
+ res[14] = tfm.z.w;
+ res[15] = tfm.w.w;
+}
+ccl_device_forceinline void copy_identity_matrix(ccl_private float *res, float value = 1.0f)
+{
+ res[0] = value;
+ res[1] = 0.0f;
+ res[2] = 0.0f;
+ res[3] = 0.0f;
+ res[4] = 0.0f;
+ res[5] = value;
+ res[6] = 0.0f;
+ res[7] = 0.0f;
+ res[8] = 0.0f;
+ res[9] = 0.0f;
+ res[10] = value;
+ res[11] = 0.0f;
+ res[12] = 0.0f;
+ res[13] = 0.0f;
+ res[14] = 0.0f;
+ res[15] = value;
+}
+ccl_device_forceinline Transform convert_transform(ccl_private const float *m)
+{
+ return make_transform(
+ m[0], m[4], m[8], m[12], m[1], m[5], m[9], m[13], m[2], m[6], m[10], m[14]);
+}
+
+ccl_device_extern void osl_mul_mmm(ccl_private float *res,
+ ccl_private const float *a,
+ ccl_private const float *b)
+{
+ const Transform tfm_a = convert_transform(a);
+ const Transform tfm_b = convert_transform(b);
+ copy_matrix(res, tfm_a * tfm_b);
+}
+
+ccl_device_extern void osl_mul_mmf(ccl_private float *res, ccl_private const float *a, float b)
+{
+ for (int i = 0; i < 16; ++i) {
+ res[i] = a[i] * b;
+ }
+}
+
+ccl_device_extern void osl_div_mmm(ccl_private float *res,
+ ccl_private const float *a,
+ ccl_private const float *b)
+{
+ const Transform tfm_a = convert_transform(a);
+ const Transform tfm_b = convert_transform(b);
+ copy_matrix(res, tfm_a * transform_inverse(tfm_b));
+}
+
+ccl_device_extern void osl_div_mmf(ccl_private float *res, ccl_private const float *a, float b)
+{
+ for (int i = 0; i < 16; ++i) {
+ res[i] = a[i] / b;
+ }
+}
+
+ccl_device_extern void osl_div_mfm(ccl_private float *res, float a, ccl_private const float *b)
+{
+ const Transform tfm_b = convert_transform(b);
+ copy_matrix(res, transform_inverse(tfm_b));
+ for (int i = 0; i < 16; ++i) {
+ res[i] *= a;
+ }
+}
+
+ccl_device_extern void osl_div_m_ff(ccl_private float *res, float a, float b)
+{
+ float f = (b == 0) ? 0.0f : (a / b);
+ copy_identity_matrix(res, f);
+}
+
+ccl_device_extern void osl_transform_vmv(ccl_private float3 *res,
+ ccl_private const float *m,
+ ccl_private const float3 *v)
+{
+ const Transform tfm_m = convert_transform(m);
+ *res = transform_point(&tfm_m, *v);
+}
+
+ccl_device_extern void osl_transform_dvmdv(ccl_private float3 *res,
+ ccl_private const float *m,
+ ccl_private const float3 *v)
+{
+ for (int i = 0; i < 3; ++i) {
+ const Transform tfm_m = convert_transform(m + i * 16);
+ res[i] = transform_point(&tfm_m, v[i]);
+ }
+}
+
+ccl_device_extern void osl_transformv_vmv(ccl_private float3 *res,
+ ccl_private const float *m,
+ ccl_private const float3 *v)
+{
+ const Transform tfm_m = convert_transform(m);
+ *res = transform_direction(&tfm_m, *v);
+}
+
+ccl_device_extern void osl_transformv_dvmdv(ccl_private float3 *res,
+ ccl_private const float *m,
+ ccl_private const float3 *v)
+{
+ for (int i = 0; i < 3; ++i) {
+ const Transform tfm_m = convert_transform(m + i * 16);
+ res[i] = transform_direction(&tfm_m, v[i]);
+ }
+}
+
+ccl_device_extern void osl_transformn_vmv(ccl_private float3 *res,
+ ccl_private const float *m,
+ ccl_private const float3 *v)
+{
+ const Transform tfm_m = convert_transform(m);
+ *res = transform_direction(&tfm_m, *v);
+}
+
+ccl_device_extern void osl_transformn_dvmdv(ccl_private float3 *res,
+ ccl_private const float *m,
+ ccl_private const float3 *v)
+{
+ for (int i = 0; i < 3; ++i) {
+ const Transform tfm_m = convert_transform(m + i * 16);
+ res[i] = transform_direction(&tfm_m, v[i]);
+ }
+}
+
+ccl_device_extern bool osl_get_matrix(ccl_private ShaderGlobals *sg,
+ ccl_private float *res,
+ DeviceString from)
+{
+ if (from == DeviceStrings::u_common || from == DeviceStrings::u_world) {
+ copy_identity_matrix(res);
+ return true;
+ }
+ if (from == DeviceStrings::u_shader || from == DeviceStrings::u_object) {
+ KernelGlobals kg = nullptr;
+ ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+ int object = sd->object;
+
+ if (object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
+ copy_matrix(res, tfm);
+ return true;
+ }
+ else if (sd->type == PRIMITIVE_LAMP) {
+ const Transform tfm = lamp_fetch_transform(kg, sd->lamp, false);
+ copy_matrix(res, tfm);
+ return true;
+ }
+ }
+ else if (from == DeviceStrings::u_ndc) {
+ copy_matrix(res, kernel_data.cam.ndctoworld);
+ return true;
+ }
+ else if (from == DeviceStrings::u_raster) {
+ copy_matrix(res, kernel_data.cam.rastertoworld);
+ return true;
+ }
+ else if (from == DeviceStrings::u_screen) {
+ copy_matrix(res, kernel_data.cam.screentoworld);
+ return true;
+ }
+ else if (from == DeviceStrings::u_camera) {
+ copy_matrix(res, kernel_data.cam.cameratoworld);
+ return true;
+ }
+
+ return false;
+}
+
+ccl_device_extern bool osl_get_inverse_matrix(ccl_private ShaderGlobals *sg,
+ ccl_private float *res,
+ DeviceString to)
+{
+ if (to == DeviceStrings::u_common || to == DeviceStrings::u_world) {
+ copy_identity_matrix(res);
+ return true;
+ }
+ if (to == DeviceStrings::u_shader || to == DeviceStrings::u_object) {
+ KernelGlobals kg = nullptr;
+ ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+ int object = sd->object;
+
+ if (object != OBJECT_NONE) {
+ const Transform itfm = object_get_inverse_transform(kg, sd);
+ copy_matrix(res, itfm);
+ return true;
+ }
+ else if (sd->type == PRIMITIVE_LAMP) {
+ const Transform itfm = lamp_fetch_transform(kg, sd->lamp, true);
+ copy_matrix(res, itfm);
+ return true;
+ }
+ }
+ else if (to == DeviceStrings::u_ndc) {
+ copy_matrix(res, kernel_data.cam.worldtondc);
+ return true;
+ }
+ else if (to == DeviceStrings::u_raster) {
+ copy_matrix(res, kernel_data.cam.worldtoraster);
+ return true;
+ }
+ else if (to == DeviceStrings::u_screen) {
+ copy_matrix(res, kernel_data.cam.worldtoscreen);
+ return true;
+ }
+ else if (to == DeviceStrings::u_camera) {
+ copy_matrix(res, kernel_data.cam.worldtocamera);
+ return true;
+ }
+
+ return false;
+}
+
+ccl_device_extern bool osl_prepend_matrix_from(ccl_private ShaderGlobals *sg,
+ ccl_private float *res,
+ DeviceString from)
+{
+ float m_from[16];
+ if (osl_get_matrix(sg, m_from, from)) {
+ osl_mul_mmm(res, m_from, res);
+ return true;
+ }
+
+ return false;
+}
+
+ccl_device_extern bool osl_get_from_to_matrix(ccl_private ShaderGlobals *sg,
+ ccl_private float *res,
+ DeviceString from,
+ DeviceString to)
+{
+ float m_from[16], m_to[16];
+ if (osl_get_matrix(sg, m_from, from) && osl_get_inverse_matrix(sg, m_to, to)) {
+ osl_mul_mmm(res, m_from, m_to);
+ return true;
+ }
+
+ return false;
+}
+
+ccl_device_extern bool osl_transform_triple(ccl_private ShaderGlobals *sg,
+ ccl_private float3 *p_in,
+ int p_in_derivs,
+ ccl_private float3 *p_out,
+ int p_out_derivs,
+ DeviceString from,
+ DeviceString to,
+ int vectype)
+{
+ if (!p_out_derivs) {
+ p_in_derivs = false;
+ }
+ else if (!p_in_derivs) {
+ p_out[1] = zero_float3();
+ p_out[2] = zero_float3();
+ }
+
+ bool res;
+ float m[16];
+
+ if (from == DeviceStrings::u_common) {
+ res = osl_get_inverse_matrix(sg, m, to);
+ }
+ else if (to == DeviceStrings::u_common) {
+ res = osl_get_matrix(sg, m, from);
+ }
+ else {
+ res = osl_get_from_to_matrix(sg, m, from, to);
+ }
+
+ if (res) {
+ if (vectype == 2 /* TypeDesc::POINT */) {
+ if (p_in_derivs)
+ osl_transform_dvmdv(p_out, m, p_in);
+ else
+ osl_transform_vmv(p_out, m, p_in);
+ }
+ else if (vectype == 3 /* TypeDesc::VECTOR */) {
+ if (p_in_derivs)
+ osl_transformv_dvmdv(p_out, m, p_in);
+ else
+ osl_transformv_vmv(p_out, m, p_in);
+ }
+ else if (vectype == 4 /* TypeDesc::NORMAL */) {
+ if (p_in_derivs)
+ osl_transformn_dvmdv(p_out, m, p_in);
+ else
+ osl_transformn_vmv(p_out, m, p_in);
+ }
+ else {
+ res = false;
+ }
+ }
+ else {
+ p_out[0] = p_in[0];
+ if (p_in_derivs) {
+ p_out[1] = p_in[1];
+ p_out[2] = p_in[2];
+ }
+ }
+
+ return res;
+}
+
+ccl_device_extern bool osl_transform_triple_nonlinear(ccl_private ShaderGlobals *sg,
+ ccl_private float3 *p_in,
+ int p_in_derivs,
+ ccl_private float3 *p_out,
+ int p_out_derivs,
+ DeviceString from,
+ DeviceString to,
+ int vectype)
+{
+ return osl_transform_triple(sg, p_in, p_in_derivs, p_out, p_out_derivs, from, to, vectype);
+}
+
+ccl_device_extern void osl_transpose_mm(ccl_private float *res, ccl_private const float *m)
+{
+ copy_matrix(res, *reinterpret_cast<ccl_private const ProjectionTransform *>(m));
+}
+
+#if 0
+ccl_device_extern float osl_determinant_fm(ccl_private const float *m)
+{
+}
+#endif
+
+/* Attributes */
+
+#include "kernel/geom/geom.h"
+
+typedef long long TypeDesc;
+
+ccl_device_inline bool set_attribute_float(ccl_private float fval[3],
+ TypeDesc type,
+ bool derivatives,
+ ccl_private void *val)
+{
+ const unsigned char type_basetype = type & 0xF;
+ const unsigned char type_aggregate = (type >> 8) & 0xF;
+ const int type_arraylen = type >> 32;
+
+ if (type_basetype == 11 /* TypeDesc::FLOAT */) {
+ if ((type_aggregate == 2 /* TypeDesc::VEC2 */) ||
+ (type_aggregate == 1 && type_arraylen == 2)) {
+ for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+ static_cast<ccl_private float *>(val)[i * 2 + 0] = fval[i];
+ static_cast<ccl_private float *>(val)[i * 2 + 1] = fval[i];
+ }
+ return true;
+ }
+ if ((type_aggregate == 3 /* TypeDesc::VEC3 */) ||
+ (type_aggregate == 1 && type_arraylen == 3)) {
+ for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+ static_cast<ccl_private float *>(val)[i * 3 + 0] = fval[i];
+ static_cast<ccl_private float *>(val)[i * 3 + 1] = fval[i];
+ static_cast<ccl_private float *>(val)[i * 3 + 2] = fval[i];
+ }
+ return true;
+ }
+ if ((type_aggregate == 4 /* TypeDesc::VEC4 */) ||
+ (type_aggregate == 1 && type_arraylen == 4)) {
+ for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+ static_cast<ccl_private float *>(val)[i * 4 + 0] = fval[i];
+ static_cast<ccl_private float *>(val)[i * 4 + 1] = fval[i];
+ static_cast<ccl_private float *>(val)[i * 4 + 2] = fval[i];
+ static_cast<ccl_private float *>(val)[i * 4 + 3] = 1.0f;
+ }
+ return true;
+ }
+ if ((type_aggregate == 1 /* TypeDesc::SCALAR */)) {
+ for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+ static_cast<ccl_private float *>(val)[i] = fval[i];
+ }
+ return true;
+ }
+ }
+
+ return false;
+}
+ccl_device_inline bool set_attribute_float(float f,
+ TypeDesc type,
+ bool derivatives,
+ ccl_private void *val)
+{
+ float fv[3];
+
+ fv[0] = f;
+ fv[1] = 0.0f;
+ fv[2] = 0.0f;
+
+ return set_attribute_float(fv, type, derivatives, val);
+}
+ccl_device_inline bool set_attribute_float2(ccl_private float2 fval[3],
+ TypeDesc type,
+ bool derivatives,
+ ccl_private void *val)
+{
+ const unsigned char type_basetype = type & 0xF;
+ const unsigned char type_aggregate = (type >> 8) & 0xF;
+ const int type_arraylen = type >> 32;
+
+ if (type_basetype == 11 /* TypeDesc::FLOAT */) {
+ if ((type_aggregate == 2 /* TypeDesc::VEC2 */) ||
+ (type_aggregate == 1 && type_arraylen == 2)) {
+ for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+ static_cast<ccl_private float *>(val)[i * 2 + 0] = fval[i].x;
+ static_cast<ccl_private float *>(val)[i * 2 + 1] = fval[i].y;
+ }
+ return true;
+ }
+ if ((type_aggregate == 3 /* TypeDesc::VEC3 */) ||
+ (type_aggregate == 1 && type_arraylen == 3)) {
+ for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+ static_cast<ccl_private float *>(val)[i * 3 + 0] = fval[i].x;
+ static_cast<ccl_private float *>(val)[i * 3 + 1] = fval[i].y;
+ static_cast<ccl_private float *>(val)[i * 3 + 2] = 0.0f;
+ }
+ return true;
+ }
+ if ((type_aggregate == 4 /* TypeDesc::VEC4 */) ||
+ (type_aggregate == 1 && type_arraylen == 4)) {
+ for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+ static_cast<ccl_private float *>(val)[i * 4 + 0] = fval[i].x;
+ static_cast<ccl_private float *>(val)[i * 4 + 1] = fval[i].y;
+ static_cast<ccl_private float *>(val)[i * 4 + 2] = 0.0f;
+ static_cast<ccl_private float *>(val)[i * 4 + 3] = 1.0f;
+ }
+ return true;
+ }
+ if ((type_aggregate == 1 /* TypeDesc::SCALAR */)) {
+ for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+ static_cast<ccl_private float *>(val)[i] = fval[i].x;
+ }
+ return true;
+ }
+ }
+
+ return false;
+}
+ccl_device_inline bool set_attribute_float3(ccl_private float3 fval[3],
+ TypeDesc type,
+ bool derivatives,
+ ccl_private void *val)
+{
+ const unsigned char type_basetype = type & 0xF;
+ const unsigned char type_aggregate = (type >> 8) & 0xF;
+ const int type_arraylen = type >> 32;
+
+ if (type_basetype == 11 /* TypeDesc::FLOAT */) {
+ if ((type_aggregate == 3 /* TypeDesc::VEC3 */) ||
+ (type_aggregate == 1 && type_arraylen == 3)) {
+ for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+ static_cast<ccl_private float *>(val)[i * 3 + 0] = fval[i].x;
+ static_cast<ccl_private float *>(val)[i * 3 + 1] = fval[i].y;
+ static_cast<ccl_private float *>(val)[i * 3 + 2] = fval[i].z;
+ }
+ return true;
+ }
+ if ((type_aggregate == 4 /* TypeDesc::VEC4 */) ||
+ (type_aggregate == 1 && type_arraylen == 4)) {
+ for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+ static_cast<ccl_private float *>(val)[i * 4 + 0] = fval[i].x;
+ static_cast<ccl_private float *>(val)[i * 4 + 1] = fval[i].y;
+ static_cast<ccl_private float *>(val)[i * 4 + 2] = fval[i].z;
+ static_cast<ccl_private float *>(val)[i * 4 + 3] = 1.0f;
+ }
+ return true;
+ }
+ if ((type_aggregate == 1 /* TypeDesc::SCALAR */)) {
+ for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+ static_cast<ccl_private float *>(val)[i] = average(fval[i]);
+ }
+ return true;
+ }
+ }
+
+ return false;
+}
+ccl_device_inline bool set_attribute_float3(float3 f,
+ TypeDesc type,
+ bool derivatives,
+ ccl_private void *val)
+{
+ float3 fv[3];
+
+ fv[0] = f;
+ fv[1] = make_float3(0.0f, 0.0f, 0.0f);
+ fv[2] = make_float3(0.0f, 0.0f, 0.0f);
+
+ return set_attribute_float3(fv, type, derivatives, val);
+}
+ccl_device_inline bool set_attribute_float4(ccl_private float4 fval[3],
+ TypeDesc type,
+ bool derivatives,
+ ccl_private void *val)
+{
+ const unsigned char type_basetype = type & 0xF;
+ const unsigned char type_aggregate = (type >> 8) & 0xF;
+ const int type_arraylen = type >> 32;
+
+ if (type_basetype == 11 /* TypeDesc::FLOAT */) {
+ if ((type_aggregate == 3 /* TypeDesc::VEC3 */) ||
+ (type_aggregate == 1 && type_arraylen == 3)) {
+ for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+ static_cast<ccl_private float *>(val)[i * 3 + 0] = fval[i].x;
+ static_cast<ccl_private float *>(val)[i * 3 + 1] = fval[i].y;
+ static_cast<ccl_private float *>(val)[i * 3 + 2] = fval[i].z;
+ }
+ return true;
+ }
+ if ((type_aggregate == 4 /* TypeDesc::VEC4 */) ||
+ (type_aggregate == 1 && type_arraylen == 4)) {
+ for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+ static_cast<ccl_private float *>(val)[i * 4 + 0] = fval[i].x;
+ static_cast<ccl_private float *>(val)[i * 4 + 1] = fval[i].y;
+ static_cast<ccl_private float *>(val)[i * 4 + 2] = fval[i].z;
+ static_cast<ccl_private float *>(val)[i * 4 + 3] = fval[i].w;
+ }
+ return true;
+ }
+ if ((type_aggregate == 1 /* TypeDesc::SCALAR */)) {
+ for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+ static_cast<ccl_private float *>(val)[i] = average(float4_to_float3(fval[i]));
+ }
+ return true;
+ }
+ }
+
+ return false;
+}
+ccl_device_inline bool set_attribute_matrix(ccl_private const Transform &tfm,
+ TypeDesc type,
+ ccl_private void *val)
+{
+ const unsigned char type_basetype = type & 0xF;
+ const unsigned char type_aggregate = (type >> 8) & 0xF;
+
+ if (type_basetype == 11 /* TypeDesc::FLOAT */ && type_aggregate == 16 /* TypeDesc::MATRIX44 */) {
+ copy_matrix(static_cast<ccl_private float *>(val), tfm);
+ return true;
+ }
+
+ return false;
+}
+
+ccl_device_inline bool get_background_attribute(KernelGlobals kg,
+ ccl_private ShaderData *sd,
+ DeviceString name,
+ TypeDesc type,
+ bool derivatives,
+ ccl_private void *val)
+{
+ if (name == DeviceStrings::u_path_ray_length) {
+ /* Ray Length */
+ float f = sd->ray_length;
+ return set_attribute_float(f, type, derivatives, val);
+ }
+
+ return false;
+}
+
+ccl_device_inline bool get_object_attribute(KernelGlobals kg,
+ ccl_private ShaderData *sd,
+ const AttributeDescriptor &desc,
+ TypeDesc type,
+ bool derivatives,
+ ccl_private void *val)
+{
+ if (desc.type == NODE_ATTR_FLOAT) {
+ float fval[3];
+#ifdef __VOLUME__
+ if (primitive_is_volume_attribute(sd, desc))
+ fval[0] = primitive_volume_attribute_float(kg, sd, desc);
+ else
+#endif
+ fval[0] = primitive_surface_attribute_float(
+ kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr);
+ return set_attribute_float(fval, type, derivatives, val);
+ }
+ else if (desc.type == NODE_ATTR_FLOAT2) {
+ float2 fval[3];
+#ifdef __VOLUME__
+ if (primitive_is_volume_attribute(sd, desc))
+ return false;
+ else
+#endif
+ fval[0] = primitive_surface_attribute_float2(
+ kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr);
+ return set_attribute_float2(fval, type, derivatives, val);
+ }
+ else if (desc.type == NODE_ATTR_FLOAT3) {
+ float3 fval[3];
+#ifdef __VOLUME__
+ if (primitive_is_volume_attribute(sd, desc))
+ fval[0] = primitive_volume_attribute_float3(kg, sd, desc);
+ else
+#endif
+ fval[0] = primitive_surface_attribute_float3(
+ kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr);
+ return set_attribute_float3(fval, type, derivatives, val);
+ }
+ else if (desc.type == NODE_ATTR_FLOAT4 || desc.type == NODE_ATTR_RGBA) {
+ float4 fval[3];
+#ifdef __VOLUME__
+ if (primitive_is_volume_attribute(sd, desc))
+ fval[0] = primitive_volume_attribute_float4(kg, sd, desc);
+ else
+#endif
+ fval[0] = primitive_surface_attribute_float4(
+ kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr);
+ return set_attribute_float4(fval, type, derivatives, val);
+ }
+ else if (desc.type == NODE_ATTR_MATRIX) {
+ Transform tfm = primitive_attribute_matrix(kg, desc);
+ return set_attribute_matrix(tfm, type, val);
+ }
+
+ return false;
+}
+
+ccl_device_inline bool get_object_standard_attribute(KernelGlobals kg,
+ ccl_private ShaderData *sd,
+ DeviceString name,
+ TypeDesc type,
+ bool derivatives,
+ ccl_private void *val)
+{
+ /* Object attributes */
+ if (name == DeviceStrings::u_object_location) {
+ float3 f = object_location(kg, sd);
+ return set_attribute_float3(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_object_color) {
+ float3 f = object_color(kg, sd->object);
+ return set_attribute_float3(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_object_alpha) {
+ float f = object_alpha(kg, sd->object);
+ return set_attribute_float(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_object_index) {
+ float f = object_pass_id(kg, sd->object);
+ return set_attribute_float(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_geom_dupli_generated) {
+ float3 f = object_dupli_generated(kg, sd->object);
+ return set_attribute_float3(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_geom_dupli_uv) {
+ float3 f = object_dupli_uv(kg, sd->object);
+ return set_attribute_float3(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_material_index) {
+ float f = shader_pass_id(kg, sd);
+ return set_attribute_float(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_object_random) {
+ float f = object_random_number(kg, sd->object);
+ return set_attribute_float(f, type, derivatives, val);
+ }
+
+ /* Particle attributes */
+ else if (name == DeviceStrings::u_particle_index) {
+ int particle_id = object_particle_id(kg, sd->object);
+ float f = particle_index(kg, particle_id);
+ return set_attribute_float(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_particle_random) {
+ int particle_id = object_particle_id(kg, sd->object);
+ float f = hash_uint2_to_float(particle_index(kg, particle_id), 0);
+ return set_attribute_float(f, type, derivatives, val);
+ }
+
+ else if (name == DeviceStrings::u_particle_age) {
+ int particle_id = object_particle_id(kg, sd->object);
+ float f = particle_age(kg, particle_id);
+ return set_attribute_float(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_particle_lifetime) {
+ int particle_id = object_particle_id(kg, sd->object);
+ float f = particle_lifetime(kg, particle_id);
+ return set_attribute_float(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_particle_location) {
+ int particle_id = object_particle_id(kg, sd->object);
+ float3 f = particle_location(kg, particle_id);
+ return set_attribute_float3(f, type, derivatives, val);
+ }
+#if 0 /* unsupported */
+ else if (name == DeviceStrings::u_particle_rotation) {
+ int particle_id = object_particle_id(kg, sd->object);
+ float4 f = particle_rotation(kg, particle_id);
+ return set_attribute_float4(f, type, derivatives, val);
+ }
+#endif
+ else if (name == DeviceStrings::u_particle_size) {
+ int particle_id = object_particle_id(kg, sd->object);
+ float f = particle_size(kg, particle_id);
+ return set_attribute_float(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_particle_velocity) {
+ int particle_id = object_particle_id(kg, sd->object);
+ float3 f = particle_velocity(kg, particle_id);
+ return set_attribute_float3(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_particle_angular_velocity) {
+ int particle_id = object_particle_id(kg, sd->object);
+ float3 f = particle_angular_velocity(kg, particle_id);
+ return set_attribute_float3(f, type, derivatives, val);
+ }
+
+ /* Geometry attributes */
+#if 0 /* TODO */
+ else if (name == DeviceStrings::u_geom_numpolyvertices) {
+ return false;
+ }
+ else if (name == DeviceStrings::u_geom_trianglevertices ||
+ name == DeviceStrings::u_geom_polyvertices) {
+ return false;
+ }
+ else if (name == DeviceStrings::u_geom_name) {
+ return false;
+ }
+#endif
+ else if (name == DeviceStrings::u_is_smooth) {
+ float f = ((sd->shader & SHADER_SMOOTH_NORMAL) != 0);
+ return set_attribute_float(f, type, derivatives, val);
+ }
+
+#ifdef __HAIR__
+ /* Hair attributes */
+ else if (name == DeviceStrings::u_is_curve) {
+ float f = (sd->type & PRIMITIVE_CURVE) != 0;
+ return set_attribute_float(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_curve_thickness) {
+ float f = curve_thickness(kg, sd);
+ return set_attribute_float(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_curve_tangent_normal) {
+ float3 f = curve_tangent_normal(kg, sd);
+ return set_attribute_float3(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_curve_random) {
+ float f = curve_random(kg, sd);
+ return set_attribute_float(f, type, derivatives, val);
+ }
+#endif
+
+#ifdef __POINTCLOUD__
+ /* Point attributes */
+ else if (name == DeviceStrings::u_is_point) {
+ float f = (sd->type & PRIMITIVE_POINT) != 0;
+ return set_attribute_float(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_point_radius) {
+ float f = point_radius(kg, sd);
+ return set_attribute_float(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_point_position) {
+ float3 f = point_position(kg, sd);
+ return set_attribute_float3(f, type, derivatives, val);
+ }
+ else if (name == DeviceStrings::u_point_random) {
+ float f = point_random(kg, sd);
+ return set_attribute_float(f, type, derivatives, val);
+ }
+#endif
+
+ else if (name == DeviceStrings::u_normal_map_normal) {
+ if (sd->type & PRIMITIVE_TRIANGLE) {
+ float3 f = triangle_smooth_normal_unnormalized(kg, sd, sd->Ng, sd->prim, sd->u, sd->v);
+ return set_attribute_float3(f, type, derivatives, val);
+ }
+ else {
+ return false;
+ }
+ }
+
+ return get_background_attribute(kg, sd, name, type, derivatives, val);
+}
+
+ccl_device_extern bool osl_get_attribute(ccl_private ShaderGlobals *sg,
+ int derivatives,
+ DeviceString object_name,
+ DeviceString name,
+ int array_lookup,
+ int index,
+ TypeDesc type,
+ ccl_private void *res)
+{
+ KernelGlobals kg = nullptr;
+ ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+ int object;
+
+ if (object_name != DeviceStrings::_emptystring_) {
+ /* TODO: Get object index from name */
+ return false;
+ }
+ else {
+ object = sd->object;
+ }
+
+ const uint64_t id = name.hash();
+
+ const AttributeDescriptor desc = find_attribute(kg, object, sd->prim, sd->type, id);
+ if (desc.offset != ATTR_STD_NOT_FOUND) {
+ return get_object_attribute(kg, sd, desc, type, derivatives, res);
+ }
+ else {
+ return get_object_standard_attribute(kg, sd, name, type, derivatives, res);
+ }
+}
+
+#if 0
+ccl_device_extern bool osl_bind_interpolated_param(ccl_private ShaderGlobals *sg,
+ DeviceString name,
+ long long type,
+ int userdata_has_derivs,
+ ccl_private void *userdata_data,
+ int symbol_has_derivs,
+ ccl_private void *symbol_data,
+ int symbol_data_size,
+ ccl_private void *userdata_initialized,
+ int userdata_index)
+{
+ return false;
+}
+#endif
+
+/* Noise */
+
+#include "kernel/svm/noise.h"
+#include "util/hash.h"
+
+ccl_device_extern uint osl_hash_ii(int x)
+{
+ return hash_uint(x);
+}
+
+ccl_device_extern uint osl_hash_if(float x)
+{
+ return hash_uint(__float_as_uint(x));
+}
+
+ccl_device_extern uint osl_hash_iff(float x, float y)
+{
+ return hash_uint2(__float_as_uint(x), __float_as_uint(y));
+}
+
+ccl_device_extern uint osl_hash_iv(ccl_private const float3 *v)
+{
+ return hash_uint3(__float_as_uint(v->x), __float_as_uint(v->y), __float_as_uint(v->z));
+}
+
+ccl_device_extern uint osl_hash_ivf(ccl_private const float3 *v, float w)
+{
+ return hash_uint4(
+ __float_as_uint(v->x), __float_as_uint(v->y), __float_as_uint(v->z), __float_as_uint(w));
+}
+
+ccl_device_extern OSLNoiseOptions *osl_get_noise_options(ccl_private ShaderGlobals *sg)
+{
+ return nullptr;
+}
+
+ccl_device_extern void osl_noiseparams_set_anisotropic(ccl_private OSLNoiseOptions *opt,
+ int anisotropic)
+{
+}
+
+ccl_device_extern void osl_noiseparams_set_do_filter(ccl_private OSLNoiseOptions *opt,
+ int do_filter)
+{
+}
+
+ccl_device_extern void osl_noiseparams_set_direction(ccl_private OSLNoiseOptions *opt,
+ float3 *direction)
+{
+}
+
+ccl_device_extern void osl_noiseparams_set_bandwidth(ccl_private OSLNoiseOptions *opt,
+ float bandwidth)
+{
+}
+
+ccl_device_extern void osl_noiseparams_set_impulses(ccl_private OSLNoiseOptions *opt,
+ float impulses)
+{
+}
+
+#define OSL_NOISE_IMPL(name, op) \
+ ccl_device_extern float name##_ff(float x) \
+ { \
+ return op##_1d(x); \
+ } \
+ ccl_device_extern float name##_fff(float x, float y) \
+ { \
+ return op##_2d(make_float2(x, y)); \
+ } \
+ ccl_device_extern float name##_fv(ccl_private const float3 *v) \
+ { \
+ return op##_3d(*v); \
+ } \
+ ccl_device_extern float name##_fvf(ccl_private const float3 *v, float w) \
+ { \
+ return op##_4d(make_float4(v->x, v->y, v->z, w)); \
+ } \
+ ccl_device_extern void name##_vf(ccl_private float3 *res, float x) \
+ { \
+ /* TODO: This is not correct. Really need to change the hash function inside the noise \
+ * function to spit out a vector instead of a scalar. */ \
+ const float n = name##_ff(x); \
+ res->x = n; \
+ res->y = n; \
+ res->z = n; \
+ } \
+ ccl_device_extern void name##_vff(ccl_private float3 *res, float x, float y) \
+ { \
+ const float n = name##_fff(x, y); \
+ res->x = n; \
+ res->y = n; \
+ res->z = n; \
+ } \
+ ccl_device_extern void name##_vv(ccl_private float3 *res, const float3 *v) \
+ { \
+ const float n = name##_fv(v); \
+ res->x = n; \
+ res->y = n; \
+ res->z = n; \
+ } \
+ ccl_device_extern void name##_vvf(ccl_private float3 *res, const float3 *v, float w) \
+ { \
+ const float n = name##_fvf(v, w); \
+ res->x = n; \
+ res->y = n; \
+ res->z = n; \
+ }
+
+ccl_device_forceinline float hashnoise_1d(float p)
+{
+ const uint x = __float_as_uint(p);
+ return hash_uint(x) / static_cast<float>(~0u);
+}
+ccl_device_forceinline float hashnoise_2d(float2 p)
+{
+ const uint x = __float_as_uint(p.x);
+ const uint y = __float_as_uint(p.y);
+ return hash_uint2(x, y) / static_cast<float>(~0u);
+}
+ccl_device_forceinline float hashnoise_3d(float3 p)
+{
+ const uint x = __float_as_uint(p.x);
+ const uint y = __float_as_uint(p.y);
+ const uint z = __float_as_uint(p.z);
+ return hash_uint3(x, y, z) / static_cast<float>(~0u);
+}
+ccl_device_forceinline float hashnoise_4d(float4 p)
+{
+ const uint x = __float_as_uint(p.x);
+ const uint y = __float_as_uint(p.y);
+ const uint z = __float_as_uint(p.z);
+ const uint w = __float_as_uint(p.w);
+ return hash_uint4(x, y, z, w) / static_cast<float>(~0u);
+}
+
+/* TODO: Implement all noise functions */
+OSL_NOISE_IMPL(osl_hashnoise, hashnoise)
+OSL_NOISE_IMPL(osl_noise, noise)
+OSL_NOISE_IMPL(osl_snoise, snoise)
+
+/* Texturing */
+
+ccl_device_extern ccl_private OSLTextureOptions *osl_get_texture_options(
+ ccl_private ShaderGlobals *sg)
+{
+ return nullptr;
+}
+
+ccl_device_extern void osl_texture_set_firstchannel(ccl_private OSLTextureOptions *opt,
+ int firstchannel)
+{
+}
+
+ccl_device_extern void osl_texture_set_swrap_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+ccl_device_extern void osl_texture_set_twrap_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+ccl_device_extern void osl_texture_set_rwrap_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+ccl_device_extern void osl_texture_set_stwrap_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+ccl_device_extern void osl_texture_set_sblur(ccl_private OSLTextureOptions *opt, float blur)
+{
+}
+
+ccl_device_extern void osl_texture_set_tblur(ccl_private OSLTextureOptions *opt, float blur)
+{
+}
+
+ccl_device_extern void osl_texture_set_rblur(ccl_private OSLTextureOptions *opt, float blur)
+{
+}
+
+ccl_device_extern void osl_texture_set_stblur(ccl_private OSLTextureOptions *opt, float blur)
+{
+}
+
+ccl_device_extern void osl_texture_set_swidth(ccl_private OSLTextureOptions *opt, float width)
+{
+}
+
+ccl_device_extern void osl_texture_set_twidth(ccl_private OSLTextureOptions *opt, float width)
+{
+}
+
+ccl_device_extern void osl_texture_set_rwidth(ccl_private OSLTextureOptions *opt, float width)
+{
+}
+
+ccl_device_extern void osl_texture_set_stwidth(ccl_private OSLTextureOptions *opt, float width)
+{
+}
+
+ccl_device_extern void osl_texture_set_fill(ccl_private OSLTextureOptions *opt, float fill)
+{
+}
+
+ccl_device_extern void osl_texture_set_time(ccl_private OSLTextureOptions *opt, float time)
+{
+}
+
+ccl_device_extern void osl_texture_set_interp_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+ccl_device_extern void osl_texture_set_subimage(ccl_private OSLTextureOptions *opt, int subimage)
+{
+}
+
+ccl_device_extern void osl_texture_set_missingcolor_arena(ccl_private OSLTextureOptions *opt,
+ ccl_private float3 *color)
+{
+}
+
+ccl_device_extern void osl_texture_set_missingcolor_alpha(ccl_private OSLTextureOptions *opt,
+ int nchannels,
+ float alpha)
+{
+}
+
+ccl_device_extern bool osl_texture(ccl_private ShaderGlobals *sg,
+ DeviceString filename,
+ ccl_private void *texture_handle,
+ OSLTextureOptions *opt,
+ float s,
+ float t,
+ float dsdx,
+ float dtdx,
+ float dsdy,
+ float dtdy,
+ int nchannels,
+ ccl_private float *result,
+ ccl_private float *dresultdx,
+ ccl_private float *dresultdy,
+ ccl_private float *alpha,
+ ccl_private float *dalphadx,
+ ccl_private float *dalphady,
+ ccl_private void *errormessage)
+{
+ if (!texture_handle) {
+ return false;
+ }
+
+ /* Only SVM textures are supported. */
+ int id = static_cast<int>(reinterpret_cast<size_t>(texture_handle) - 1);
+
+ const float4 rgba = kernel_tex_image_interp(nullptr, id, s, 1.0f - t);
+
+ result[0] = rgba.x;
+ if (nchannels > 1)
+ result[1] = rgba.y;
+ if (nchannels > 2)
+ result[2] = rgba.z;
+ if (nchannels > 3)
+ result[3] = rgba.w;
+
+ return true;
+}
+
+ccl_device_extern bool osl_texture3d(ccl_private ShaderGlobals *sg,
+ DeviceString filename,
+ ccl_private void *texture_handle,
+ OSLTextureOptions *opt,
+ ccl_private const float3 *P,
+ ccl_private const float3 *dPdx,
+ ccl_private const float3 *dPdy,
+ ccl_private const float3 *dPdz,
+ int nchannels,
+ ccl_private float *result,
+ ccl_private float *dresultds,
+ ccl_private float *dresultdt,
+ ccl_private float *alpha,
+ ccl_private float *dalphadx,
+ ccl_private float *dalphady,
+ ccl_private void *errormessage)
+{
+ if (!texture_handle) {
+ return false;
+ }
+
+ /* Only SVM textures are supported. */
+ int id = static_cast<int>(reinterpret_cast<size_t>(texture_handle) - 1);
+
+ const float4 rgba = kernel_tex_image_interp_3d(nullptr, id, *P, INTERPOLATION_NONE);
+
+ result[0] = rgba.x;
+ if (nchannels > 1)
+ result[1] = rgba.y;
+ if (nchannels > 2)
+ result[2] = rgba.z;
+ if (nchannels > 3)
+ result[3] = rgba.w;
+
+ return true;
+}
+
+ccl_device_extern bool osl_environment(ccl_private ShaderGlobals *sg,
+ DeviceString filename,
+ ccl_private void *texture_handle,
+ OSLTextureOptions *opt,
+ ccl_private const float3 *R,
+ ccl_private const float3 *dRdx,
+ ccl_private const float3 *dRdy,
+ int nchannels,
+ ccl_private float *result,
+ ccl_private float *dresultds,
+ ccl_private float *dresultdt,
+ ccl_private float *alpha,
+ ccl_private float *dalphax,
+ ccl_private float *dalphay,
+ ccl_private void *errormessage)
+{
+ result[0] = 1.0f;
+ if (nchannels > 1)
+ result[1] = 0.0f;
+ if (nchannels > 2)
+ result[2] = 1.0f;
+ if (nchannels > 3)
+ result[3] = 1.0f;
+
+ return false;
+}
+
+ccl_device_extern bool osl_get_textureinfo(ccl_private ShaderGlobals *sg,
+ DeviceString filename,
+ ccl_private void *texture_handle,
+ DeviceString dataname,
+ int basetype,
+ int arraylen,
+ int aggegrate,
+ ccl_private void *data,
+ ccl_private void *errormessage)
+{
+ return false;
+}
+
+ccl_device_extern bool osl_get_textureinfo_st(ccl_private ShaderGlobals *sg,
+ DeviceString filename,
+ ccl_private void *texture_handle,
+ float s,
+ float t,
+ DeviceString dataname,
+ int basetype,
+ int arraylen,
+ int aggegrate,
+ ccl_private void *data,
+ ccl_private void *errormessage)
+{
+ return osl_get_textureinfo(
+ sg, filename, texture_handle, dataname, basetype, arraylen, aggegrate, data, errormessage);
+}
+
+/* Standard library */
+
+#define OSL_OP_IMPL_II(name, op) \
+ ccl_device_extern int name##_ii(int a) \
+ { \
+ return op(a); \
+ }
+#define OSL_OP_IMPL_IF(name, op) \
+ ccl_device_extern int name##_if(float a) \
+ { \
+ return op(a); \
+ }
+#define OSL_OP_IMPL_FF(name, op) \
+ ccl_device_extern float name##_ff(float a) \
+ { \
+ return op(a); \
+ }
+#define OSL_OP_IMPL_DFDF(name, op) \
+ ccl_device_extern void name##_dfdf(ccl_private float *res, ccl_private const float *a) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[i]); \
+ } \
+ }
+#define OSL_OP_IMPL_DFDV(name, op) \
+ ccl_device_extern void name##_dfdv(ccl_private float *res, ccl_private const float3 *a) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[i]); \
+ } \
+ }
+#define OSL_OP_IMPL_FV(name, op) \
+ ccl_device_extern float name##_fv(ccl_private const float3 *a) \
+ { \
+ return op(*a); \
+ }
+#define OSL_OP_IMPL_VV(name, op) \
+ ccl_device_extern void name##_vv(ccl_private float3 *res, ccl_private const float3 *a) \
+ { \
+ *res = op(*a); \
+ }
+#define OSL_OP_IMPL_VV_(name, op) \
+ ccl_device_extern void name##_vv(ccl_private float3 *res, ccl_private const float3 *a) \
+ { \
+ res->x = op(a->x); \
+ res->y = op(a->y); \
+ res->z = op(a->z); \
+ }
+#define OSL_OP_IMPL_DVDV(name, op) \
+ ccl_device_extern void name##_dvdv(ccl_private float3 *res, ccl_private const float3 *a) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[i]); \
+ } \
+ }
+#define OSL_OP_IMPL_DVDV_(name, op) \
+ ccl_device_extern void name##_dvdv(ccl_private float3 *res, ccl_private const float3 *a) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i].x = op(a[i].x); \
+ res[i].y = op(a[i].y); \
+ res[i].z = op(a[i].z); \
+ } \
+ }
+
+#define OSL_OP_IMPL_III(name, op) \
+ ccl_device_extern int name##_iii(int a, int b) \
+ { \
+ return op(a, b); \
+ }
+#define OSL_OP_IMPL_FFF(name, op) \
+ ccl_device_extern float name##_fff(float a, float b) \
+ { \
+ return op(a, b); \
+ }
+#define OSL_OP_IMPL_FVV(name, op) \
+ ccl_device_extern float name##_fvv(ccl_private const float3 *a, ccl_private const float3 *b) \
+ { \
+ return op(*a, *b); \
+ }
+#define OSL_OP_IMPL_DFFDF(name, op) \
+ ccl_device_extern void name##_dffdf( \
+ ccl_private float *res, float a, ccl_private const float *b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a, b[i]); \
+ } \
+ }
+#define OSL_OP_IMPL_DFDFF(name, op) \
+ ccl_device_extern void name##_dfdff( \
+ ccl_private float *res, ccl_private const float *a, float b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[i], b); \
+ } \
+ }
+#define OSL_OP_IMPL_DFDFDF(name, op) \
+ ccl_device_extern void name##_dfdfdf( \
+ ccl_private float *res, ccl_private const float *a, ccl_private const float *b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[i], b[i]); \
+ } \
+ }
+#define OSL_OP_IMPL_DFVDV(name, op) \
+ ccl_device_extern void name##_dfvdv( \
+ ccl_private float *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[0], b[i]); \
+ } \
+ }
+#define OSL_OP_IMPL_DFDVV(name, op) \
+ ccl_device_extern void name##_dfdvv( \
+ ccl_private float *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[i], b[0]); \
+ } \
+ }
+#define OSL_OP_IMPL_DFDVDV(name, op) \
+ ccl_device_extern void name##_dfdvdv( \
+ ccl_private float *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[i], b[i]); \
+ } \
+ }
+#define OSL_OP_IMPL_VVF_(name, op) \
+ ccl_device_extern void name##_vvf( \
+ ccl_private float3 *res, ccl_private const float3 *a, float b) \
+ { \
+ res->x = op(a->x, b); \
+ res->y = op(a->y, b); \
+ res->z = op(a->z, b); \
+ }
+#define OSL_OP_IMPL_VVV(name, op) \
+ ccl_device_extern void name##_vvv( \
+ ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+ { \
+ *res = op(*a, *b); \
+ }
+#define OSL_OP_IMPL_VVV_(name, op) \
+ ccl_device_extern void name##_vvv( \
+ ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+ { \
+ res->x = op(a->x, b->x); \
+ res->y = op(a->y, b->y); \
+ res->z = op(a->z, b->z); \
+ }
+#define OSL_OP_IMPL_DVVDF_(name, op) \
+ ccl_device_extern void name##_dvvdf( \
+ ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float *b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i].x = op(a[0].x, b[i]); \
+ res[i].y = op(a[0].y, b[i]); \
+ res[i].z = op(a[0].z, b[i]); \
+ } \
+ }
+#define OSL_OP_IMPL_DVDVF_(name, op) \
+ ccl_device_extern void name##_dvdvf( \
+ ccl_private float3 *res, ccl_private const float3 *a, float b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i].x = op(a[i].x, b); \
+ res[i].y = op(a[i].y, b); \
+ res[i].z = op(a[i].z, b); \
+ } \
+ }
+#define OSL_OP_IMPL_DVVDV(name, op) \
+ ccl_device_extern void name##_dvvdv( \
+ ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[0], b[i]); \
+ } \
+ }
+#define OSL_OP_IMPL_DVVDV_(name, op) \
+ ccl_device_extern void name##_dvvdv( \
+ ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i].x = op(a[0].x, b[i].x); \
+ res[i].y = op(a[0].y, b[i].y); \
+ res[i].z = op(a[0].z, b[i].z); \
+ } \
+ }
+#define OSL_OP_IMPL_DVDVV(name, op) \
+ ccl_device_extern void name##_dvdvv( \
+ ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[i], b[0]); \
+ } \
+ }
+#define OSL_OP_IMPL_DVDVV_(name, op) \
+ ccl_device_extern void name##_dvdvv( \
+ ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i].x = op(a[i].x, b[0].x); \
+ res[i].y = op(a[i].y, b[0].y); \
+ res[i].z = op(a[i].z, b[0].z); \
+ } \
+ }
+#define OSL_OP_IMPL_DVDVDF_(name, op) \
+ ccl_device_extern void name##_dvdvdf( \
+ ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float *b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i].x = op(a[i].x, b[i]); \
+ res[i].y = op(a[i].y, b[i]); \
+ res[i].z = op(a[i].z, b[i]); \
+ } \
+ }
+#define OSL_OP_IMPL_DVDVDV(name, op) \
+ ccl_device_extern void name##_dvdvdv( \
+ ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[i], b[i]); \
+ } \
+ }
+#define OSL_OP_IMPL_DVDVDV_(name, op) \
+ ccl_device_extern void name##_dvdvdv( \
+ ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i].x = op(a[i].x, b[i].x); \
+ res[i].y = op(a[i].y, b[i].y); \
+ res[i].z = op(a[i].z, b[i].z); \
+ } \
+ }
+
+#define OSL_OP_IMPL_FFFF(name, op) \
+ ccl_device_extern float name##_ffff(float a, float b, float c) \
+ { \
+ return op(a, b, c); \
+ }
+#define OSL_OP_IMPL_DFFFDF(name, op) \
+ ccl_device_extern void name##_dfffdf( \
+ ccl_private float *res, float a, float b, ccl_private const float *c) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a, b, c[i]); \
+ } \
+ }
+#define OSL_OP_IMPL_DFFDFF(name, op) \
+ ccl_device_extern void name##_dffdff( \
+ ccl_private float *res, float a, ccl_private const float *b, float c) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a, b[i], c); \
+ } \
+ }
+#define OSL_OP_IMPL_DFFDFDF(name, op) \
+ ccl_device_extern void name##_dffdfdf( \
+ ccl_private float *res, float a, ccl_private const float *b, ccl_private const float *c) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a, b[i], c[i]); \
+ } \
+ }
+
+#define OSL_OP_IMPL_DFDFFF(name, op) \
+ ccl_device_extern void name##_dfdfff( \
+ ccl_private float *res, ccl_private const float *a, float b, float c) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[i], b, c); \
+ } \
+ }
+#define OSL_OP_IMPL_DFDFFDF(name, op) \
+ ccl_device_extern void name##_dfdffdf( \
+ ccl_private float *res, ccl_private const float *a, float b, ccl_private const float *c) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[i], b, c[i]); \
+ } \
+ }
+#define OSL_OP_IMPL_DFDFDFF(name, op) \
+ ccl_device_extern void name##_dfdfdff( \
+ ccl_private float *res, ccl_private const float *a, ccl_private const float *b, float c) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[i], b[i], c); \
+ } \
+ }
+#define OSL_OP_IMPL_DFDFDFDF(name, op) \
+ ccl_device_extern void name##_dfdfdfdf(ccl_private float *res, \
+ ccl_private const float *a, \
+ ccl_private const float *b, \
+ ccl_private const float *c) \
+ { \
+ for (int i = 0; i < 3; ++i) { \
+ res[i] = op(a[i], b[i], c[i]); \
+ } \
+ }
+
+#define OSL_OP_IMPL_XX(name, op) \
+ OSL_OP_IMPL_FF(name, op) \
+ OSL_OP_IMPL_DFDF(name, op) \
+ OSL_OP_IMPL_VV_(name, op) \
+ OSL_OP_IMPL_DVDV_(name, op)
+
+#define OSL_OP_IMPL_XXX(name, op) \
+ OSL_OP_IMPL_FFF(name, op) \
+ OSL_OP_IMPL_DFFDF(name, op) \
+ OSL_OP_IMPL_DFDFF(name, op) \
+ OSL_OP_IMPL_DFDFDF(name, op) \
+ OSL_OP_IMPL_VVV_(name, op) \
+ OSL_OP_IMPL_DVVDV_(name, op) \
+ OSL_OP_IMPL_DVDVV_(name, op) \
+ OSL_OP_IMPL_DVDVDV_(name, op)
+
+OSL_OP_IMPL_XX(osl_acos, acosf)
+OSL_OP_IMPL_XX(osl_asin, asinf)
+OSL_OP_IMPL_XX(osl_atan, atanf)
+OSL_OP_IMPL_XXX(osl_atan2, atan2f)
+OSL_OP_IMPL_XX(osl_cos, cosf)
+OSL_OP_IMPL_XX(osl_sin, sinf)
+OSL_OP_IMPL_XX(osl_tan, tanf)
+OSL_OP_IMPL_XX(osl_cosh, coshf)
+OSL_OP_IMPL_XX(osl_sinh, sinhf)
+OSL_OP_IMPL_XX(osl_tanh, tanhf)
+
+ccl_device_forceinline int safe_divide(int a, int b)
+{
+ return (b != 0) ? a / b : 0;
+}
+ccl_device_forceinline int safe_modulo(int a, int b)
+{
+ return (b != 0) ? a % b : 0;
+}
+
+OSL_OP_IMPL_III(osl_safe_div, safe_divide)
+OSL_OP_IMPL_FFF(osl_safe_div, safe_divide)
+OSL_OP_IMPL_III(osl_safe_mod, safe_modulo)
+
+ccl_device_extern void osl_sincos_fff(float a, ccl_private float *b, ccl_private float *c)
+{
+ sincos(a, b, c);
+}
+ccl_device_extern void osl_sincos_dfdff(ccl_private const float *a,
+ ccl_private float *b,
+ ccl_private float *c)
+{
+ for (int i = 0; i < 3; ++i)
+ sincos(a[i], b + i, c);
+}
+ccl_device_extern void osl_sincos_dffdf(ccl_private const float *a,
+ ccl_private float *b,
+ ccl_private float *c)
+{
+ for (int i = 0; i < 3; ++i)
+ sincos(a[i], b, c + i);
+}
+ccl_device_extern void osl_sincos_dfdfdf(ccl_private const float *a,
+ ccl_private float *b,
+ ccl_private float *c)
+{
+ for (int i = 0; i < 3; ++i)
+ sincos(a[i], b + i, c + i);
+}
+ccl_device_extern void osl_sincos_vvv(ccl_private const float3 *a,
+ ccl_private float3 *b,
+ ccl_private float3 *c)
+{
+ sincos(a->x, &b->x, &c->x);
+ sincos(a->y, &b->y, &c->y);
+ sincos(a->z, &b->z, &c->z);
+}
+ccl_device_extern void osl_sincos_dvdvv(ccl_private const float3 *a,
+ ccl_private float3 *b,
+ ccl_private float3 *c)
+{
+ for (int i = 0; i < 3; ++i) {
+ sincos(a[i].x, &b[i].x, &c->x);
+ sincos(a[i].y, &b[i].y, &c->y);
+ sincos(a[i].z, &b[i].z, &c->z);
+ }
+}
+ccl_device_extern void osl_sincos_dvvdv(ccl_private const float3 *a,
+ ccl_private float3 *b,
+ ccl_private float3 *c)
+{
+ for (int i = 0; i < 3; ++i) {
+ sincos(a[i].x, &b->x, &c[i].x);
+ sincos(a[i].y, &b->y, &c[i].y);
+ sincos(a[i].z, &b->z, &c[i].z);
+ }
+}
+ccl_device_extern void osl_sincos_dvdvdv(ccl_private const float3 *a,
+ ccl_private float3 *b,
+ ccl_private float3 *c)
+{
+ for (int i = 0; i < 3; ++i) {
+ sincos(a[i].x, &b[i].x, &c[i].x);
+ sincos(a[i].y, &b[i].y, &c[i].y);
+ sincos(a[i].z, &b[i].z, &c[i].z);
+ }
+}
+
+OSL_OP_IMPL_XX(osl_log, logf)
+OSL_OP_IMPL_XX(osl_log2, log2f)
+OSL_OP_IMPL_XX(osl_log10, log10f)
+OSL_OP_IMPL_XX(osl_exp, expf)
+OSL_OP_IMPL_XX(osl_exp2, exp2f)
+OSL_OP_IMPL_XX(osl_expm1, expm1f)
+OSL_OP_IMPL_XX(osl_erf, erff)
+OSL_OP_IMPL_XX(osl_erfc, erfcf)
+
+OSL_OP_IMPL_XXX(osl_pow, safe_powf)
+OSL_OP_IMPL_VVF_(osl_pow, safe_powf)
+OSL_OP_IMPL_DVVDF_(osl_pow, safe_powf)
+OSL_OP_IMPL_DVDVF_(osl_pow, safe_powf)
+OSL_OP_IMPL_DVDVDF_(osl_pow, safe_powf)
+
+OSL_OP_IMPL_XX(osl_sqrt, sqrtf)
+OSL_OP_IMPL_XX(osl_inversesqrt, 1.0f / sqrtf)
+OSL_OP_IMPL_XX(osl_cbrt, cbrtf)
+
+OSL_OP_IMPL_FF(osl_logb, logbf)
+OSL_OP_IMPL_VV_(osl_logb, logbf)
+
+OSL_OP_IMPL_FF(osl_floor, floorf)
+OSL_OP_IMPL_VV_(osl_floor, floorf)
+OSL_OP_IMPL_FF(osl_ceil, ceilf)
+OSL_OP_IMPL_VV_(osl_ceil, ceilf)
+OSL_OP_IMPL_FF(osl_round, roundf)
+OSL_OP_IMPL_VV_(osl_round, roundf)
+OSL_OP_IMPL_FF(osl_trunc, truncf)
+OSL_OP_IMPL_VV_(osl_trunc, truncf)
+
+ccl_device_forceinline float step_impl(float edge, float x)
+{
+ return x < edge ? 0.0f : 1.0f;
+}
+
+OSL_OP_IMPL_FF(osl_sign, compatible_signf)
+OSL_OP_IMPL_VV_(osl_sign, compatible_signf)
+OSL_OP_IMPL_FFF(osl_step, step_impl)
+OSL_OP_IMPL_VVV_(osl_step, step_impl)
+
+OSL_OP_IMPL_IF(osl_isnan, isnan)
+OSL_OP_IMPL_IF(osl_isinf, isinf)
+OSL_OP_IMPL_IF(osl_isfinite, isfinite)
+
+OSL_OP_IMPL_II(osl_abs, abs)
+OSL_OP_IMPL_XX(osl_abs, fabsf)
+OSL_OP_IMPL_II(osl_fabs, abs)
+OSL_OP_IMPL_XX(osl_fabs, fabsf)
+OSL_OP_IMPL_XXX(osl_fmod, safe_modulo)
+
+OSL_OP_IMPL_FFFF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFFFDF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFFDFF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFFDFDF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFDFFF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFDFFDF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFDFDFF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFDFDFDF(osl_smoothstep, smoothstep)
+
+OSL_OP_IMPL_FVV(osl_dot, dot)
+OSL_OP_IMPL_DFDVV(osl_dot, dot)
+OSL_OP_IMPL_DFVDV(osl_dot, dot)
+OSL_OP_IMPL_DFDVDV(osl_dot, dot)
+OSL_OP_IMPL_VVV(osl_cross, cross)
+OSL_OP_IMPL_DVDVV(osl_cross, cross)
+OSL_OP_IMPL_DVVDV(osl_cross, cross)
+OSL_OP_IMPL_DVDVDV(osl_cross, cross)
+OSL_OP_IMPL_FV(osl_length, len)
+OSL_OP_IMPL_DFDV(osl_length, len)
+OSL_OP_IMPL_FVV(osl_distance, distance)
+OSL_OP_IMPL_DFDVV(osl_distance, distance)
+OSL_OP_IMPL_DFVDV(osl_distance, distance)
+OSL_OP_IMPL_DFDVDV(osl_distance, distance)
+OSL_OP_IMPL_VV(osl_normalize, safe_normalize)
+OSL_OP_IMPL_DVDV(osl_normalize, safe_normalize)
+
+ccl_device_extern void osl_calculatenormal(ccl_private float3 *res,
+ ccl_private ShaderGlobals *sg,
+ ccl_private const float3 *p)
+{
+ if (sg->flipHandedness)
+ *res = cross(p[2], p[1]);
+ else
+ *res = cross(p[1], p[2]);
+}
+
+ccl_device_extern float osl_area(ccl_private const float3 *p)
+{
+ return len(cross(p[2], p[1]));
+}
+
+ccl_device_extern float osl_filterwidth_fdf(ccl_private const float *x)
+{
+ return sqrtf(x[1] * x[1] + x[2] * x[2]);
+}
+
+ccl_device_extern void osl_filterwidth_vdv(ccl_private float *res, ccl_private const float *x)
+{
+ for (int i = 0; i < 3; ++i)
+ res[i] = osl_filterwidth_fdf(x + i);
+}
+
+ccl_device_extern bool osl_raytype_bit(ccl_private ShaderGlobals *sg, int bit)
+{
+ return (sg->raytype & bit) != 0;
+}
diff --git a/intern/cycles/kernel/osl/services_optix.cu b/intern/cycles/kernel/osl/services_optix.cu
new file mode 100644
index 00000000000..2a43a89a956
--- /dev/null
+++ b/intern/cycles/kernel/osl/services_optix.cu
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#define WITH_OSL
+
+// clang-format off
+#include "kernel/device/optix/compat.h"
+#include "kernel/device/optix/globals.h"
+
+#include "kernel/device/gpu/image.h" /* Texture lookup uses normal CUDA intrinsics. */
+
+#include "kernel/osl/services_gpu.h"
+// clang-format on
+
+extern "C" __device__ void __direct_callable__dummy_services()
+{
+}
diff --git a/intern/cycles/kernel/osl/shaders/node_geometry.osl b/intern/cycles/kernel/osl/shaders/node_geometry.osl
index cc891abd6e3..5d9284deac2 100644
--- a/intern/cycles/kernel/osl/shaders/node_geometry.osl
+++ b/intern/cycles/kernel/osl/shaders/node_geometry.osl
@@ -3,8 +3,7 @@
#include "stdcycles.h"
-shader node_geometry(normal NormalIn = N,
- string bump_offset = "center",
+shader node_geometry(string bump_offset = "center",
output point Position = point(0.0, 0.0, 0.0),
output normal Normal = normal(0.0, 0.0, 0.0),
@@ -17,7 +16,7 @@ shader node_geometry(normal NormalIn = N,
output float RandomPerIsland = 0.0)
{
Position = P;
- Normal = NormalIn;
+ Normal = N;
TrueNormal = Ng;
Incoming = I;
Parametric = point(1.0 - u - v, u, 0.0);
diff --git a/intern/cycles/kernel/osl/shaders/node_normal_map.osl b/intern/cycles/kernel/osl/shaders/node_normal_map.osl
index 3cda485c686..7e41bbf1720 100644
--- a/intern/cycles/kernel/osl/shaders/node_normal_map.osl
+++ b/intern/cycles/kernel/osl/shaders/node_normal_map.osl
@@ -3,13 +3,12 @@
#include "stdcycles.h"
-shader node_normal_map(normal NormalIn = N,
- float Strength = 1.0,
+shader node_normal_map(float Strength = 1.0,
color Color = color(0.5, 0.5, 1.0),
string space = "tangent",
string attr_name = "geom:tangent",
string attr_sign_name = "geom:tangent_sign",
- output normal Normal = NormalIn)
+ output normal Normal = N)
{
color mcolor = 2.0 * color(Color[0] - 0.5, Color[1] - 0.5, Color[2] - 0.5);
int is_backfacing = backfacing();
@@ -71,5 +70,5 @@ shader node_normal_map(normal NormalIn = N,
}
if (Strength != 1.0)
- Normal = normalize(NormalIn + (Normal - NormalIn) * max(Strength, 0.0));
+ Normal = normalize(N + (Normal - N) * max(Strength, 0.0));
}
diff --git a/intern/cycles/kernel/osl/shaders/node_tangent.osl b/intern/cycles/kernel/osl/shaders/node_tangent.osl
index a302c001f08..b3808778b2f 100644
--- a/intern/cycles/kernel/osl/shaders/node_tangent.osl
+++ b/intern/cycles/kernel/osl/shaders/node_tangent.osl
@@ -3,8 +3,7 @@
#include "stdcycles.h"
-shader node_tangent(normal NormalIn = N,
- string attr_name = "geom:tangent",
+shader node_tangent(string attr_name = "geom:tangent",
string direction_type = "radial",
string axis = "z",
output normal Tangent = normalize(dPdu))
@@ -29,5 +28,5 @@ shader node_tangent(normal NormalIn = N,
}
T = transform("object", "world", T);
- Tangent = cross(NormalIn, normalize(cross(T, NormalIn)));
+ Tangent = cross(N, normalize(cross(T, N)));
}
diff --git a/intern/cycles/kernel/osl/shaders/node_texture_coordinate.osl b/intern/cycles/kernel/osl/shaders/node_texture_coordinate.osl
index 24875ce140a..cd2fdae3cb3 100644
--- a/intern/cycles/kernel/osl/shaders/node_texture_coordinate.osl
+++ b/intern/cycles/kernel/osl/shaders/node_texture_coordinate.osl
@@ -4,7 +4,6 @@
#include "stdcycles.h"
shader node_texture_coordinate(
- normal NormalIn = N,
int is_background = 0,
int is_volume = 0,
int from_dupli = 0,
@@ -27,7 +26,7 @@ shader node_texture_coordinate(
point Pcam = transform("camera", "world", point(0, 0, 0));
Camera = transform("camera", P + Pcam);
getattribute("NDC", Window);
- Normal = NormalIn;
+ Normal = N;
Reflection = I;
}
else {
@@ -59,8 +58,8 @@ shader node_texture_coordinate(
}
Camera = transform("camera", P);
Window = transform("NDC", P);
- Normal = transform("world", "object", NormalIn);
- Reflection = -reflect(I, NormalIn);
+ Normal = transform("world", "object", N);
+ Reflection = -reflect(I, N);
}
if (bump_offset == "dx") {
diff --git a/intern/cycles/kernel/osl/types.h b/intern/cycles/kernel/osl/types.h
index 46e06114360..717306a3d07 100644
--- a/intern/cycles/kernel/osl/types.h
+++ b/intern/cycles/kernel/osl/types.h
@@ -5,9 +5,53 @@
CCL_NAMESPACE_BEGIN
+struct DeviceString {
+#if defined(__KERNEL_GPU__)
+ /* Strings are represented by their hashes in CUDA and OptiX. */
+ size_t str_;
+
+ ccl_device_inline_method uint64_t hash() const
+ {
+ return str_;
+ }
+#elif defined(OPENIMAGEIO_USTRING_H)
+ ustring str_;
+
+ ccl_device_inline_method uint64_t hash() const
+ {
+ return str_.hash();
+ }
+#else
+ const char *str_;
+#endif
+
+ ccl_device_inline_method bool operator==(DeviceString b) const
+ {
+ return str_ == b.str_;
+ }
+ ccl_device_inline_method bool operator!=(DeviceString b) const
+ {
+ return str_ != b.str_;
+ }
+};
+
+ccl_device_inline DeviceString make_string(const char *str, size_t hash)
+{
+#if defined(__KERNEL_GPU__)
+ (void)str;
+ return {hash};
+#elif defined(OPENIMAGEIO_USTRING_H)
+ (void)hash;
+ return {ustring(str)};
+#else
+ (void)hash;
+ return {str};
+#endif
+}
+
/* Closure */
-enum ClosureTypeOSL {
+enum OSLClosureType {
OSL_CLOSURE_MUL_ID = -1,
OSL_CLOSURE_ADD_ID = -2,
@@ -17,4 +61,60 @@ enum ClosureTypeOSL {
#include "closures_template.h"
};
+struct OSLClosure {
+ OSLClosureType id;
+};
+
+struct ccl_align(8) OSLClosureMul : public OSLClosure
+{
+ packed_float3 weight;
+ ccl_private const OSLClosure *closure;
+};
+
+struct ccl_align(8) OSLClosureAdd : public OSLClosure
+{
+ ccl_private const OSLClosure *closureA;
+ ccl_private const OSLClosure *closureB;
+};
+
+struct ccl_align(8) OSLClosureComponent : public OSLClosure
+{
+ packed_float3 weight;
+};
+
+/* Globals */
+
+struct ShaderGlobals {
+ packed_float3 P, dPdx, dPdy;
+ packed_float3 dPdz;
+ packed_float3 I, dIdx, dIdy;
+ packed_float3 N;
+ packed_float3 Ng;
+ float u, dudx, dudy;
+ float v, dvdx, dvdy;
+ packed_float3 dPdu, dPdv;
+ float time;
+ float dtime;
+ packed_float3 dPdtime;
+ packed_float3 Ps, dPsdx, dPsdy;
+ ccl_private void *renderstate;
+ ccl_private void *tracedata;
+ ccl_private void *objdata;
+ void *context;
+ void *renderer;
+ ccl_private void *object2common;
+ ccl_private void *shader2common;
+ ccl_private OSLClosure *Ci;
+ float surfacearea;
+ int raytype;
+ int flipHandedness;
+ int backfacing;
+};
+
+struct OSLNoiseOptions {
+};
+
+struct OSLTextureOptions {
+};
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/noise.h b/intern/cycles/kernel/svm/noise.h
index 31e77d87413..209195a03f1 100644
--- a/intern/cycles/kernel/svm/noise.h
+++ b/intern/cycles/kernel/svm/noise.h
@@ -39,11 +39,11 @@ ccl_device_noinline_cpu float perlin_1d(float x)
}
/* 2D, 3D, and 4D noise can be accelerated using SSE, so we first check if
- * SSE is supported, that is, if __KERNEL_SSE2__ is defined. If it is not
+ * SSE is supported, that is, if __KERNEL_SSE__ is defined. If it is not
* supported, we do a standard implementation, but if it is supported, we
* do an implementation using SSE intrinsics.
*/
-#if !defined(__KERNEL_SSE2__)
+#if !defined(__KERNEL_SSE__)
/* ** Standard Implementation ** */
@@ -250,18 +250,18 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
/* SSE Bilinear Interpolation:
*
- * The function takes two ssef inputs:
+ * The function takes two float4 inputs:
* - p : Contains the values at the points (v0, v1, v2, v3).
* - f : Contains the values (x, y, _, _). The third and fourth values are unused.
*
* The interpolation is done in two steps:
* 1. Interpolate (v0, v1) and (v2, v3) along the x axis to get g (g0, g1).
* (v2, v3) is generated by moving v2 and v3 to the first and second
- * places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
+ * places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and
* fourth values are unused.
* 2. Interpolate g0 and g1 along the y axis to get the final value.
- * g1 is generated by populating an ssef with the second value of g.
- * Only the first value is important in the final ssef.
+ * g1 is generated by populating an float4 with the second value of g.
+ * Only the first value is important in the final float4.
*
* v1 v3 g1
* @ + + + + @ @ y
@@ -272,27 +272,27 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
* v0 v2 g0
*
*/
-ccl_device_inline ssef bi_mix(ssef p, ssef f)
+ccl_device_inline float4 bi_mix(float4 p, float4 f)
{
- ssef g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f));
+ float4 g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f));
return mix(g, shuffle<1>(g), shuffle<1>(f));
}
-ccl_device_inline ssef fade(const ssef &t)
+ccl_device_inline float4 fade(const float4 t)
{
- ssef a = madd(t, 6.0f, -15.0f);
- ssef b = madd(t, a, 10.0f);
+ float4 a = madd(t, make_float4(6.0f), make_float4(-15.0f));
+ float4 b = madd(t, a, make_float4(10.0f));
return (t * t) * (t * b);
}
/* Negate val if the nth bit of h is 1. */
# define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n))))
-ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
+ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y)
{
- ssei h = hash & 7;
- ssef u = select(h < 4, x, y);
- ssef v = 2.0f * select(h < 4, y, x);
+ int4 h = hash & 7;
+ float4 u = select(h < 4, x, y);
+ float4 v = 2.0f * select(h < 4, y, x);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
}
@@ -310,28 +310,28 @@ ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
*/
ccl_device_noinline_cpu float perlin_2d(float x, float y)
{
- ssei XY;
- ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY);
- ssef uv = fade(fxy);
+ int4 XY;
+ float4 fxy = floorfrac(make_float4(x, y, 0.0f, 0.0f), &XY);
+ float4 uv = fade(fxy);
- ssei XY1 = XY + 1;
- ssei X = shuffle<0, 0, 0, 0>(XY, XY1);
- ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
+ int4 XY1 = XY + make_int4(1);
+ int4 X = shuffle<0, 0, 0, 0>(XY, XY1);
+ int4 Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
- ssei h = hash_ssei2(X, Y);
+ int4 h = hash_int4_2(X, Y);
- ssef fxy1 = fxy - 1.0f;
- ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
- ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
+ float4 fxy1 = fxy - make_float4(1.0f);
+ float4 fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
+ float4 fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
- ssef g = grad(h, fx, fy);
+ float4 g = grad(h, fx, fy);
return extract<0>(bi_mix(g, uv));
}
/* SSE Trilinear Interpolation:
*
- * The function takes three ssef inputs:
+ * The function takes three float4 inputs:
* - p : Contains the values at the points (v0, v1, v2, v3).
* - q : Contains the values at the points (v4, v5, v6, v7).
* - f : Contains the values (x, y, z, _). The fourth value is unused.
@@ -340,11 +340,11 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y)
* 1. Interpolate p and q along the x axis to get s (s0, s1, s2, s3).
* 2. Interpolate (s0, s1) and (s2, s3) along the y axis to get g (g0, g1).
* (s2, s3) is generated by moving v2 and v3 to the first and second
- * places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
+ * places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and
* fourth values are unused.
* 3. Interpolate g0 and g1 along the z axis to get the final value.
- * g1 is generated by populating an ssef with the second value of g.
- * Only the first value is important in the final ssef.
+ * g1 is generated by populating an float4 with the second value of g.
+ * Only the first value is important in the final float4.
*
* v3 v7
* @ + + + + + + @ s3 @
@@ -362,10 +362,10 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y)
* @ + + + + + + @ @
* v0 v4 s0
*/
-ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
+ccl_device_inline float4 tri_mix(float4 p, float4 q, float4 f)
{
- ssef s = mix(p, q, shuffle<0>(f));
- ssef g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f));
+ float4 s = mix(p, q, shuffle<0>(f));
+ float4 g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f));
return mix(g, shuffle<1>(g), shuffle<2>(f));
}
@@ -374,24 +374,24 @@ ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
* supported, we do an SSE implementation, but if it is supported,
* we do an implementation using AVX intrinsics.
*/
-# if !defined(__KERNEL_AVX__)
+# if !defined(__KERNEL_AVX2__)
-ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z)
+ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y, const float4 z)
{
- ssei h = hash & 15;
- ssef u = select(h < 8, x, y);
- ssef vt = select((h == 12) | (h == 14), x, z);
- ssef v = select(h < 4, y, vt);
+ int4 h = hash & 15;
+ float4 u = select(h < 8, x, y);
+ float4 vt = select((h == 12) | (h == 14), x, z);
+ float4 v = select(h < 4, y, vt);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
}
-ccl_device_inline ssef
-grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &w)
+ccl_device_inline float4
+grad(const int4 hash, const float4 x, const float4 y, const float4 z, const float4 w)
{
- ssei h = hash & 31;
- ssef u = select(h < 24, x, y);
- ssef v = select(h < 16, y, z);
- ssef s = select(h < 8, z, w);
+ int4 h = hash & 31;
+ float4 u = select(h < 24, x, y);
+ float4 v = select(h < 16, y, z);
+ float4 s = select(h < 8, z, w);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
}
@@ -401,7 +401,7 @@ grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &
* between two trilinear interpolations.
*
*/
-ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
+ccl_device_inline float4 quad_mix(float4 p, float4 q, float4 r, float4 s, float4 f)
{
return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f));
}
@@ -427,23 +427,23 @@ ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
*/
ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
{
- ssei XYZ;
- ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
- ssef uvw = fade(fxyz);
+ int4 XYZ;
+ float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ);
+ float4 uvw = fade(fxyz);
- ssei XYZ1 = XYZ + 1;
- ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
- ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
+ int4 XYZ1 = XYZ + make_int4(1);
+ int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
+ int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
- ssei h1 = hash_ssei3(shuffle<0>(XYZ), Y, Z);
- ssei h2 = hash_ssei3(shuffle<0>(XYZ1), Y, Z);
+ int4 h1 = hash_int4_3(shuffle<0>(XYZ), Y, Z);
+ int4 h2 = hash_int4_3(shuffle<0>(XYZ1), Y, Z);
- ssef fxyz1 = fxyz - 1.0f;
- ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
- ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
+ float4 fxyz1 = fxyz - make_float4(1.0f);
+ float4 fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
+ float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
- ssef g1 = grad(h1, shuffle<0>(fxyz), fy, fz);
- ssef g2 = grad(h2, shuffle<0>(fxyz1), fy, fz);
+ float4 g1 = grad(h1, shuffle<0>(fxyz), fy, fz);
+ float4 g2 = grad(h2, shuffle<0>(fxyz1), fy, fz);
return extract<0>(tri_mix(g1, g2, uvw));
}
@@ -481,29 +481,29 @@ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
*/
ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
- ssei XYZW;
- ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
- ssef uvws = fade(fxyzw);
+ int4 XYZW;
+ float4 fxyzw = floorfrac(make_float4(x, y, z, w), &XYZW);
+ float4 uvws = fade(fxyzw);
- ssei XYZW1 = XYZW + 1;
- ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
- ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
+ int4 XYZW1 = XYZW + make_int4(1);
+ int4 Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
+ int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
- ssei h1 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW));
- ssei h2 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW));
+ int4 h1 = hash_int4_4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW));
+ int4 h2 = hash_int4_4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW));
- ssei h3 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1));
- ssei h4 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1));
+ int4 h3 = hash_int4_4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1));
+ int4 h4 = hash_int4_4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1));
- ssef fxyzw1 = fxyzw - 1.0f;
- ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
- ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
+ float4 fxyzw1 = fxyzw - make_float4(1.0f);
+ float4 fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
+ float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
- ssef g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw));
- ssef g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw));
+ float4 g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw));
+ float4 g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw));
- ssef g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1));
- ssef g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1));
+ float4 g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1));
+ float4 g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1));
return extract<0>(quad_mix(g1, g2, g3, g4, uvws));
}
@@ -512,22 +512,22 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
/* AVX Implementation */
-ccl_device_inline avxf grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z)
+ccl_device_inline vfloat8 grad(const vint8 hash, const vfloat8 x, const vfloat8 y, const vfloat8 z)
{
- avxi h = hash & 15;
- avxf u = select(h < 8, x, y);
- avxf vt = select((h == 12) | (h == 14), x, z);
- avxf v = select(h < 4, y, vt);
+ vint8 h = hash & 15;
+ vfloat8 u = select(h < 8, x, y);
+ vfloat8 vt = select((h == 12) | (h == 14), x, z);
+ vfloat8 v = select(h < 4, y, vt);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
}
-ccl_device_inline avxf
-grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &w)
+ccl_device_inline vfloat8
+grad(const vint8 hash, const vfloat8 x, const vfloat8 y, const vfloat8 z, const vfloat8 w)
{
- avxi h = hash & 31;
- avxf u = select(h < 24, x, y);
- avxf v = select(h < 16, y, z);
- avxf s = select(h < 8, z, w);
+ vint8 h = hash & 31;
+ vfloat8 u = select(h < 24, x, y);
+ vfloat8 v = select(h < 16, y, z);
+ vfloat8 s = select(h < 8, z, w);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
}
@@ -537,13 +537,13 @@ grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &
* 1. Interpolate p and q along the w axis to get s.
* 2. Trilinearly interpolate (s0, s1, s2, s3) and (s4, s5, s6, s7) to get the final
* value. (s0, s1, s2, s3) and (s4, s5, s6, s7) are generated by extracting the
- * low and high ssef from s.
+ * low and high float4 from s.
*
*/
-ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
+ccl_device_inline float4 quad_mix(vfloat8 p, vfloat8 q, float4 f)
{
- ssef fv = shuffle<3>(f);
- avxf s = mix(p, q, avxf(fv, fv));
+ float4 fv = shuffle<3>(f);
+ vfloat8 s = mix(p, q, make_vfloat8(fv, fv));
return tri_mix(low(s), high(s), f);
}
@@ -565,25 +565,25 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
*/
ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
{
- ssei XYZ;
- ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
- ssef uvw = fade(fxyz);
+ int4 XYZ;
+ float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ);
+ float4 uvw = fade(fxyz);
- ssei XYZ1 = XYZ + 1;
- ssei X = shuffle<0>(XYZ);
- ssei X1 = shuffle<0>(XYZ1);
- ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
- ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
+ int4 XYZ1 = XYZ + make_int4(1);
+ int4 X = shuffle<0>(XYZ);
+ int4 X1 = shuffle<0>(XYZ1);
+ int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
+ int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
- avxi h = hash_avxi3(avxi(X, X1), avxi(Y, Y), avxi(Z, Z));
+ vint8 h = hash_int8_3(make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z));
- ssef fxyz1 = fxyz - 1.0f;
- ssef fx = shuffle<0>(fxyz);
- ssef fx1 = shuffle<0>(fxyz1);
- ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
- ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
+ float4 fxyz1 = fxyz - make_float4(1.0f);
+ float4 fx = shuffle<0>(fxyz);
+ float4 fx1 = shuffle<0>(fxyz1);
+ float4 fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
+ float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
- avxf g = grad(h, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz));
+ vfloat8 g = grad(h, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz));
return extract<0>(tri_mix(low(g), high(g), uvw));
}
@@ -617,31 +617,37 @@ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
*/
ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
- ssei XYZW;
- ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
- ssef uvws = fade(fxyzw);
-
- ssei XYZW1 = XYZW + 1;
- ssei X = shuffle<0>(XYZW);
- ssei X1 = shuffle<0>(XYZW1);
- ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
- ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
- ssei W = shuffle<3>(XYZW);
- ssei W1 = shuffle<3>(XYZW1);
-
- avxi h1 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W, W));
- avxi h2 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W1, W1));
-
- ssef fxyzw1 = fxyzw - 1.0f;
- ssef fx = shuffle<0>(fxyzw);
- ssef fx1 = shuffle<0>(fxyzw1);
- ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
- ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
- ssef fw = shuffle<3>(fxyzw);
- ssef fw1 = shuffle<3>(fxyzw1);
-
- avxf g1 = grad(h1, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw, fw));
- avxf g2 = grad(h2, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw1, fw1));
+ int4 XYZW;
+ float4 fxyzw = floorfrac(make_float4(x, y, z, w), &XYZW);
+ float4 uvws = fade(fxyzw);
+
+ int4 XYZW1 = XYZW + make_int4(1);
+ int4 X = shuffle<0>(XYZW);
+ int4 X1 = shuffle<0>(XYZW1);
+ int4 Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
+ int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
+ int4 W = shuffle<3>(XYZW);
+ int4 W1 = shuffle<3>(XYZW1);
+
+ vint8 h1 = hash_int8_4(make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z), make_vint8(W, W));
+ vint8 h2 = hash_int8_4(
+ make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z), make_vint8(W1, W1));
+
+ float4 fxyzw1 = fxyzw - make_float4(1.0f);
+ float4 fx = shuffle<0>(fxyzw);
+ float4 fx1 = shuffle<0>(fxyzw1);
+ float4 fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
+ float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
+ float4 fw = shuffle<3>(fxyzw);
+ float4 fw1 = shuffle<3>(fxyzw1);
+
+ vfloat8 g1 = grad(
+ h1, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz), make_vfloat8(fw, fw));
+ vfloat8 g2 = grad(h2,
+ make_vfloat8(fx, fx1),
+ make_vfloat8(fy, fy),
+ make_vfloat8(fz, fz),
+ make_vfloat8(fw1, fw1));
return extract<0>(quad_mix(g1, g2, uvws));
}
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index 24c5a6a4540..a6f8914a9b8 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -75,10 +75,14 @@ CCL_NAMESPACE_BEGIN
#define __VOLUME__
/* Device specific features */
-#ifndef __KERNEL_GPU__
-# ifdef WITH_OSL
-# define __OSL__
+#ifdef WITH_OSL
+# define __OSL__
+# ifdef __KERNEL_OPTIX__
+/* Kernels with OSL support are built separately in OptiX and don't need SVM. */
+# undef __SVM__
# endif
+#endif
+#ifndef __KERNEL_GPU__
# ifdef WITH_PATH_GUIDING
# define __PATH_GUIDING__
# endif
@@ -917,9 +921,13 @@ typedef struct ccl_align(16) ShaderData
float ray_dP;
#ifdef __OSL__
+# ifdef __KERNEL_GPU__
+ ccl_private uint8_t *osl_closure_pool;
+# else
const struct KernelGlobalsCPU *osl_globals;
const struct IntegratorStateCPU *osl_path_state;
const struct IntegratorShadowStateCPU *osl_shadow_path_state;
+# endif
#endif
/* LCG state for closures that require additional random numbers. */
@@ -1529,6 +1537,9 @@ enum KernelFeatureFlag : uint32_t {
/* Path guiding. */
KERNEL_FEATURE_PATH_GUIDING = (1U << 26U),
+
+ /* OSL. */
+ KERNEL_FEATURE_OSL = (1U << 27U),
};
/* Shader node feature mask, to specialize shader evaluation for kernels. */
diff --git a/intern/cycles/scene/osl.cpp b/intern/cycles/scene/osl.cpp
index 93839facdbe..4dc5fb4edf7 100644
--- a/intern/cycles/scene/osl.cpp
+++ b/intern/cycles/scene/osl.cpp
@@ -38,16 +38,17 @@ OSL::TextureSystem *OSLShaderManager::ts_shared = NULL;
int OSLShaderManager::ts_shared_users = 0;
thread_mutex OSLShaderManager::ts_shared_mutex;
-OSL::ShadingSystem *OSLShaderManager::ss_shared = NULL;
-OSLRenderServices *OSLShaderManager::services_shared = NULL;
+OSL::ErrorHandler OSLShaderManager::errhandler;
+map<int, OSL::ShadingSystem *> OSLShaderManager::ss_shared;
int OSLShaderManager::ss_shared_users = 0;
thread_mutex OSLShaderManager::ss_shared_mutex;
thread_mutex OSLShaderManager::ss_mutex;
+
int OSLCompiler::texture_shared_unique_id = 0;
/* Shader Manager */
-OSLShaderManager::OSLShaderManager()
+OSLShaderManager::OSLShaderManager(Device *device) : device_(device)
{
texture_system_init();
shading_system_init();
@@ -107,11 +108,12 @@ void OSLShaderManager::device_update_specific(Device *device,
device_free(device, dscene, scene);
- /* set texture system */
- scene->image_manager->set_osl_texture_system((void *)ts);
+ /* set texture system (only on CPU devices, since GPU devices cannot use OIIO) */
+ if (device->info.type == DEVICE_CPU) {
+ scene->image_manager->set_osl_texture_system((void *)ts_shared);
+ }
/* create shaders */
- OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
Shader *background_shader = scene->background->get_shader(scene);
foreach (Shader *shader, scene->shaders) {
@@ -125,22 +127,34 @@ void OSLShaderManager::device_update_specific(Device *device,
* compile shaders alternating */
thread_scoped_lock lock(ss_mutex);
- OSLCompiler compiler(this, services, ss, scene);
- compiler.background = (shader == background_shader);
- compiler.compile(og, shader);
+ device->foreach_device(
+ [this, scene, shader, background = (shader == background_shader)](Device *sub_device) {
+ OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+ OSL::ShadingSystem *ss = ss_shared[sub_device->info.type];
+
+ OSLCompiler compiler(this, ss, scene);
+ compiler.background = background;
+ compiler.compile(og, shader);
+ });
if (shader->get_use_mis() && shader->has_surface_emission)
scene->light_manager->tag_update(scene, LightManager::SHADER_COMPILED);
}
/* setup shader engine */
- og->ss = ss;
- og->ts = ts;
- og->services = services;
-
int background_id = scene->shader_manager->get_shader_id(background_shader);
- og->background_state = og->surface_state[background_id & SHADER_MASK];
- og->use = true;
+
+ device->foreach_device([background_id](Device *sub_device) {
+ OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+ OSL::ShadingSystem *ss = ss_shared[sub_device->info.type];
+
+ og->ss = ss;
+ og->ts = ts_shared;
+ og->services = static_cast<OSLRenderServices *>(ss->renderer());
+
+ og->background_state = og->surface_state[background_id & SHADER_MASK];
+ og->use = true;
+ });
foreach (Shader *shader, scene->shaders)
shader->clear_modified();
@@ -148,8 +162,12 @@ void OSLShaderManager::device_update_specific(Device *device,
update_flags = UPDATE_NONE;
/* add special builtin texture types */
- services->textures.insert(ustring("@ao"), new OSLTextureHandle(OSLTextureHandle::AO));
- services->textures.insert(ustring("@bevel"), new OSLTextureHandle(OSLTextureHandle::BEVEL));
+ for (const auto &[device_type, ss] : ss_shared) {
+ OSLRenderServices *services = static_cast<OSLRenderServices *>(ss->renderer());
+
+ services->textures.insert(ustring("@ao"), new OSLTextureHandle(OSLTextureHandle::AO));
+ services->textures.insert(ustring("@bevel"), new OSLTextureHandle(OSLTextureHandle::BEVEL));
+ }
device_update_common(device, dscene, scene, progress);
@@ -166,26 +184,35 @@ void OSLShaderManager::device_update_specific(Device *device,
* is being freed after the Session is freed.
*/
thread_scoped_lock lock(ss_shared_mutex);
- ss->optimize_all_groups();
+ for (const auto &[device_type, ss] : ss_shared) {
+ ss->optimize_all_groups();
+ }
+ }
+
+ /* load kernels */
+ if (!device->load_osl_kernels()) {
+ progress.set_error(device->error_message());
}
}
void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *scene)
{
- OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
-
device_free_common(device, dscene, scene);
/* clear shader engine */
- og->use = false;
- og->ss = NULL;
- og->ts = NULL;
-
- og->surface_state.clear();
- og->volume_state.clear();
- og->displacement_state.clear();
- og->bump_state.clear();
- og->background_state.reset();
+ device->foreach_device([](Device *sub_device) {
+ OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+
+ og->use = false;
+ og->ss = NULL;
+ og->ts = NULL;
+
+ og->surface_state.clear();
+ og->volume_state.clear();
+ og->displacement_state.clear();
+ og->bump_state.clear();
+ og->background_state.reset();
+ });
}
void OSLShaderManager::texture_system_init()
@@ -193,7 +220,7 @@ void OSLShaderManager::texture_system_init()
/* create texture system, shared between different renders to reduce memory usage */
thread_scoped_lock lock(ts_shared_mutex);
- if (ts_shared_users == 0) {
+ if (ts_shared_users++ == 0) {
ts_shared = TextureSystem::create(true);
ts_shared->attribute("automip", 1);
@@ -203,24 +230,18 @@ void OSLShaderManager::texture_system_init()
/* effectively unlimited for now, until we support proper mipmap lookups */
ts_shared->attribute("max_memory_MB", 16384);
}
-
- ts = ts_shared;
- ts_shared_users++;
}
void OSLShaderManager::texture_system_free()
{
/* shared texture system decrease users and destroy if no longer used */
thread_scoped_lock lock(ts_shared_mutex);
- ts_shared_users--;
- if (ts_shared_users == 0) {
+ if (--ts_shared_users == 0) {
ts_shared->invalidate_all(true);
OSL::TextureSystem::destroy(ts_shared);
ts_shared = NULL;
}
-
- ts = NULL;
}
void OSLShaderManager::shading_system_init()
@@ -228,101 +249,105 @@ void OSLShaderManager::shading_system_init()
/* create shading system, shared between different renders to reduce memory usage */
thread_scoped_lock lock(ss_shared_mutex);
- if (ss_shared_users == 0) {
- /* Must use aligned new due to concurrent hash map. */
- services_shared = util_aligned_new<OSLRenderServices>(ts_shared);
+ device_->foreach_device([](Device *sub_device) {
+ const DeviceType device_type = sub_device->info.type;
- string shader_path = path_get("shader");
+ if (ss_shared_users++ == 0 || ss_shared.find(device_type) == ss_shared.end()) {
+ /* Must use aligned new due to concurrent hash map. */
+ OSLRenderServices *services = util_aligned_new<OSLRenderServices>(ts_shared, device_type);
+
+ string shader_path = path_get("shader");
# ifdef _WIN32
- /* Annoying thing, Cycles stores paths in UTF-8 codepage, so it can
- * operate with file paths with any character. This requires to use wide
- * char functions, but OSL uses old fashioned ANSI functions which means:
- *
- * - We have to convert our paths to ANSI before passing to OSL
- * - OSL can't be used when there's a multi-byte character in the path
- * to the shaders folder.
- */
- shader_path = string_to_ansi(shader_path);
+ /* Annoying thing, Cycles stores paths in UTF-8 codepage, so it can
+ * operate with file paths with any character. This requires to use wide
+ * char functions, but OSL uses old fashioned ANSI functions which means:
+ *
+ * - We have to convert our paths to ANSI before passing to OSL
+ * - OSL can't be used when there's a multi-byte character in the path
+ * to the shaders folder.
+ */
+ shader_path = string_to_ansi(shader_path);
# endif
- ss_shared = new OSL::ShadingSystem(services_shared, ts_shared, &errhandler);
- ss_shared->attribute("lockgeom", 1);
- ss_shared->attribute("commonspace", "world");
- ss_shared->attribute("searchpath:shader", shader_path);
- ss_shared->attribute("greedyjit", 1);
-
- VLOG_INFO << "Using shader search path: " << shader_path;
-
- /* our own ray types */
- static const char *raytypes[] = {
- "camera", /* PATH_RAY_CAMERA */
- "reflection", /* PATH_RAY_REFLECT */
- "refraction", /* PATH_RAY_TRANSMIT */
- "diffuse", /* PATH_RAY_DIFFUSE */
- "glossy", /* PATH_RAY_GLOSSY */
- "singular", /* PATH_RAY_SINGULAR */
- "transparent", /* PATH_RAY_TRANSPARENT */
- "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
-
- "shadow", /* PATH_RAY_SHADOW_OPAQUE */
- "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */
-
- "__unused__", /* PATH_RAY_NODE_UNALIGNED */
- "__unused__", /* PATH_RAY_MIS_SKIP */
-
- "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
-
- /* Remaining irrelevant bits up to 32. */
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- "__unused__",
- };
-
- const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]);
- ss_shared->attribute("raytypes", TypeDesc(TypeDesc::STRING, nraytypes), raytypes);
-
- OSLRenderServices::register_closures(ss_shared);
-
- loaded_shaders.clear();
- }
+ OSL::ShadingSystem *ss = new OSL::ShadingSystem(services, ts_shared, &errhandler);
+ ss->attribute("lockgeom", 1);
+ ss->attribute("commonspace", "world");
+ ss->attribute("searchpath:shader", shader_path);
+ ss->attribute("greedyjit", 1);
+
+ VLOG_INFO << "Using shader search path: " << shader_path;
+
+ /* our own ray types */
+ static const char *raytypes[] = {
+ "camera", /* PATH_RAY_CAMERA */
+ "reflection", /* PATH_RAY_REFLECT */
+ "refraction", /* PATH_RAY_TRANSMIT */
+ "diffuse", /* PATH_RAY_DIFFUSE */
+ "glossy", /* PATH_RAY_GLOSSY */
+ "singular", /* PATH_RAY_SINGULAR */
+ "transparent", /* PATH_RAY_TRANSPARENT */
+ "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
+
+ "shadow", /* PATH_RAY_SHADOW_OPAQUE */
+ "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */
+
+ "__unused__", /* PATH_RAY_NODE_UNALIGNED */
+ "__unused__", /* PATH_RAY_MIS_SKIP */
+
+ "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
+
+ /* Remaining irrelevant bits up to 32. */
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ "__unused__",
+ };
+
+ const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]);
+ ss->attribute("raytypes", TypeDesc(TypeDesc::STRING, nraytypes), raytypes);
+
+ OSLRenderServices::register_closures(ss);
+
+ ss_shared[device_type] = ss;
+ }
+ });
- ss = ss_shared;
- services = services_shared;
- ss_shared_users++;
+ loaded_shaders.clear();
}
void OSLShaderManager::shading_system_free()
{
/* shared shading system decrease users and destroy if no longer used */
thread_scoped_lock lock(ss_shared_mutex);
- ss_shared_users--;
- if (ss_shared_users == 0) {
- delete ss_shared;
- ss_shared = NULL;
+ device_->foreach_device([](Device * /*sub_device*/) {
+ if (--ss_shared_users == 0) {
+ for (const auto &[device_type, ss] : ss_shared) {
+ OSLRenderServices *services = static_cast<OSLRenderServices *>(ss->renderer());
- util_aligned_delete(services_shared);
- services_shared = NULL;
- }
+ delete ss;
+
+ util_aligned_delete(services);
+ }
- ss = NULL;
- services = NULL;
+ ss_shared.clear();
+ }
+ });
}
bool OSLShaderManager::osl_compile(const string &inputfile, const string &outputfile)
@@ -447,7 +472,9 @@ const char *OSLShaderManager::shader_load_filepath(string filepath)
const char *OSLShaderManager::shader_load_bytecode(const string &hash, const string &bytecode)
{
- ss->LoadMemoryCompiledShader(hash.c_str(), bytecode.c_str());
+ for (const auto &[device_type, ss] : ss_shared) {
+ ss->LoadMemoryCompiledShader(hash.c_str(), bytecode.c_str());
+ }
OSLShaderInfo info;
@@ -599,11 +626,11 @@ OSLNode *OSLShaderManager::osl_node(ShaderGraph *graph,
/* Graph Compiler */
-OSLCompiler::OSLCompiler(OSLShaderManager *manager,
- OSLRenderServices *services,
- OSL::ShadingSystem *ss,
- Scene *scene)
- : scene(scene), manager(manager), services(services), ss(ss)
+OSLCompiler::OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *ss, Scene *scene)
+ : scene(scene),
+ manager(manager),
+ services(static_cast<OSLRenderServices *>(ss->renderer())),
+ ss(ss)
{
current_type = SHADER_TYPE_SURFACE;
current_shader = NULL;
@@ -614,6 +641,8 @@ string OSLCompiler::id(ShaderNode *node)
{
/* assign layer unique name based on pointer address + bump mode */
stringstream stream;
+ stream.imbue(std::locale("C")); /* Ensure that no grouping characters (e.g. commas with en_US
+ locale) are added to the pointer string */
stream << "node_" << node->type->name << "_" << node;
return stream.str();
@@ -1105,7 +1134,12 @@ OSL::ShaderGroupRef OSLCompiler::compile_type(Shader *shader, ShaderGraph *graph
{
current_type = type;
- OSL::ShaderGroupRef group = ss->ShaderGroupBegin(shader->name.c_str());
+ /* Use name hash to identify shader group to avoid issues with non-alphanumeric characters */
+ stringstream name;
+ name.imbue(std::locale("C"));
+ name << "shader_" << shader->name.hash();
+
+ OSL::ShaderGroupRef group = ss->ShaderGroupBegin(name.str());
ShaderNode *output = graph->output();
ShaderNodeSet dependencies;
diff --git a/intern/cycles/scene/osl.h b/intern/cycles/scene/osl.h
index 76c6bd96ce1..c0e82a9dc8d 100644
--- a/intern/cycles/scene/osl.h
+++ b/intern/cycles/scene/osl.h
@@ -54,7 +54,7 @@ struct OSLShaderInfo {
class OSLShaderManager : public ShaderManager {
public:
- OSLShaderManager();
+ OSLShaderManager(Device *device);
~OSLShaderManager();
static void free_memory();
@@ -92,25 +92,22 @@ class OSLShaderManager : public ShaderManager {
const std::string &bytecode_hash = "",
const std::string &bytecode = "");
- protected:
+ private:
void texture_system_init();
void texture_system_free();
void shading_system_init();
void shading_system_free();
- OSL::ShadingSystem *ss;
- OSL::TextureSystem *ts;
- OSLRenderServices *services;
- OSL::ErrorHandler errhandler;
+ Device *device_;
map<string, OSLShaderInfo> loaded_shaders;
static OSL::TextureSystem *ts_shared;
static thread_mutex ts_shared_mutex;
static int ts_shared_users;
- static OSL::ShadingSystem *ss_shared;
- static OSLRenderServices *services_shared;
+ static OSL::ErrorHandler errhandler;
+ static map<int, OSL::ShadingSystem *> ss_shared;
static thread_mutex ss_shared_mutex;
static thread_mutex ss_mutex;
static int ss_shared_users;
@@ -123,10 +120,7 @@ class OSLShaderManager : public ShaderManager {
class OSLCompiler {
public:
#ifdef WITH_OSL
- OSLCompiler(OSLShaderManager *manager,
- OSLRenderServices *services,
- OSL::ShadingSystem *shadingsys,
- Scene *scene);
+ OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *shadingsys, Scene *scene);
#endif
void compile(OSLGlobals *og, Shader *shader);
diff --git a/intern/cycles/scene/scene.cpp b/intern/cycles/scene/scene.cpp
index 3a05bede7a3..d5be86e1db9 100644
--- a/intern/cycles/scene/scene.cpp
+++ b/intern/cycles/scene/scene.cpp
@@ -99,11 +99,8 @@ Scene::Scene(const SceneParams &params_, Device *device)
{
memset((void *)&dscene.data, 0, sizeof(dscene.data));
- /* OSL only works on the CPU */
- if (device->info.has_osl)
- shader_manager = ShaderManager::create(params.shadingsystem);
- else
- shader_manager = ShaderManager::create(SHADINGSYSTEM_SVM);
+ shader_manager = ShaderManager::create(
+ device->info.has_osl ? params.shadingsystem : SHADINGSYSTEM_SVM, device);
light_manager = new LightManager();
geometry_manager = new GeometryManager();
diff --git a/intern/cycles/scene/shader.cpp b/intern/cycles/scene/shader.cpp
index 56670c6e4e3..f176c19ec95 100644
--- a/intern/cycles/scene/shader.cpp
+++ b/intern/cycles/scene/shader.cpp
@@ -395,15 +395,16 @@ ShaderManager::~ShaderManager()
{
}
-ShaderManager *ShaderManager::create(int shadingsystem)
+ShaderManager *ShaderManager::create(int shadingsystem, Device *device)
{
ShaderManager *manager;
(void)shadingsystem; /* Ignored when built without OSL. */
+ (void)device;
#ifdef WITH_OSL
if (shadingsystem == SHADINGSYSTEM_OSL) {
- manager = new OSLShaderManager();
+ manager = new OSLShaderManager(device);
}
else
#endif
@@ -722,6 +723,10 @@ uint ShaderManager::get_kernel_features(Scene *scene)
}
}
+ if (use_osl()) {
+ kernel_features |= KERNEL_FEATURE_OSL;
+ }
+
return kernel_features;
}
diff --git a/intern/cycles/scene/shader.h b/intern/cycles/scene/shader.h
index 2670776aca4..69b22d2ad19 100644
--- a/intern/cycles/scene/shader.h
+++ b/intern/cycles/scene/shader.h
@@ -170,7 +170,7 @@ class ShaderManager {
UPDATE_NONE = 0u,
};
- static ShaderManager *create(int shadingsystem);
+ static ShaderManager *create(int shadingsystem, Device *device);
virtual ~ShaderManager();
virtual void reset(Scene *scene) = 0;
diff --git a/intern/cycles/scene/shader_nodes.cpp b/intern/cycles/scene/shader_nodes.cpp
index a9cd453947b..2c1cd3ee737 100644
--- a/intern/cycles/scene/shader_nodes.cpp
+++ b/intern/cycles/scene/shader_nodes.cpp
@@ -3677,9 +3677,6 @@ NODE_DEFINE(GeometryNode)
{
NodeType *type = NodeType::add("geometry", create, NodeType::SHADER);
- SOCKET_IN_NORMAL(
- normal_osl, "NormalIn", zero_float3(), SocketType::LINK_NORMAL | SocketType::OSL_INTERNAL);
-
SOCKET_OUT_POINT(position, "Position");
SOCKET_OUT_NORMAL(normal, "Normal");
SOCKET_OUT_NORMAL(tangent, "Tangent");
@@ -3812,9 +3809,6 @@ NODE_DEFINE(TextureCoordinateNode)
SOCKET_BOOLEAN(use_transform, "Use Transform", false);
SOCKET_TRANSFORM(ob_tfm, "Object Transform", transform_identity());
- SOCKET_IN_NORMAL(
- normal_osl, "NormalIn", zero_float3(), SocketType::LINK_NORMAL | SocketType::OSL_INTERNAL);
-
SOCKET_OUT_POINT(generated, "Generated");
SOCKET_OUT_NORMAL(normal, "Normal");
SOCKET_OUT_POINT(UV, "UV");
@@ -7305,8 +7299,6 @@ NODE_DEFINE(NormalMapNode)
SOCKET_STRING(attribute, "Attribute", ustring());
- SOCKET_IN_NORMAL(
- normal_osl, "NormalIn", zero_float3(), SocketType::LINK_NORMAL | SocketType::OSL_INTERNAL);
SOCKET_IN_FLOAT(strength, "Strength", 1.0f);
SOCKET_IN_COLOR(color, "Color", make_float3(0.5f, 0.5f, 1.0f));
@@ -7400,8 +7392,6 @@ NODE_DEFINE(TangentNode)
SOCKET_STRING(attribute, "Attribute", ustring());
- SOCKET_IN_NORMAL(
- normal_osl, "NormalIn", zero_float3(), SocketType::LINK_NORMAL | SocketType::OSL_INTERNAL);
SOCKET_OUT_NORMAL(tangent, "Tangent");
return type;
diff --git a/intern/cycles/scene/shader_nodes.h b/intern/cycles/scene/shader_nodes.h
index cc3a71a0697..9b4d27b5b31 100644
--- a/intern/cycles/scene/shader_nodes.h
+++ b/intern/cycles/scene/shader_nodes.h
@@ -907,8 +907,6 @@ class GeometryNode : public ShaderNode {
return true;
}
int get_group();
-
- NODE_SOCKET_API(float3, normal_osl)
};
class TextureCoordinateNode : public ShaderNode {
@@ -924,7 +922,6 @@ class TextureCoordinateNode : public ShaderNode {
return true;
}
- NODE_SOCKET_API(float3, normal_osl)
NODE_SOCKET_API(bool, from_dupli)
NODE_SOCKET_API(bool, use_transform)
NODE_SOCKET_API(Transform, ob_tfm)
@@ -1542,6 +1539,10 @@ class OSLNode final : public ShaderNode {
{
return true;
}
+ virtual int get_feature()
+ {
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_RAYTRACE;
+ }
virtual bool equals(const ShaderNode & /*other*/)
{
@@ -1569,7 +1570,6 @@ class NormalMapNode : public ShaderNode {
NODE_SOCKET_API(ustring, attribute)
NODE_SOCKET_API(float, strength)
NODE_SOCKET_API(float3, color)
- NODE_SOCKET_API(float3, normal_osl)
};
class TangentNode : public ShaderNode {
@@ -1588,7 +1588,6 @@ class TangentNode : public ShaderNode {
NODE_SOCKET_API(NodeTangentDirectionType, direction_type)
NODE_SOCKET_API(NodeTangentAxis, axis)
NODE_SOCKET_API(ustring, attribute)
- NODE_SOCKET_API(float3, normal_osl)
};
class BevelNode : public ShaderNode {
diff --git a/intern/cycles/session/merge.h b/intern/cycles/session/merge.h
index 702ca5c3eb8..d8a4f04a27b 100644
--- a/intern/cycles/session/merge.h
+++ b/intern/cycles/session/merge.h
@@ -9,7 +9,7 @@
CCL_NAMESPACE_BEGIN
-/* Merge OpenEXR multilayer renders. */
+/* Merge OpenEXR multi-layer renders. */
class ImageMerger {
public:
diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt
index c3ae81ed1db..34e5a4770ea 100644
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -45,17 +45,24 @@ set(SRC
# Disable AVX tests on macOS. Rosetta has problems running them, and other
# platforms should be enough to verify AVX operations are implemented correctly.
if(NOT APPLE)
+ if(CXX_HAS_SSE)
+ list(APPEND SRC
+ util_float8_sse2_test.cpp
+ )
+ set_source_files_properties(util_float8_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+ endif()
+
if(CXX_HAS_AVX)
list(APPEND SRC
- util_avxf_avx_test.cpp
+ util_float8_avx_test.cpp
)
- set_source_files_properties(util_avxf_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+ set_source_files_properties(util_float8_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
list(APPEND SRC
- util_avxf_avx2_test.cpp
+ util_float8_avx2_test.cpp
)
- set_source_files_properties(util_avxf_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+ set_source_files_properties(util_float8_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
endif()
diff --git a/intern/cycles/test/util_avxf_test.h b/intern/cycles/test/util_avxf_test.h
deleted file mode 100644
index 34d966cc1a4..00000000000
--- a/intern/cycles/test/util_avxf_test.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-#include "testing/testing.h"
-#include "util/system.h"
-#include "util/types.h"
-
-CCL_NAMESPACE_BEGIN
-
-static bool validate_cpu_capabilities()
-{
-
-#ifdef __KERNEL_AVX2__
- return system_cpu_support_avx2();
-#else
-# ifdef __KERNEL_AVX__
- return system_cpu_support_avx();
-# endif
-#endif
-}
-
-#define INIT_AVX_TEST \
- if (!validate_cpu_capabilities()) \
- return; \
-\
- const avxf avxf_a(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f); \
- const avxf avxf_b(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); \
- const avxf avxf_c(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
-
-#define compare_vector_scalar(a, b) \
- for (size_t index = 0; index < a.size; index++) \
- EXPECT_FLOAT_EQ(a[index], b);
-
-#define compare_vector_vector(a, b) \
- for (size_t index = 0; index < a.size; index++) \
- EXPECT_FLOAT_EQ(a[index], b[index]);
-
-#define compare_vector_vector_near(a, b, abserror) \
- for (size_t index = 0; index < a.size; index++) \
- EXPECT_NEAR(a[index], b[index], abserror);
-
-#define basic_test_vv(a, b, op) \
- INIT_AVX_TEST \
- avxf c = a op b; \
- for (size_t i = 0; i < a.size; i++) \
- EXPECT_FLOAT_EQ(c[i], a[i] op b[i]);
-
-/* vector op float tests */
-#define basic_test_vf(a, b, op) \
- INIT_AVX_TEST \
- avxf c = a op b; \
- for (size_t i = 0; i < a.size; i++) \
- EXPECT_FLOAT_EQ(c[i], a[i] op b);
-
-static const float float_b = 1.5f;
-
-TEST(TEST_CATEGORY_NAME, avxf_add_vv){basic_test_vv(avxf_a, avxf_b, +)} TEST(TEST_CATEGORY_NAME,
- avxf_sub_vv){
- basic_test_vv(avxf_a, avxf_b, -)} TEST(TEST_CATEGORY_NAME, avxf_mul_vv){
- basic_test_vv(avxf_a, avxf_b, *)} TEST(TEST_CATEGORY_NAME, avxf_div_vv){
- basic_test_vv(avxf_a, avxf_b, /)} TEST(TEST_CATEGORY_NAME, avxf_add_vf){
- basic_test_vf(avxf_a, float_b, +)} TEST(TEST_CATEGORY_NAME, avxf_sub_vf){
- basic_test_vf(avxf_a, float_b, -)} TEST(TEST_CATEGORY_NAME, avxf_mul_vf){
- basic_test_vf(avxf_a, float_b, *)} TEST(TEST_CATEGORY_NAME,
- avxf_div_vf){basic_test_vf(avxf_a, float_b, /)}
-
-TEST(TEST_CATEGORY_NAME, avxf_ctor)
-{
- INIT_AVX_TEST
- compare_vector_scalar(avxf(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f),
- static_cast<float>(index));
- compare_vector_scalar(avxf(1.0f), 1.0f);
- compare_vector_vector(avxf(1.0f, 2.0f), avxf(1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f));
- compare_vector_vector(avxf(1.0f, 2.0f, 3.0f, 4.0f),
- avxf(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f));
- compare_vector_vector(avxf(make_float3(1.0f, 2.0f, 3.0f)),
- avxf(0.0f, 3.0f, 2.0f, 1.0f, 0.0f, 3.0f, 2.0f, 1.0f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_sqrt)
-{
- INIT_AVX_TEST
- compare_vector_vector(mm256_sqrt(avxf(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f)),
- avxf(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_min_max)
-{
- INIT_AVX_TEST
- compare_vector_vector(min(avxf_a, avxf_b), avxf_a);
- compare_vector_vector(max(avxf_a, avxf_b), avxf_b);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_set_sign)
-{
- INIT_AVX_TEST
- avxf res = set_sign_bit<1, 0, 0, 0, 0, 0, 0, 0>(avxf_a);
- compare_vector_vector(res, avxf(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, -0.8f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_msub)
-{
- INIT_AVX_TEST
- avxf res = msub(avxf_a, avxf_b, avxf_c);
- avxf exp = avxf((avxf_a[7] * avxf_b[7]) - avxf_c[7],
- (avxf_a[6] * avxf_b[6]) - avxf_c[6],
- (avxf_a[5] * avxf_b[5]) - avxf_c[5],
- (avxf_a[4] * avxf_b[4]) - avxf_c[4],
- (avxf_a[3] * avxf_b[3]) - avxf_c[3],
- (avxf_a[2] * avxf_b[2]) - avxf_c[2],
- (avxf_a[1] * avxf_b[1]) - avxf_c[1],
- (avxf_a[0] * avxf_b[0]) - avxf_c[0]);
- compare_vector_vector(res, exp);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_madd)
-{
- INIT_AVX_TEST
- avxf res = madd(avxf_a, avxf_b, avxf_c);
- avxf exp = avxf((avxf_a[7] * avxf_b[7]) + avxf_c[7],
- (avxf_a[6] * avxf_b[6]) + avxf_c[6],
- (avxf_a[5] * avxf_b[5]) + avxf_c[5],
- (avxf_a[4] * avxf_b[4]) + avxf_c[4],
- (avxf_a[3] * avxf_b[3]) + avxf_c[3],
- (avxf_a[2] * avxf_b[2]) + avxf_c[2],
- (avxf_a[1] * avxf_b[1]) + avxf_c[1],
- (avxf_a[0] * avxf_b[0]) + avxf_c[0]);
- compare_vector_vector(res, exp);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_nmadd)
-{
- INIT_AVX_TEST
- avxf res = nmadd(avxf_a, avxf_b, avxf_c);
- avxf exp = avxf(avxf_c[7] - (avxf_a[7] * avxf_b[7]),
- avxf_c[6] - (avxf_a[6] * avxf_b[6]),
- avxf_c[5] - (avxf_a[5] * avxf_b[5]),
- avxf_c[4] - (avxf_a[4] * avxf_b[4]),
- avxf_c[3] - (avxf_a[3] * avxf_b[3]),
- avxf_c[2] - (avxf_a[2] * avxf_b[2]),
- avxf_c[1] - (avxf_a[1] * avxf_b[1]),
- avxf_c[0] - (avxf_a[0] * avxf_b[0]));
- compare_vector_vector(res, exp);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_compare)
-{
- INIT_AVX_TEST
- avxf a(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);
- avxf b(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
- avxb res = a <= b;
- int exp[8] = {
- a[0] <= b[0] ? -1 : 0,
- a[1] <= b[1] ? -1 : 0,
- a[2] <= b[2] ? -1 : 0,
- a[3] <= b[3] ? -1 : 0,
- a[4] <= b[4] ? -1 : 0,
- a[5] <= b[5] ? -1 : 0,
- a[6] <= b[6] ? -1 : 0,
- a[7] <= b[7] ? -1 : 0,
- };
- compare_vector_vector(res, exp);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_permute)
-{
- INIT_AVX_TEST
- avxf res = permute<3, 0, 1, 7, 6, 5, 2, 4>(avxf_b);
- compare_vector_vector(res, avxf(4.0f, 6.0f, 3.0f, 2.0f, 1.0f, 7.0f, 8.0f, 5.0f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_blend)
-{
- INIT_AVX_TEST
- avxf res = blend<0, 0, 1, 0, 1, 0, 1, 0>(avxf_a, avxf_b);
- compare_vector_vector(res, avxf(0.1f, 0.2f, 3.0f, 0.4f, 5.0f, 0.6f, 7.0f, 0.8f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_shuffle)
-{
- INIT_AVX_TEST
- avxf res = shuffle<0, 1, 2, 3, 1, 3, 2, 0>(avxf_a);
- compare_vector_vector(res, avxf(0.4f, 0.2f, 0.1f, 0.3f, 0.5f, 0.6f, 0.7f, 0.8f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_cross)
-{
- INIT_AVX_TEST
- avxf res = cross(avxf_b, avxf_c);
- compare_vector_vector_near(res,
- avxf(0.0f,
- -9.5367432e-07f,
- 0.0f,
- 4.7683716e-07f,
- 0.0f,
- -3.8146973e-06f,
- 3.8146973e-06f,
- 3.8146973e-06f),
- 0.000002000f);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_dot3)
-{
- INIT_AVX_TEST
- float den, den2;
- dot3(avxf_a, avxf_b, den, den2);
- EXPECT_FLOAT_EQ(den, 14.9f);
- EXPECT_FLOAT_EQ(den2, 2.9f);
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/test/util_avxf_avx2_test.cpp b/intern/cycles/test/util_float8_avx2_test.cpp
index 992c4d9a913..4682dce5b23 100644
--- a/intern/cycles/test/util_avxf_avx2_test.cpp
+++ b/intern/cycles/test/util_float8_avx2_test.cpp
@@ -1,11 +1,13 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
+#define __KERNEL_SSE__
+#define __KERNEL_AVX__
#define __KERNEL_AVX2__
#define TEST_CATEGORY_NAME util_avx2
#if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \
defined(__AVX2__)
-# include "util_avxf_test.h"
+# include "util_float8_test.h"
#endif
diff --git a/intern/cycles/test/util_avxf_avx_test.cpp b/intern/cycles/test/util_float8_avx_test.cpp
index abb98cdfb38..34fe750e766 100644
--- a/intern/cycles/test/util_avxf_avx_test.cpp
+++ b/intern/cycles/test/util_float8_avx_test.cpp
@@ -1,11 +1,12 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
+#define __KERNEL_SSE__
#define __KERNEL_AVX__
#define TEST_CATEGORY_NAME util_avx
#if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \
defined(__AVX__)
-# include "util_avxf_test.h"
+# include "util_float8_test.h"
#endif
diff --git a/intern/cycles/test/util_float8_sse2_test.cpp b/intern/cycles/test/util_float8_sse2_test.cpp
new file mode 100644
index 00000000000..ba8952a2b08
--- /dev/null
+++ b/intern/cycles/test/util_float8_sse2_test.cpp
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#define __KERNEL_SSE__
+#define __KERNEL_SSE2__
+
+#define TEST_CATEGORY_NAME util_sse2
+
+#if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \
+ defined(__SSE2__)
+# include "util_float8_test.h"
+#endif
diff --git a/intern/cycles/test/util_float8_test.h b/intern/cycles/test/util_float8_test.h
new file mode 100644
index 00000000000..54701afaf8b
--- /dev/null
+++ b/intern/cycles/test/util_float8_test.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "testing/testing.h"
+#include "util/math.h"
+#include "util/system.h"
+#include "util/types.h"
+
+CCL_NAMESPACE_BEGIN
+
+static bool validate_cpu_capabilities()
+{
+
+#if defined(__KERNEL_AVX2__)
+ return system_cpu_support_avx2();
+#elif defined(__KERNEL_AVX__)
+ return system_cpu_support_avx();
+#elif defined(__KERNEL_SSE2__)
+ return system_cpu_support_sse2();
+#else
+ return false;
+#endif
+}
+
+#define INIT_FLOAT8_TEST \
+ if (!validate_cpu_capabilities()) \
+ return; \
+\
+ const vfloat8 float8_a = make_vfloat8(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f); \
+ const vfloat8 float8_b = make_vfloat8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); \
+ const vfloat8 float8_c = make_vfloat8(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
+
+#define compare_vector_scalar(a, b) \
+ for (size_t index = 0; index < 8; index++) \
+ EXPECT_FLOAT_EQ(a[index], b);
+
+#define compare_vector_vector(a, b) \
+ for (size_t index = 0; index < 8; index++) \
+ EXPECT_FLOAT_EQ(a[index], b[index]);
+
+#define compare_vector_vector_near(a, b, abserror) \
+ for (size_t index = 0; index < 8; index++) \
+ EXPECT_NEAR(a[index], b[index], abserror);
+
+#define basic_test_vv(a, b, op) \
+ INIT_FLOAT8_TEST \
+ vfloat8 c = a op b; \
+ for (size_t i = 0; i < 8; i++) \
+ EXPECT_FLOAT_EQ(c[i], a[i] op b[i]);
+
+/* vector op float tests */
+#define basic_test_vf(a, b, op) \
+ INIT_FLOAT8_TEST \
+ vfloat8 c = a op b; \
+ for (size_t i = 0; i < 8; i++) \
+ EXPECT_FLOAT_EQ(c[i], a[i] op b);
+
+static const float float_b = 1.5f;
+
+TEST(TEST_CATEGORY_NAME,
+ float8_add_vv){basic_test_vv(float8_a, float8_b, +)} TEST(TEST_CATEGORY_NAME, float8_sub_vv){
+ basic_test_vv(float8_a, float8_b, -)} TEST(TEST_CATEGORY_NAME, float8_mul_vv){
+ basic_test_vv(float8_a, float8_b, *)} TEST(TEST_CATEGORY_NAME, float8_div_vv){
+ basic_test_vv(float8_a, float8_b, /)} TEST(TEST_CATEGORY_NAME, float8_add_vf){
+ basic_test_vf(float8_a, float_b, +)} TEST(TEST_CATEGORY_NAME, float8_sub_vf){
+ basic_test_vf(float8_a, float_b, -)} TEST(TEST_CATEGORY_NAME, float8_mul_vf){
+ basic_test_vf(float8_a, float_b, *)} TEST(TEST_CATEGORY_NAME,
+ float8_div_vf){basic_test_vf(float8_a, float_b, /)}
+
+TEST(TEST_CATEGORY_NAME, float8_ctor)
+{
+ INIT_FLOAT8_TEST
+ compare_vector_scalar(make_vfloat8(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f),
+ static_cast<float>(index));
+ compare_vector_scalar(make_vfloat8(1.0f), 1.0f);
+}
+
+TEST(TEST_CATEGORY_NAME, float8_sqrt)
+{
+ INIT_FLOAT8_TEST
+ compare_vector_vector(sqrt(make_vfloat8(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f)),
+ make_vfloat8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f));
+}
+
+TEST(TEST_CATEGORY_NAME, float8_min_max)
+{
+ INIT_FLOAT8_TEST
+ compare_vector_vector(min(float8_a, float8_b), float8_a);
+ compare_vector_vector(max(float8_a, float8_b), float8_b);
+}
+
+TEST(TEST_CATEGORY_NAME, float8_shuffle)
+{
+ INIT_FLOAT8_TEST
+ vfloat8 res0 = shuffle<0, 1, 2, 3, 1, 3, 2, 0>(float8_a);
+ compare_vector_vector(res0, make_vfloat8(0.1f, 0.2f, 0.3f, 0.4f, 0.6f, 0.8f, 0.7f, 0.5f));
+ vfloat8 res1 = shuffle<3>(float8_a);
+ compare_vector_vector(res1, make_vfloat8(0.4f, 0.4f, 0.4f, 0.4f, 0.8f, 0.8f, 0.8f, 0.8f));
+ vfloat8 res2 = shuffle<3, 2, 1, 0>(float8_a, float8_b);
+ compare_vector_vector(res2, make_vfloat8(0.4f, 0.3f, 2.0f, 1.0f, 0.8f, 0.7f, 6.0f, 5.0f));
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 57628f99e35..7f8f4a5ce76 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -69,6 +69,7 @@ set(SRC_HEADERS
math_int2.h
math_int3.h
math_int4.h
+ math_int8.h
math_matrix.h
md5.h
murmurhash.h
@@ -85,13 +86,7 @@ set(SRC_HEADERS
rect.h
set.h
simd.h
- avxf.h
- avxb.h
- avxi.h
semaphore.h
- sseb.h
- ssef.h
- ssei.h
stack_allocator.h
static_assert.h
stats.h
@@ -118,6 +113,8 @@ set(SRC_HEADERS
types_int3_impl.h
types_int4.h
types_int4_impl.h
+ types_int8.h
+ types_int8_impl.h
types_spectrum.h
types_uchar2.h
types_uchar2_impl.h
diff --git a/intern/cycles/util/avxb.h b/intern/cycles/util/avxb.h
deleted file mode 100644
index fa3cb565309..00000000000
--- a/intern/cycles/util/avxb.h
+++ /dev/null
@@ -1,230 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2013 Intel Corporation
- * Modifications Copyright 2014-2022 Blender Foundation. */
-
-#ifndef __UTIL_AVXB_H__
-#define __UTIL_AVXB_H__
-
-CCL_NAMESPACE_BEGIN
-
-struct avxf;
-
-/*! 4-wide SSE bool type. */
-struct avxb {
- typedef avxb Mask; // mask type
- typedef avxf Float; // float type
-
- enum { size = 8 }; // number of SIMD elements
- union {
- __m256 m256;
- int32_t v[8];
- }; // data
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline avxb()
- {
- }
- __forceinline avxb(const avxb &other)
- {
- m256 = other.m256;
- }
- __forceinline avxb &operator=(const avxb &other)
- {
- m256 = other.m256;
- return *this;
- }
-
- __forceinline avxb(const __m256 input) : m256(input)
- {
- }
- __forceinline avxb(const __m128 &a, const __m128 &b)
- : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1))
- {
- }
- __forceinline operator const __m256 &(void) const
- {
- return m256;
- }
- __forceinline operator const __m256i(void) const
- {
- return _mm256_castps_si256(m256);
- }
- __forceinline operator const __m256d(void) const
- {
- return _mm256_castps_pd(m256);
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constants
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline avxb(FalseTy) : m256(_mm256_setzero_ps())
- {
- }
- __forceinline avxb(TrueTy) : m256(_mm256_castsi256_ps(_mm256_set1_epi32(-1)))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline bool operator[](const size_t i) const
- {
- assert(i < 8);
- return (_mm256_movemask_ps(m256) >> i) & 1;
- }
- __forceinline int32_t &operator[](const size_t i)
- {
- assert(i < 8);
- return v[i];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator!(const avxb &a)
-{
- return _mm256_xor_ps(a, avxb(True));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator&(const avxb &a, const avxb &b)
-{
- return _mm256_and_ps(a, b);
-}
-__forceinline const avxb operator|(const avxb &a, const avxb &b)
-{
- return _mm256_or_ps(a, b);
-}
-__forceinline const avxb operator^(const avxb &a, const avxb &b)
-{
- return _mm256_xor_ps(a, b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator&=(avxb &a, const avxb &b)
-{
- return a = a & b;
-}
-__forceinline const avxb operator|=(avxb &a, const avxb &b)
-{
- return a = a | b;
-}
-__forceinline const avxb operator^=(avxb &a, const avxb &b)
-{
- return a = a ^ b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator!=(const avxb &a, const avxb &b)
-{
- return _mm256_xor_ps(a, b);
-}
-__forceinline const avxb operator==(const avxb &a, const avxb &b)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b));
-#else
- __m128i a_lo = _mm_castps_si128(_mm256_extractf128_ps(a, 0));
- __m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1));
- __m128i b_lo = _mm_castps_si128(_mm256_extractf128_ps(b, 0));
- __m128i b_hi = _mm_castps_si128(_mm256_extractf128_ps(b, 1));
- __m128i c_lo = _mm_cmpeq_epi32(a_lo, b_lo);
- __m128i c_hi = _mm_cmpeq_epi32(a_hi, b_hi);
- __m256i result = _mm256_insertf128_si256(_mm256_castsi128_si256(c_lo), c_hi, 1);
- return _mm256_castsi256_ps(result);
-#endif
-}
-
-__forceinline const avxb select(const avxb &m, const avxb &t, const avxb &f)
-{
-#if defined(__KERNEL_SSE41__)
- return _mm256_blendv_ps(f, t, m);
-#else
- return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb unpacklo(const avxb &a, const avxb &b)
-{
- return _mm256_unpacklo_ps(a, b);
-}
-__forceinline const avxb unpackhi(const avxb &a, const avxb &b)
-{
- return _mm256_unpackhi_ps(a, b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reduction Operations
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_SSE41__)
-__forceinline uint32_t popcnt(const avxb &a)
-{
- return _mm_popcnt_u32(_mm256_movemask_ps(a));
-}
-#else
-__forceinline uint32_t popcnt(const avxb &a)
-{
- return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) +
- bool(a[7]);
-}
-#endif
-
-__forceinline bool reduce_and(const avxb &a)
-{
- return _mm256_movemask_ps(a) == 0xf;
-}
-__forceinline bool reduce_or(const avxb &a)
-{
- return _mm256_movemask_ps(a) != 0x0;
-}
-__forceinline bool all(const avxb &b)
-{
- return _mm256_movemask_ps(b) == 0xf;
-}
-__forceinline bool any(const avxb &b)
-{
- return _mm256_movemask_ps(b) != 0x0;
-}
-__forceinline bool none(const avxb &b)
-{
- return _mm256_movemask_ps(b) == 0x0;
-}
-
-__forceinline uint32_t movemask(const avxb &a)
-{
- return _mm256_movemask_ps(a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Debug Functions
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_avxb(const char *label, const avxb &a)
-{
- printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/util/avxf.h b/intern/cycles/util/avxf.h
deleted file mode 100644
index 03a13f30490..00000000000
--- a/intern/cycles/util/avxf.h
+++ /dev/null
@@ -1,379 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2016 Intel Corporation */
-
-#ifndef __UTIL_AVXF_H__
-#define __UTIL_AVXF_H__
-
-CCL_NAMESPACE_BEGIN
-
-struct avxb;
-
-struct avxf {
- typedef avxf Float;
-
- enum { size = 8 }; /* Number of SIMD elements. */
-
- union {
- __m256 m256;
- float f[8];
- int i[8];
- };
-
- __forceinline avxf()
- {
- }
- __forceinline avxf(const avxf &other)
- {
- m256 = other.m256;
- }
- __forceinline avxf &operator=(const avxf &other)
- {
- m256 = other.m256;
- return *this;
- }
-
- __forceinline avxf(const __m256 a) : m256(a)
- {
- }
- __forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps(a))
- {
- }
-
- __forceinline operator const __m256 &() const
- {
- return m256;
- }
- __forceinline operator __m256 &()
- {
- return m256;
- }
-
- __forceinline avxf(float a) : m256(_mm256_set1_ps(a))
- {
- }
-
- __forceinline avxf(float high32x4, float low32x4)
- : m256(_mm256_set_ps(
- high32x4, high32x4, high32x4, high32x4, low32x4, low32x4, low32x4, low32x4))
- {
- }
-
- __forceinline avxf(float a3, float a2, float a1, float a0)
- : m256(_mm256_set_ps(a3, a2, a1, a0, a3, a2, a1, a0))
- {
- }
-
- __forceinline avxf(
- float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0)
- : m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0))
- {
- }
-
- __forceinline avxf(float3 a) : m256(_mm256_set_ps(a.w, a.z, a.y, a.x, a.w, a.z, a.y, a.x))
- {
- }
-
- __forceinline avxf(int a3, int a2, int a1, int a0)
- {
- const __m256i foo = _mm256_set_epi32(a3, a2, a1, a0, a3, a2, a1, a0);
- m256 = _mm256_castsi256_ps(foo);
- }
-
- __forceinline avxf(int a7, int a6, int a5, int a4, int a3, int a2, int a1, int a0)
- {
- const __m256i foo = _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
- m256 = _mm256_castsi256_ps(foo);
- }
-
- __forceinline avxf(__m128 a, __m128 b)
- {
- const __m256 foo = _mm256_castps128_ps256(a);
- m256 = _mm256_insertf128_ps(foo, b, 1);
- }
-
- __forceinline const float &operator[](const size_t i) const
- {
- assert(i < 8);
- return f[i];
- }
- __forceinline float &operator[](const size_t i)
- {
- assert(i < 8);
- return f[i];
- }
-};
-
-__forceinline avxf cross(const avxf &a, const avxf &b)
-{
- avxf r(0.0,
- a[4] * b[5] - a[5] * b[4],
- a[6] * b[4] - a[4] * b[6],
- a[5] * b[6] - a[6] * b[5],
- 0.0,
- a[0] * b[1] - a[1] * b[0],
- a[2] * b[0] - a[0] * b[2],
- a[1] * b[2] - a[2] * b[1]);
- return r;
-}
-
-__forceinline void dot3(const avxf &a, const avxf &b, float &den, float &den2)
-{
- const avxf t = _mm256_mul_ps(a.m256, b.m256);
- den = ((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2];
- den2 = ((float *)&t)[4] + ((float *)&t)[5] + ((float *)&t)[6];
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxf cast(const __m256i &a)
-{
- return _mm256_castsi256_ps(a);
-}
-
-__forceinline const avxf mm256_sqrt(const avxf &a)
-{
- return _mm256_sqrt_ps(a.m256);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxf operator+(const avxf &a, const avxf &b)
-{
- return _mm256_add_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator+(const avxf &a, const float &b)
-{
- return a + avxf(b);
-}
-__forceinline const avxf operator+(const float &a, const avxf &b)
-{
- return avxf(a) + b;
-}
-
-__forceinline const avxf operator-(const avxf &a, const avxf &b)
-{
- return _mm256_sub_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator-(const avxf &a, const float &b)
-{
- return a - avxf(b);
-}
-__forceinline const avxf operator-(const float &a, const avxf &b)
-{
- return avxf(a) - b;
-}
-
-__forceinline const avxf operator*(const avxf &a, const avxf &b)
-{
- return _mm256_mul_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator*(const avxf &a, const float &b)
-{
- return a * avxf(b);
-}
-__forceinline const avxf operator*(const float &a, const avxf &b)
-{
- return avxf(a) * b;
-}
-
-__forceinline const avxf operator/(const avxf &a, const avxf &b)
-{
- return _mm256_div_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator/(const avxf &a, const float &b)
-{
- return a / avxf(b);
-}
-__forceinline const avxf operator/(const float &a, const avxf &b)
-{
- return avxf(a) / b;
-}
-
-__forceinline const avxf operator|(const avxf &a, const avxf &b)
-{
- return _mm256_or_ps(a.m256, b.m256);
-}
-
-__forceinline const avxf operator^(const avxf &a, const avxf &b)
-{
- return _mm256_xor_ps(a.m256, b.m256);
-}
-
-__forceinline const avxf operator&(const avxf &a, const avxf &b)
-{
- return _mm256_and_ps(a.m256, b.m256);
-}
-
-__forceinline const avxf max(const avxf &a, const avxf &b)
-{
- return _mm256_max_ps(a.m256, b.m256);
-}
-__forceinline const avxf min(const avxf &a, const avxf &b)
-{
- return _mm256_min_ps(a.m256, b.m256);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxf shuffle(const avxf &a, const __m256i &shuf)
-{
- return _mm256_permutevar_ps(a, shuf);
-}
-
-template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
-__forceinline const avxf shuffle(const avxf &a)
-{
- return _mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxf shuffle(const avxf &a, const avxf &b)
-{
- return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
-}
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxf shuffle(const avxf &a)
-{
- return shuffle<i0, i1, i2, i3>(a, a);
-}
-template<size_t i0> __forceinline const avxf shuffle(const avxf &a, const avxf &b)
-{
- return shuffle<i0, i0, i0, i0>(a, b);
-}
-template<size_t i0> __forceinline const avxf shuffle(const avxf &a)
-{
- return shuffle<i0>(a, a);
-}
-
-template<size_t i> __forceinline float extract(const avxf &a)
-{
- __m256 b = shuffle<i, i, i, i>(a).m256;
- return _mm256_cvtss_f32(b);
-}
-template<> __forceinline float extract<0>(const avxf &a)
-{
- return _mm256_cvtss_f32(a.m256);
-}
-
-__forceinline ssef low(const avxf &a)
-{
- return _mm256_extractf128_ps(a.m256, 0);
-}
-__forceinline ssef high(const avxf &a)
-{
- return _mm256_extractf128_ps(a.m256, 1);
-}
-
-template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
-__forceinline const avxf permute(const avxf &a)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_permutevar8x32_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
-#else
- float temp[8];
- _mm256_storeu_ps((float *)&temp, a);
- return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
-#endif
-}
-
-template<int S0, int S1, int S2, int S3, int S4, int S5, int S6, int S7>
-ccl_device_inline const avxf set_sign_bit(const avxf &a)
-{
- return a ^ avxf(S7 << 31, S6 << 31, S5 << 31, S4 << 31, S3 << 31, S2 << 31, S1 << 31, S0 << 31);
-}
-
-template<size_t S0, size_t S1, size_t S2, size_t S3, size_t S4, size_t S5, size_t S6, size_t S7>
-ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
-{
- return _mm256_blend_ps(
- a, b, S7 << 0 | S6 << 1 | S5 << 2 | S4 << 3 | S3 << 4 | S2 << 5 | S1 << 6 | S0 << 7);
-}
-
-template<size_t S0, size_t S1, size_t S2, size_t S3>
-ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
-{
- return blend<S0, S1, S2, S3, S0, S1, S2, S3>(a, b);
-}
-
-//#if defined(__KERNEL_SSE41__)
-__forceinline avxf maxi(const avxf &a, const avxf &b)
-{
- const avxf ci = _mm256_max_ps(a, b);
- return ci;
-}
-
-__forceinline avxf mini(const avxf &a, const avxf &b)
-{
- const avxf ci = _mm256_min_ps(a, b);
- return ci;
-}
-//#endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Ternary Operators
-////////////////////////////////////////////////////////////////////////////////
-__forceinline const avxf madd(const avxf &a, const avxf &b, const avxf &c)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_fmadd_ps(a, b, c);
-#else
- return c + (a * b);
-#endif
-}
-
-__forceinline const avxf nmadd(const avxf &a, const avxf &b, const avxf &c)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_fnmadd_ps(a, b, c);
-#else
- return c - (a * b);
-#endif
-}
-__forceinline const avxf msub(const avxf &a, const avxf &b, const avxf &c)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_fmsub_ps(a, b, c);
-#else
- return (a * b) - c;
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-__forceinline const avxb operator<=(const avxf &a, const avxf &b)
-{
- return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
-}
-
-__forceinline const avxf select(const avxb &m, const avxf &t, const avxf &f)
-{
- return _mm256_blendv_ps(f, t, m);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Common Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline avxf mix(const avxf &a, const avxf &b, const avxf &t)
-{
- return madd(t, b, (avxf(1.0f) - t) * a);
-}
-
-#ifndef _mm256_set_m128
-# define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \
- _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1)
-#endif
-
-#define _mm256_loadu2_m128(/* float const* */ hiaddr, /* float const* */ loaddr) \
- _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr))
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/util/avxi.h b/intern/cycles/util/avxi.h
deleted file mode 100644
index 966a04a6b97..00000000000
--- a/intern/cycles/util/avxi.h
+++ /dev/null
@@ -1,732 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2009-2013 Intel Corporation */
-
-#ifndef __UTIL_AVXI_H__
-#define __UTIL_AVXI_H__
-
-CCL_NAMESPACE_BEGIN
-
-struct avxb;
-
-struct avxi {
- typedef avxb Mask; // mask type for us
- enum { size = 8 }; // number of SIMD elements
- union { // data
- __m256i m256;
-#if !defined(__KERNEL_AVX2__)
- struct {
- __m128i l, h;
- };
-#endif
- int32_t v[8];
- };
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline avxi()
- {
- }
- __forceinline avxi(const avxi &a)
- {
- m256 = a.m256;
- }
- __forceinline avxi &operator=(const avxi &a)
- {
- m256 = a.m256;
- return *this;
- }
-
- __forceinline avxi(const __m256i a) : m256(a)
- {
- }
- __forceinline operator const __m256i &(void) const
- {
- return m256;
- }
- __forceinline operator __m256i &(void)
- {
- return m256;
- }
-
- __forceinline explicit avxi(const ssei &a)
- : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), a, 1))
- {
- }
- __forceinline avxi(const ssei &a, const ssei &b)
- : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
- {
- }
-#if defined(__KERNEL_AVX2__)
- __forceinline avxi(const __m128i &a, const __m128i &b)
- : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
- {
- }
-#else
- __forceinline avxi(const __m128i &a, const __m128i &b) : l(a), h(b)
- {
- }
-#endif
- __forceinline explicit avxi(const int32_t *const a)
- : m256(_mm256_castps_si256(_mm256_loadu_ps((const float *)a)))
- {
- }
- __forceinline avxi(int32_t a) : m256(_mm256_set1_epi32(a))
- {
- }
- __forceinline avxi(int32_t a, int32_t b) : m256(_mm256_set_epi32(b, a, b, a, b, a, b, a))
- {
- }
- __forceinline avxi(int32_t a, int32_t b, int32_t c, int32_t d)
- : m256(_mm256_set_epi32(d, c, b, a, d, c, b, a))
- {
- }
- __forceinline avxi(
- int32_t a, int32_t b, int32_t c, int32_t d, int32_t e, int32_t f, int32_t g, int32_t h)
- : m256(_mm256_set_epi32(h, g, f, e, d, c, b, a))
- {
- }
-
- __forceinline explicit avxi(const __m256 a) : m256(_mm256_cvtps_epi32(a))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constants
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline avxi(ZeroTy) : m256(_mm256_setzero_si256())
- {
- }
-#if defined(__KERNEL_AVX2__)
- __forceinline avxi(OneTy) : m256(_mm256_set1_epi32(1))
- {
- }
- __forceinline avxi(PosInfTy) : m256(_mm256_set1_epi32(pos_inf))
- {
- }
- __forceinline avxi(NegInfTy) : m256(_mm256_set1_epi32(neg_inf))
- {
- }
-#else
- __forceinline avxi(OneTy) : m256(_mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1))
- {
- }
- __forceinline avxi(PosInfTy)
- : m256(_mm256_set_epi32(
- pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf))
- {
- }
- __forceinline avxi(NegInfTy)
- : m256(_mm256_set_epi32(
- neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf))
- {
- }
-#endif
- __forceinline avxi(StepTy) : m256(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline const int32_t &operator[](const size_t i) const
- {
- assert(i < 8);
- return v[i];
- }
- __forceinline int32_t &operator[](const size_t i)
- {
- assert(i < 8);
- return v[i];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxi cast(const __m256 &a)
-{
- return _mm256_castps_si256(a);
-}
-__forceinline const avxi operator+(const avxi &a)
-{
- return a;
-}
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator-(const avxi &a)
-{
- return _mm256_sub_epi32(_mm256_setzero_si256(), a.m256);
-}
-__forceinline const avxi abs(const avxi &a)
-{
- return _mm256_abs_epi32(a.m256);
-}
-#else
-__forceinline const avxi operator-(const avxi &a)
-{
- return avxi(_mm_sub_epi32(_mm_setzero_si128(), a.l), _mm_sub_epi32(_mm_setzero_si128(), a.h));
-}
-__forceinline const avxi abs(const avxi &a)
-{
- return avxi(_mm_abs_epi32(a.l), _mm_abs_epi32(a.h));
-}
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator+(const avxi &a, const avxi &b)
-{
- return _mm256_add_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator+(const avxi &a, const avxi &b)
-{
- return avxi(_mm_add_epi32(a.l, b.l), _mm_add_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi operator+(const avxi &a, const int32_t b)
-{
- return a + avxi(b);
-}
-__forceinline const avxi operator+(const int32_t a, const avxi &b)
-{
- return avxi(a) + b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator-(const avxi &a, const avxi &b)
-{
- return _mm256_sub_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator-(const avxi &a, const avxi &b)
-{
- return avxi(_mm_sub_epi32(a.l, b.l), _mm_sub_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi operator-(const avxi &a, const int32_t b)
-{
- return a - avxi(b);
-}
-__forceinline const avxi operator-(const int32_t a, const avxi &b)
-{
- return avxi(a) - b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator*(const avxi &a, const avxi &b)
-{
- return _mm256_mullo_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator*(const avxi &a, const avxi &b)
-{
- return avxi(_mm_mullo_epi32(a.l, b.l), _mm_mullo_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi operator*(const avxi &a, const int32_t b)
-{
- return a * avxi(b);
-}
-__forceinline const avxi operator*(const int32_t a, const avxi &b)
-{
- return avxi(a) * b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator&(const avxi &a, const avxi &b)
-{
- return _mm256_and_si256(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator&(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-__forceinline const avxi operator&(const avxi &a, const int32_t b)
-{
- return a & avxi(b);
-}
-__forceinline const avxi operator&(const int32_t a, const avxi &b)
-{
- return avxi(a) & b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator|(const avxi &a, const avxi &b)
-{
- return _mm256_or_si256(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator|(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-__forceinline const avxi operator|(const avxi &a, const int32_t b)
-{
- return a | avxi(b);
-}
-__forceinline const avxi operator|(const int32_t a, const avxi &b)
-{
- return avxi(a) | b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator^(const avxi &a, const avxi &b)
-{
- return _mm256_xor_si256(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator^(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-__forceinline const avxi operator^(const avxi &a, const int32_t b)
-{
- return a ^ avxi(b);
-}
-__forceinline const avxi operator^(const int32_t a, const avxi &b)
-{
- return avxi(a) ^ b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator<<(const avxi &a, const int32_t n)
-{
- return _mm256_slli_epi32(a.m256, n);
-}
-__forceinline const avxi operator>>(const avxi &a, const int32_t n)
-{
- return _mm256_srai_epi32(a.m256, n);
-}
-
-__forceinline const avxi sra(const avxi &a, const int32_t b)
-{
- return _mm256_srai_epi32(a.m256, b);
-}
-__forceinline const avxi srl(const avxi &a, const int32_t b)
-{
- return _mm256_srli_epi32(a.m256, b);
-}
-#else
-__forceinline const avxi operator<<(const avxi &a, const int32_t n)
-{
- return avxi(_mm_slli_epi32(a.l, n), _mm_slli_epi32(a.h, n));
-}
-__forceinline const avxi operator>>(const avxi &a, const int32_t n)
-{
- return avxi(_mm_srai_epi32(a.l, n), _mm_srai_epi32(a.h, n));
-}
-
-__forceinline const avxi sra(const avxi &a, const int32_t b)
-{
- return avxi(_mm_srai_epi32(a.l, b), _mm_srai_epi32(a.h, b));
-}
-__forceinline const avxi srl(const avxi &a, const int32_t b)
-{
- return avxi(_mm_srli_epi32(a.l, b), _mm_srli_epi32(a.h, b));
-}
-#endif
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi min(const avxi &a, const avxi &b)
-{
- return _mm256_min_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi min(const avxi &a, const avxi &b)
-{
- return avxi(_mm_min_epi32(a.l, b.l), _mm_min_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi min(const avxi &a, const int32_t b)
-{
- return min(a, avxi(b));
-}
-__forceinline const avxi min(const int32_t a, const avxi &b)
-{
- return min(avxi(a), b);
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi max(const avxi &a, const avxi &b)
-{
- return _mm256_max_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi max(const avxi &a, const avxi &b)
-{
- return avxi(_mm_max_epi32(a.l, b.l), _mm_max_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi max(const avxi &a, const int32_t b)
-{
- return max(a, avxi(b));
-}
-__forceinline const avxi max(const int32_t a, const avxi &b)
-{
- return max(avxi(a), b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline avxi &operator+=(avxi &a, const avxi &b)
-{
- return a = a + b;
-}
-__forceinline avxi &operator+=(avxi &a, const int32_t b)
-{
- return a = a + b;
-}
-
-__forceinline avxi &operator-=(avxi &a, const avxi &b)
-{
- return a = a - b;
-}
-__forceinline avxi &operator-=(avxi &a, const int32_t b)
-{
- return a = a - b;
-}
-
-__forceinline avxi &operator*=(avxi &a, const avxi &b)
-{
- return a = a * b;
-}
-__forceinline avxi &operator*=(avxi &a, const int32_t b)
-{
- return a = a * b;
-}
-
-__forceinline avxi &operator&=(avxi &a, const avxi &b)
-{
- return a = a & b;
-}
-__forceinline avxi &operator&=(avxi &a, const int32_t b)
-{
- return a = a & b;
-}
-
-__forceinline avxi &operator|=(avxi &a, const avxi &b)
-{
- return a = a | b;
-}
-__forceinline avxi &operator|=(avxi &a, const int32_t b)
-{
- return a = a | b;
-}
-
-__forceinline avxi &operator^=(avxi &a, const avxi &b)
-{
- return a = a ^ b;
-}
-__forceinline avxi &operator^=(avxi &a, const int32_t b)
-{
- return a = a ^ b;
-}
-
-__forceinline avxi &operator<<=(avxi &a, const int32_t b)
-{
- return a = a << b;
-}
-__forceinline avxi &operator>>=(avxi &a, const int32_t b)
-{
- return a = a >> b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxb operator==(const avxi &a, const avxi &b)
-{
- return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a.m256, b.m256));
-}
-#else
-__forceinline const avxb operator==(const avxi &a, const avxi &b)
-{
- return avxb(_mm_castsi128_ps(_mm_cmpeq_epi32(a.l, b.l)),
- _mm_castsi128_ps(_mm_cmpeq_epi32(a.h, b.h)));
-}
-#endif
-__forceinline const avxb operator==(const avxi &a, const int32_t b)
-{
- return a == avxi(b);
-}
-__forceinline const avxb operator==(const int32_t a, const avxi &b)
-{
- return avxi(a) == b;
-}
-
-__forceinline const avxb operator!=(const avxi &a, const avxi &b)
-{
- return !(a == b);
-}
-__forceinline const avxb operator!=(const avxi &a, const int32_t b)
-{
- return a != avxi(b);
-}
-__forceinline const avxb operator!=(const int32_t a, const avxi &b)
-{
- return avxi(a) != b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxb operator<(const avxi &a, const avxi &b)
-{
- return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b.m256, a.m256));
-}
-#else
-__forceinline const avxb operator<(const avxi &a, const avxi &b)
-{
- return avxb(_mm_castsi128_ps(_mm_cmplt_epi32(a.l, b.l)),
- _mm_castsi128_ps(_mm_cmplt_epi32(a.h, b.h)));
-}
-#endif
-__forceinline const avxb operator<(const avxi &a, const int32_t b)
-{
- return a < avxi(b);
-}
-__forceinline const avxb operator<(const int32_t a, const avxi &b)
-{
- return avxi(a) < b;
-}
-
-__forceinline const avxb operator>=(const avxi &a, const avxi &b)
-{
- return !(a < b);
-}
-__forceinline const avxb operator>=(const avxi &a, const int32_t b)
-{
- return a >= avxi(b);
-}
-__forceinline const avxb operator>=(const int32_t a, const avxi &b)
-{
- return avxi(a) >= b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxb operator>(const avxi &a, const avxi &b)
-{
- return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a.m256, b.m256));
-}
-#else
-__forceinline const avxb operator>(const avxi &a, const avxi &b)
-{
- return avxb(_mm_castsi128_ps(_mm_cmpgt_epi32(a.l, b.l)),
- _mm_castsi128_ps(_mm_cmpgt_epi32(a.h, b.h)));
-}
-#endif
-__forceinline const avxb operator>(const avxi &a, const int32_t b)
-{
- return a > avxi(b);
-}
-__forceinline const avxb operator>(const int32_t a, const avxi &b)
-{
- return avxi(a) > b;
-}
-
-__forceinline const avxb operator<=(const avxi &a, const avxi &b)
-{
- return !(a > b);
-}
-__forceinline const avxb operator<=(const avxi &a, const int32_t b)
-{
- return a <= avxi(b);
-}
-__forceinline const avxb operator<=(const int32_t a, const avxi &b)
-{
- return avxi(a) <= b;
-}
-
-__forceinline const avxi select(const avxb &m, const avxi &t, const avxi &f)
-{
- return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_AVX2__)
-__forceinline avxi unpacklo(const avxi &a, const avxi &b)
-{
- return _mm256_unpacklo_epi32(a.m256, b.m256);
-}
-__forceinline avxi unpackhi(const avxi &a, const avxi &b)
-{
- return _mm256_unpackhi_epi32(a.m256, b.m256);
-}
-#else
-__forceinline avxi unpacklo(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-__forceinline avxi unpackhi(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-
-template<size_t i> __forceinline const avxi shuffle(const avxi &a)
-{
- return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i)));
-}
-
-template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a)
-{
- return _mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0));
-}
-
-template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a, const avxi &b)
-{
- return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxi shuffle(const avxi &a)
-{
- return _mm256_castps_si256(
- _mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0)));
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxi shuffle(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_shuffle_ps(
- _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
-}
-
-template<> __forceinline const avxi shuffle<0, 0, 2, 2>(const avxi &b)
-{
- return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b)));
-}
-template<> __forceinline const avxi shuffle<1, 1, 3, 3>(const avxi &b)
-{
- return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b)));
-}
-template<> __forceinline const avxi shuffle<0, 1, 0, 1>(const avxi &b)
-{
- return _mm256_castps_si256(
- _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b)))));
-}
-
-__forceinline const avxi broadcast(const int *ptr)
-{
- return _mm256_castps_si256(_mm256_broadcast_ss((const float *)ptr));
-}
-template<size_t i> __forceinline const avxi insert(const avxi &a, const ssei &b)
-{
- return _mm256_insertf128_si256(a, b, i);
-}
-template<size_t i> __forceinline const ssei extract(const avxi &a)
-{
- return _mm256_extractf128_si256(a, i);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reductions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxi vreduce_min2(const avxi &v)
-{
- return min(v, shuffle<1, 0, 3, 2>(v));
-}
-__forceinline const avxi vreduce_min4(const avxi &v)
-{
- avxi v1 = vreduce_min2(v);
- return min(v1, shuffle<2, 3, 0, 1>(v1));
-}
-__forceinline const avxi vreduce_min(const avxi &v)
-{
- avxi v1 = vreduce_min4(v);
- return min(v1, shuffle<1, 0>(v1));
-}
-
-__forceinline const avxi vreduce_max2(const avxi &v)
-{
- return max(v, shuffle<1, 0, 3, 2>(v));
-}
-__forceinline const avxi vreduce_max4(const avxi &v)
-{
- avxi v1 = vreduce_max2(v);
- return max(v1, shuffle<2, 3, 0, 1>(v1));
-}
-__forceinline const avxi vreduce_max(const avxi &v)
-{
- avxi v1 = vreduce_max4(v);
- return max(v1, shuffle<1, 0>(v1));
-}
-
-__forceinline const avxi vreduce_add2(const avxi &v)
-{
- return v + shuffle<1, 0, 3, 2>(v);
-}
-__forceinline const avxi vreduce_add4(const avxi &v)
-{
- avxi v1 = vreduce_add2(v);
- return v1 + shuffle<2, 3, 0, 1>(v1);
-}
-__forceinline const avxi vreduce_add(const avxi &v)
-{
- avxi v1 = vreduce_add4(v);
- return v1 + shuffle<1, 0>(v1);
-}
-
-__forceinline int reduce_min(const avxi &v)
-{
- return extract<0>(extract<0>(vreduce_min(v)));
-}
-__forceinline int reduce_max(const avxi &v)
-{
- return extract<0>(extract<0>(vreduce_max(v)));
-}
-__forceinline int reduce_add(const avxi &v)
-{
- return extract<0>(extract<0>(vreduce_add(v)));
-}
-
-__forceinline uint32_t select_min(const avxi &v)
-{
- return __bsf(movemask(v == vreduce_min(v)));
-}
-__forceinline uint32_t select_max(const avxi &v)
-{
- return __bsf(movemask(v == vreduce_max(v)));
-}
-
-__forceinline uint32_t select_min(const avxb &valid, const avxi &v)
-{
- const avxi a = select(valid, v, avxi(pos_inf));
- return __bsf(movemask(valid & (a == vreduce_min(a))));
-}
-__forceinline uint32_t select_max(const avxb &valid, const avxi &v)
-{
- const avxi a = select(valid, v, avxi(neg_inf));
- return __bsf(movemask(valid & (a == vreduce_max(a))));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Output Operators
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_avxi(const char *label, const avxi &a)
-{
- printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/util/color.h b/intern/cycles/util/color.h
index 537f8ab6771..93e984120f2 100644
--- a/intern/cycles/util/color.h
+++ b/intern/cycles/util/color.h
@@ -228,28 +228,27 @@ ccl_device float3 xyY_to_xyz(float x, float y, float Y)
* exp = exponent, encoded as uint32_t
* e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as uint32_t
*/
-template<unsigned exp, unsigned e2coeff> ccl_device_inline ssef fastpow(const ssef &arg)
+template<unsigned exp, unsigned e2coeff> ccl_device_inline float4 fastpow(const float4 &arg)
{
- ssef ret;
- ret = arg * cast(ssei(e2coeff));
- ret = ssef(cast(ret));
- ret = ret * cast(ssei(exp));
- ret = cast(ssei(ret));
+ float4 ret = arg * cast(make_int4(e2coeff));
+ ret = make_float4(cast(ret));
+ ret = ret * cast(make_int4(exp));
+ ret = cast(make_int4(ret));
return ret;
}
/* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */
-ccl_device_inline ssef improve_5throot_solution(const ssef &old_result, const ssef &x)
+ccl_device_inline float4 improve_5throot_solution(const float4 &old_result, const float4 &x)
{
- ssef approx2 = old_result * old_result;
- ssef approx4 = approx2 * approx2;
- ssef t = x / approx4;
- ssef summ = madd(ssef(4.0f), old_result, t);
- return summ * ssef(1.0f / 5.0f);
+ float4 approx2 = old_result * old_result;
+ float4 approx4 = approx2 * approx2;
+ float4 t = x / approx4;
+ float4 summ = madd(make_float4(4.0f), old_result, t);
+ return summ * make_float4(1.0f / 5.0f);
}
/* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
-ccl_device_inline ssef fastpow24(const ssef &arg)
+ccl_device_inline float4 fastpow24(const float4 &arg)
{
/* max, avg and |avg| errors were calculated in gcc without FMA instructions
* The final precision should be better than powf in glibc */
@@ -257,9 +256,10 @@ ccl_device_inline ssef fastpow24(const ssef &arg)
/* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize avg error */
/* 0x3F4CCCCD = 4/5 */
/* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
- ssef x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05
- ssef arg2 = arg * arg;
- ssef arg4 = arg2 * arg2;
+ float4 x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(
+ arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05
+ float4 arg2 = arg * arg;
+ float4 arg4 = arg2 * arg2;
/* error max = 0.018 avg = 0.0031 |avg| = 0.0031 */
x = improve_5throot_solution(x, arg4);
@@ -271,12 +271,12 @@ ccl_device_inline ssef fastpow24(const ssef &arg)
return x * (x * x);
}
-ccl_device ssef color_srgb_to_linear(const ssef &c)
+ccl_device float4 color_srgb_to_linear(const float4 &c)
{
- sseb cmp = c < ssef(0.04045f);
- ssef lt = max(c * ssef(1.0f / 12.92f), ssef(0.0f));
- ssef gtebase = (c + ssef(0.055f)) * ssef(1.0f / 1.055f); /* fma */
- ssef gte = fastpow24(gtebase);
+ int4 cmp = c < make_float4(0.04045f);
+ float4 lt = max(c * make_float4(1.0f / 12.92f), make_float4(0.0f));
+ float4 gtebase = (c + make_float4(0.055f)) * make_float4(1.0f / 1.055f); /* fma */
+ float4 gte = fastpow24(gtebase);
return select(cmp, lt, gte);
}
#endif /* __KERNEL_SSE2__ */
@@ -302,10 +302,8 @@ ccl_device float4 color_linear_to_srgb_v4(float4 c)
ccl_device float4 color_srgb_to_linear_v4(float4 c)
{
#ifdef __KERNEL_SSE2__
- ssef r_ssef;
- float4 &r = (float4 &)r_ssef;
- r = c;
- r_ssef = color_srgb_to_linear(r_ssef);
+ float4 r = c;
+ r = color_srgb_to_linear(r);
r.w = c.w;
return r;
#else
diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h
index 1969529eff0..d5be14c8eba 100644
--- a/intern/cycles/util/defines.h
+++ b/intern/cycles/util/defines.h
@@ -23,6 +23,7 @@
/* Leave inlining decisions to compiler for these, the inline keyword here
* is not about performance but including function definitions in headers. */
# define ccl_device static inline
+# define ccl_device_extern extern "C"
# define ccl_device_noinline static inline
# define ccl_device_noinline_cpu ccl_device_noinline
diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h
index c668638eb02..5665dd4c075 100644
--- a/intern/cycles/util/half.h
+++ b/intern/cycles/util/half.h
@@ -154,17 +154,17 @@ ccl_device_inline half float_to_half_display(const float f)
ccl_device_inline half4 float4_to_half4_display(const float4 f)
{
-#ifdef __KERNEL_SSE2__
+#ifdef __KERNEL_SSE__
/* CPU: SSE and AVX. */
- ssef x = min(max(load4f(f), 0.0f), 65504.0f);
+ float4 x = min(max(f, make_float4(0.0f)), make_float4(65504.0f));
# ifdef __KERNEL_AVX2__
- ssei rpack = _mm_cvtps_ph(x, 0);
+ int4 rpack = int4(_mm_cvtps_ph(x, 0));
# else
- ssei absolute = cast(x) & 0x7FFFFFFF;
- ssei Z = absolute + 0xC8000000;
- ssei result = andnot(absolute < 0x38800000, Z);
- ssei rshift = (result >> 13) & 0x7FFF;
- ssei rpack = _mm_packs_epi32(rshift, rshift);
+ int4 absolute = cast(x) & make_int4(0x7FFFFFFF);
+ int4 Z = absolute + make_int4(0xC8000000);
+ int4 result = andnot(absolute < make_int4(0x38800000), Z);
+ int4 rshift = (result >> 13) & make_int4(0x7FFF);
+ int4 rpack = int4(_mm_packs_epi32(rshift, rshift));
# endif
half4 h;
_mm_storel_pi((__m64 *)&h, _mm_castsi128_ps(rpack));
diff --git a/intern/cycles/util/hash.h b/intern/cycles/util/hash.h
index 4f83f331229..74210ff020e 100644
--- a/intern/cycles/util/hash.h
+++ b/intern/cycles/util/hash.h
@@ -222,7 +222,7 @@ ccl_device_inline float3 hash_float4_to_float3(float4 k)
/* SSE Versions Of Jenkins Lookup3 Hash Functions */
-#ifdef __KERNEL_SSE2__
+#ifdef __KERNEL_SSE__
# define rot(x, k) (((x) << (k)) | (srl(x, 32 - (k))))
# define mix(a, b, c) \
@@ -265,10 +265,10 @@ ccl_device_inline float3 hash_float4_to_float3(float4 k)
c -= rot(b, 24); \
}
-ccl_device_inline ssei hash_ssei(ssei kx)
+ccl_device_inline int4 hash_int4(int4 kx)
{
- ssei a, b, c;
- a = b = c = ssei(0xdeadbeef + (1 << 2) + 13);
+ int4 a, b, c;
+ a = b = c = make_int4(0xdeadbeef + (1 << 2) + 13);
a += kx;
final(a, b, c);
@@ -276,10 +276,10 @@ ccl_device_inline ssei hash_ssei(ssei kx)
return c;
}
-ccl_device_inline ssei hash_ssei2(ssei kx, ssei ky)
+ccl_device_inline int4 hash_int4_2(int4 kx, int4 ky)
{
- ssei a, b, c;
- a = b = c = ssei(0xdeadbeef + (2 << 2) + 13);
+ int4 a, b, c;
+ a = b = c = make_int4(0xdeadbeef + (2 << 2) + 13);
b += ky;
a += kx;
@@ -288,10 +288,10 @@ ccl_device_inline ssei hash_ssei2(ssei kx, ssei ky)
return c;
}
-ccl_device_inline ssei hash_ssei3(ssei kx, ssei ky, ssei kz)
+ccl_device_inline int4 hash_int4_3(int4 kx, int4 ky, int4 kz)
{
- ssei a, b, c;
- a = b = c = ssei(0xdeadbeef + (3 << 2) + 13);
+ int4 a, b, c;
+ a = b = c = make_int4(0xdeadbeef + (3 << 2) + 13);
c += kz;
b += ky;
@@ -301,10 +301,10 @@ ccl_device_inline ssei hash_ssei3(ssei kx, ssei ky, ssei kz)
return c;
}
-ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw)
+ccl_device_inline int4 hash_int4_4(int4 kx, int4 ky, int4 kz, int4 kw)
{
- ssei a, b, c;
- a = b = c = ssei(0xdeadbeef + (4 << 2) + 13);
+ int4 a, b, c;
+ a = b = c = make_int4(0xdeadbeef + (4 << 2) + 13);
a += kx;
b += ky;
@@ -317,11 +317,11 @@ ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw)
return c;
}
-# if defined(__KERNEL_AVX__)
-ccl_device_inline avxi hash_avxi(avxi kx)
+# if defined(__KERNEL_AVX2__)
+ccl_device_inline vint8 hash_int8(vint8 kx)
{
- avxi a, b, c;
- a = b = c = avxi(0xdeadbeef + (1 << 2) + 13);
+ vint8 a, b, c;
+ a = b = c = make_vint8(0xdeadbeef + (1 << 2) + 13);
a += kx;
final(a, b, c);
@@ -329,10 +329,10 @@ ccl_device_inline avxi hash_avxi(avxi kx)
return c;
}
-ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky)
+ccl_device_inline vint8 hash_int8_2(vint8 kx, vint8 ky)
{
- avxi a, b, c;
- a = b = c = avxi(0xdeadbeef + (2 << 2) + 13);
+ vint8 a, b, c;
+ a = b = c = make_vint8(0xdeadbeef + (2 << 2) + 13);
b += ky;
a += kx;
@@ -341,10 +341,10 @@ ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky)
return c;
}
-ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz)
+ccl_device_inline vint8 hash_int8_3(vint8 kx, vint8 ky, vint8 kz)
{
- avxi a, b, c;
- a = b = c = avxi(0xdeadbeef + (3 << 2) + 13);
+ vint8 a, b, c;
+ a = b = c = make_vint8(0xdeadbeef + (3 << 2) + 13);
c += kz;
b += ky;
@@ -354,10 +354,10 @@ ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz)
return c;
}
-ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw)
+ccl_device_inline vint8 hash_int8_4(vint8 kx, vint8 ky, vint8 kz, vint8 kw)
{
- avxi a, b, c;
- a = b = c = avxi(0xdeadbeef + (4 << 2) + 13);
+ vint8 a, b, c;
+ a = b = c = make_vint8(0xdeadbeef + (4 << 2) + 13);
a += kx;
b += ky;
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index 3a2e0e074a2..0fbe7a67a4f 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -532,12 +532,14 @@ CCL_NAMESPACE_END
#include "util/math_int2.h"
#include "util/math_int3.h"
#include "util/math_int4.h"
+#include "util/math_int8.h"
#include "util/math_float2.h"
-#include "util/math_float3.h"
#include "util/math_float4.h"
#include "util/math_float8.h"
+#include "util/math_float3.h"
+
#include "util/rect.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/util/math_float2.h b/intern/cycles/util/math_float2.h
index 542dad93467..ad806d0f08a 100644
--- a/intern/cycles/util/math_float2.h
+++ b/intern/cycles/util/math_float2.h
@@ -10,55 +10,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline float2 operator-(const float2 &a);
-ccl_device_inline float2 operator*(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator*(const float2 &a, float f);
-ccl_device_inline float2 operator*(float f, const float2 &a);
-ccl_device_inline float2 operator/(float f, const float2 &a);
-ccl_device_inline float2 operator/(const float2 &a, float f);
-ccl_device_inline float2 operator/(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator+(const float2 &a, const float f);
-ccl_device_inline float2 operator+(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator-(const float2 &a, const float f);
-ccl_device_inline float2 operator-(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator+=(float2 &a, const float2 &b);
-ccl_device_inline float2 operator*=(float2 &a, const float2 &b);
-ccl_device_inline float2 operator*=(float2 &a, float f);
-ccl_device_inline float2 operator/=(float2 &a, const float2 &b);
-ccl_device_inline float2 operator/=(float2 &a, float f);
-
-ccl_device_inline bool operator==(const float2 &a, const float2 &b);
-ccl_device_inline bool operator!=(const float2 &a, const float2 &b);
-
-ccl_device_inline bool is_zero(const float2 &a);
-ccl_device_inline float average(const float2 &a);
-ccl_device_inline float distance(const float2 &a, const float2 &b);
-ccl_device_inline float dot(const float2 &a, const float2 &b);
-ccl_device_inline float cross(const float2 &a, const float2 &b);
-ccl_device_inline float len(const float2 a);
-ccl_device_inline float2 normalize(const float2 &a);
-ccl_device_inline float2 normalize_len(const float2 &a, float *t);
-ccl_device_inline float2 safe_normalize(const float2 &a);
-ccl_device_inline float2 min(const float2 &a, const float2 &b);
-ccl_device_inline float2 max(const float2 &a, const float2 &b);
-ccl_device_inline float2 clamp(const float2 &a, const float2 &mn, const float2 &mx);
-ccl_device_inline float2 fabs(const float2 &a);
-ccl_device_inline float2 as_float2(const float4 &a);
-ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t);
-ccl_device_inline float2 floor(const float2 &a);
-#endif /* !__KERNEL_METAL__ */
-
-ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b);
-
-/*******************************************************************************
- * Definition.
- */
-
ccl_device_inline float2 zero_float2()
{
return make_float2(0.0f, 0.0f);
@@ -75,63 +26,63 @@ ccl_device_inline float2 operator-(const float2 &a)
return make_float2(-a.x, -a.y);
}
-ccl_device_inline float2 operator*(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator*(const float2 a, const float2 b)
{
return make_float2(a.x * b.x, a.y * b.y);
}
-ccl_device_inline float2 operator*(const float2 &a, float f)
+ccl_device_inline float2 operator*(const float2 a, float f)
{
return make_float2(a.x * f, a.y * f);
}
-ccl_device_inline float2 operator*(float f, const float2 &a)
+ccl_device_inline float2 operator*(float f, const float2 a)
{
return make_float2(a.x * f, a.y * f);
}
-ccl_device_inline float2 operator/(float f, const float2 &a)
+ccl_device_inline float2 operator/(float f, const float2 a)
{
return make_float2(f / a.x, f / a.y);
}
-ccl_device_inline float2 operator/(const float2 &a, float f)
+ccl_device_inline float2 operator/(const float2 a, float f)
{
float invf = 1.0f / f;
return make_float2(a.x * invf, a.y * invf);
}
-ccl_device_inline float2 operator/(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator/(const float2 a, const float2 b)
{
return make_float2(a.x / b.x, a.y / b.y);
}
-ccl_device_inline float2 operator+(const float2 &a, const float f)
+ccl_device_inline float2 operator+(const float2 a, const float2 b)
{
- return a + make_float2(f, f);
+ return make_float2(a.x + b.x, a.y + b.y);
}
-ccl_device_inline float2 operator+(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator+(const float2 a, const float f)
{
- return make_float2(a.x + b.x, a.y + b.y);
+ return a + make_float2(f, f);
}
-ccl_device_inline float2 operator-(const float2 &a, const float f)
+ccl_device_inline float2 operator-(const float2 a, const float2 b)
{
- return a - make_float2(f, f);
+ return make_float2(a.x - b.x, a.y - b.y);
}
-ccl_device_inline float2 operator-(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator-(const float2 a, const float f)
{
- return make_float2(a.x - b.x, a.y - b.y);
+ return a - make_float2(f, f);
}
-ccl_device_inline float2 operator+=(float2 &a, const float2 &b)
+ccl_device_inline float2 operator+=(float2 &a, const float2 b)
{
return a = a + b;
}
-ccl_device_inline float2 operator*=(float2 &a, const float2 &b)
+ccl_device_inline float2 operator*=(float2 &a, const float2 b)
{
return a = a * b;
}
@@ -141,7 +92,7 @@ ccl_device_inline float2 operator*=(float2 &a, float f)
return a = a * f;
}
-ccl_device_inline float2 operator/=(float2 &a, const float2 &b)
+ccl_device_inline float2 operator/=(float2 &a, const float2 b)
{
return a = a / b;
}
@@ -152,74 +103,81 @@ ccl_device_inline float2 operator/=(float2 &a, float f)
return a = a * invf;
}
-ccl_device_inline bool operator==(const float2 &a, const float2 &b)
+ccl_device_inline bool operator==(const float2 a, const float2 b)
{
return (a.x == b.x && a.y == b.y);
}
-ccl_device_inline bool operator!=(const float2 &a, const float2 &b)
+ccl_device_inline bool operator!=(const float2 a, const float2 b)
{
return !(a == b);
}
-ccl_device_inline bool is_zero(const float2 &a)
+ccl_device_inline bool is_zero(const float2 a)
{
return (a.x == 0.0f && a.y == 0.0f);
}
-ccl_device_inline float average(const float2 &a)
+ccl_device_inline float average(const float2 a)
{
return (a.x + a.y) * (1.0f / 2.0f);
}
-ccl_device_inline float distance(const float2 &a, const float2 &b)
+ccl_device_inline float dot(const float2 a, const float2 b)
{
- return len(a - b);
+ return a.x * b.x + a.y * b.y;
}
+#endif
-ccl_device_inline float dot(const float2 &a, const float2 &b)
+ccl_device_inline float len(const float2 a)
{
- return a.x * b.x + a.y * b.y;
+ return sqrtf(dot(a, a));
}
-ccl_device_inline float cross(const float2 &a, const float2 &b)
+#if !defined(__KERNEL_METAL__)
+ccl_device_inline float distance(const float2 a, const float2 b)
+{
+ return len(a - b);
+}
+
+ccl_device_inline float cross(const float2 a, const float2 b)
{
return (a.x * b.y - a.y * b.x);
}
-ccl_device_inline float2 normalize(const float2 &a)
+ccl_device_inline float2 normalize(const float2 a)
{
return a / len(a);
}
-ccl_device_inline float2 normalize_len(const float2 &a, ccl_private float *t)
+ccl_device_inline float2 normalize_len(const float2 a, ccl_private float *t)
{
*t = len(a);
return a / (*t);
}
-ccl_device_inline float2 safe_normalize(const float2 &a)
+ccl_device_inline float2 safe_normalize(const float2 a)
{
float t = len(a);
return (t != 0.0f) ? a / t : a;
}
-ccl_device_inline float2 min(const float2 &a, const float2 &b)
+ccl_device_inline float2 min(const float2 a, const float2 b)
{
return make_float2(min(a.x, b.x), min(a.y, b.y));
}
-ccl_device_inline float2 max(const float2 &a, const float2 &b)
+ccl_device_inline float2 max(const float2 a, const float2 b)
{
return make_float2(max(a.x, b.x), max(a.y, b.y));
}
-ccl_device_inline float2 clamp(const float2 &a, const float2 &mn, const float2 &mx)
+ccl_device_inline float2 clamp(const float2 a, const float2 mn, const float2 mx)
{
return min(max(a, mn), mx);
}
-ccl_device_inline float2 fabs(const float2 &a)
+ccl_device_inline float2 fabs(const float2 a)
{
return make_float2(fabsf(a.x), fabsf(a.y));
}
@@ -229,28 +187,23 @@ ccl_device_inline float2 as_float2(const float4 &a)
return make_float2(a.x, a.y);
}
-ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t)
+ccl_device_inline float2 interp(const float2 a, const float2 b, float t)
{
return a + t * (b - a);
}
-ccl_device_inline float2 mix(const float2 &a, const float2 &b, float t)
+ccl_device_inline float2 mix(const float2 a, const float2 b, float t)
{
return a + t * (b - a);
}
-ccl_device_inline float2 floor(const float2 &a)
+ccl_device_inline float2 floor(const float2 a)
{
return make_float2(floorf(a.x), floorf(a.y));
}
#endif /* !__KERNEL_METAL__ */
-ccl_device_inline float len(const float2 a)
-{
- return sqrtf(dot(a, a));
-}
-
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b)
{
return (b != 0.0f) ? a / b : zero_float2();
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index eec7122b9dc..79ee86d9c82 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT3_H__
@@ -10,73 +11,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline float3 operator-(const float3 &a);
-ccl_device_inline float3 operator*(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator*(const float3 &a, const float f);
-ccl_device_inline float3 operator*(const float f, const float3 &a);
-ccl_device_inline float3 operator/(const float f, const float3 &a);
-ccl_device_inline float3 operator/(const float3 &a, const float f);
-ccl_device_inline float3 operator/(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator+(const float3 &a, const float f);
-ccl_device_inline float3 operator+(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator-(const float3 &a, const float f);
-ccl_device_inline float3 operator-(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator+=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator-=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator*=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator*=(float3 &a, float f);
-ccl_device_inline float3 operator/=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator/=(float3 &a, float f);
-
-ccl_device_inline bool operator==(const float3 &a, const float3 &b);
-ccl_device_inline bool operator!=(const float3 &a, const float3 &b);
-
-ccl_device_inline float distance(const float3 &a, const float3 &b);
-ccl_device_inline float dot(const float3 &a, const float3 &b);
-ccl_device_inline float dot_xy(const float3 &a, const float3 &b);
-ccl_device_inline float3 cross(const float3 &a, const float3 &b);
-ccl_device_inline float3 normalize(const float3 &a);
-ccl_device_inline float3 min(const float3 &a, const float3 &b);
-ccl_device_inline float3 max(const float3 &a, const float3 &b);
-ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx);
-ccl_device_inline float3 fabs(const float3 &a);
-ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t);
-ccl_device_inline float3 rcp(const float3 &a);
-ccl_device_inline float3 sqrt(const float3 &a);
-ccl_device_inline float3 floor(const float3 &a);
-ccl_device_inline float3 ceil(const float3 &a);
-ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
-#endif /* !defined(__KERNEL_METAL__) */
-
-ccl_device_inline float reduce_min(float3 a);
-ccl_device_inline float reduce_max(float3 a);
-ccl_device_inline float len(const float3 a);
-ccl_device_inline float len_squared(const float3 a);
-
-ccl_device_inline float3 project(const float3 v, const float3 v_proj);
-
-ccl_device_inline float3 safe_normalize(const float3 a);
-ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t);
-ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t);
-ccl_device_inline float3 safe_divide(const float3 a, const float3 b);
-ccl_device_inline float3 safe_divide(const float3 a, const float b);
-ccl_device_inline float3 interp(float3 a, float3 b, float t);
-ccl_device_inline float3 sqr(float3 a);
-
-ccl_device_inline bool is_zero(const float3 a);
-ccl_device_inline float reduce_add(const float3 a);
-ccl_device_inline float average(const float3 a);
-ccl_device_inline bool isequal(const float3 a, const float3 b);
-
-/*******************************************************************************
- * Definition.
- */
-
ccl_device_inline float3 zero_float3()
{
#ifdef __KERNEL_SSE__
@@ -109,7 +43,7 @@ ccl_device_inline float3 operator-(const float3 &a)
# endif
}
-ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator*(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, b.m128));
@@ -118,7 +52,7 @@ ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 operator*(const float3 &a, const float f)
+ccl_device_inline float3 operator*(const float3 a, const float f)
{
# ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
@@ -127,7 +61,7 @@ ccl_device_inline float3 operator*(const float3 &a, const float f)
# endif
}
-ccl_device_inline float3 operator*(const float f, const float3 &a)
+ccl_device_inline float3 operator*(const float f, const float3 a)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
@@ -136,7 +70,7 @@ ccl_device_inline float3 operator*(const float f, const float3 &a)
# endif
}
-ccl_device_inline float3 operator/(const float f, const float3 &a)
+ccl_device_inline float3 operator/(const float f, const float3 a)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
@@ -145,7 +79,7 @@ ccl_device_inline float3 operator/(const float f, const float3 &a)
# endif
}
-ccl_device_inline float3 operator/(const float3 &a, const float f)
+ccl_device_inline float3 operator/(const float3 a, const float f)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(a.m128, _mm_set1_ps(f)));
@@ -154,7 +88,7 @@ ccl_device_inline float3 operator/(const float3 &a, const float f)
# endif
}
-ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator/(const float3 a, const float3 b)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(a.m128, b.m128));
@@ -163,12 +97,7 @@ ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 operator+(const float3 &a, const float f)
-{
- return a + make_float3(f, f, f);
-}
-
-ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator+(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_add_ps(a.m128, b.m128));
@@ -177,12 +106,12 @@ ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 operator-(const float3 &a, const float f)
+ccl_device_inline float3 operator+(const float3 a, const float f)
{
- return a - make_float3(f, f, f);
+ return a + make_float3(f, f, f);
}
-ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator-(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_sub_ps(a.m128, b.m128));
@@ -191,17 +120,22 @@ ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 operator+=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator-(const float3 a, const float f)
+{
+ return a - make_float3(f, f, f);
+}
+
+ccl_device_inline float3 operator+=(float3 &a, const float3 b)
{
return a = a + b;
}
-ccl_device_inline float3 operator-=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator-=(float3 &a, const float3 b)
{
return a = a - b;
}
-ccl_device_inline float3 operator*=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator*=(float3 &a, const float3 b)
{
return a = a * b;
}
@@ -211,7 +145,7 @@ ccl_device_inline float3 operator*=(float3 &a, float f)
return a = a * f;
}
-ccl_device_inline float3 operator/=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator/=(float3 &a, const float3 b)
{
return a = a / b;
}
@@ -223,7 +157,7 @@ ccl_device_inline float3 operator/=(float3 &a, float f)
}
# if !(defined(__KERNEL_METAL__) || defined(__KERNEL_CUDA__))
-ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 &b)
+ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 b)
{
a = float3(a) * b;
return a;
@@ -235,7 +169,7 @@ ccl_device_inline packed_float3 operator*=(packed_float3 &a, float f)
return a;
}
-ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 &b)
+ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 b)
{
a = float3(a) / b;
return a;
@@ -248,7 +182,7 @@ ccl_device_inline packed_float3 operator/=(packed_float3 &a, float f)
}
# endif
-ccl_device_inline bool operator==(const float3 &a, const float3 &b)
+ccl_device_inline bool operator==(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
@@ -257,17 +191,12 @@ ccl_device_inline bool operator==(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline bool operator!=(const float3 &a, const float3 &b)
+ccl_device_inline bool operator!=(const float3 a, const float3 b)
{
return !(a == b);
}
-ccl_device_inline float distance(const float3 &a, const float3 &b)
-{
- return len(a - b);
-}
-
-ccl_device_inline float dot(const float3 &a, const float3 &b)
+ccl_device_inline float dot(const float3 a, const float3 b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
@@ -276,26 +205,62 @@ ccl_device_inline float dot(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
+#endif
+
+ccl_device_inline float dot_xy(const float3 a, const float3 b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
-# else
+#else
return a.x * b.x + a.y * b.y;
-# endif
+#endif
+}
+
+ccl_device_inline float len(const float3 a)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+ return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
+#else
+ return sqrtf(dot(a, a));
+#endif
+}
+
+ccl_device_inline float reduce_min(float3 a)
+{
+ return min(min(a.x, a.y), a.z);
+}
+
+ccl_device_inline float reduce_max(float3 a)
+{
+ return max(max(a.x, a.y), a.z);
+}
+
+ccl_device_inline float len_squared(const float3 a)
+{
+ return dot(a, a);
+}
+
+#ifndef __KERNEL_METAL__
+
+ccl_device_inline float distance(const float3 a, const float3 b)
+{
+ return len(a - b);
}
-ccl_device_inline float3 cross(const float3 &a, const float3 &b)
+ccl_device_inline float3 cross(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
- return float3(shuffle<1, 2, 0, 3>(
- msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b))));
+ const float4 x = float4(a.m128);
+ const float4 y = shuffle<1, 2, 0, 3>(float4(b.m128));
+ const float4 z = float4(_mm_mul_ps(shuffle<1, 2, 0, 3>(float4(a.m128)), float4(b.m128)));
+
+ return float3(shuffle<1, 2, 0, 3>(msub(x, y, z)).m128);
# else
return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
# endif
}
-ccl_device_inline float3 normalize(const float3 &a)
+ccl_device_inline float3 normalize(const float3 a)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
@@ -305,7 +270,7 @@ ccl_device_inline float3 normalize(const float3 &a)
# endif
}
-ccl_device_inline float3 min(const float3 &a, const float3 &b)
+ccl_device_inline float3 min(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_min_ps(a.m128, b.m128));
@@ -314,7 +279,7 @@ ccl_device_inline float3 min(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 max(const float3 &a, const float3 &b)
+ccl_device_inline float3 max(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_max_ps(a.m128, b.m128));
@@ -323,12 +288,12 @@ ccl_device_inline float3 max(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx)
+ccl_device_inline float3 clamp(const float3 a, const float3 mn, const float3 mx)
{
return min(max(a, mn), mx);
}
-ccl_device_inline float3 fabs(const float3 &a)
+ccl_device_inline float3 fabs(const float3 a)
{
# ifdef __KERNEL_SSE__
# ifdef __KERNEL_NEON__
@@ -342,7 +307,7 @@ ccl_device_inline float3 fabs(const float3 &a)
# endif
}
-ccl_device_inline float3 sqrt(const float3 &a)
+ccl_device_inline float3 sqrt(const float3 a)
{
# ifdef __KERNEL_SSE__
return float3(_mm_sqrt_ps(a));
@@ -351,7 +316,7 @@ ccl_device_inline float3 sqrt(const float3 &a)
# endif
}
-ccl_device_inline float3 floor(const float3 &a)
+ccl_device_inline float3 floor(const float3 a)
{
# ifdef __KERNEL_SSE__
return float3(_mm_floor_ps(a));
@@ -360,7 +325,7 @@ ccl_device_inline float3 floor(const float3 &a)
# endif
}
-ccl_device_inline float3 ceil(const float3 &a)
+ccl_device_inline float3 ceil(const float3 a)
{
# ifdef __KERNEL_SSE__
return float3(_mm_ceil_ps(a));
@@ -369,12 +334,12 @@ ccl_device_inline float3 ceil(const float3 &a)
# endif
}
-ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
+ccl_device_inline float3 mix(const float3 a, const float3 b, float t)
{
return a + t * (b - a);
}
-ccl_device_inline float3 rcp(const float3 &a)
+ccl_device_inline float3 rcp(const float3 a)
{
# ifdef __KERNEL_SSE__
/* Don't use _mm_rcp_ps due to poor precision. */
@@ -399,33 +364,6 @@ ccl_device_inline float3 log(float3 v)
return make_float3(logf(v.x), logf(v.y), logf(v.z));
}
-#endif /* !__KERNEL_METAL__ */
-
-ccl_device_inline float reduce_min(float3 a)
-{
- return min(min(a.x, a.y), a.z);
-}
-
-ccl_device_inline float reduce_max(float3 a)
-{
- return max(max(a.x, a.y), a.z);
-}
-
-ccl_device_inline float len(const float3 a)
-{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
- return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
-#else
- return sqrtf(dot(a, a));
-#endif
-}
-
-ccl_device_inline float len_squared(const float3 a)
-{
- return dot(a, a);
-}
-
-#if !defined(__KERNEL_METAL__)
ccl_device_inline float3 reflect(const float3 incident, const float3 normal)
{
float3 unit_normal = normalize(normal);
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index c2721873037..301d2d789c0 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT4_H__
@@ -10,85 +11,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline float4 operator-(const float4 &a);
-ccl_device_inline float4 operator*(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator*(const float4 &a, float f);
-ccl_device_inline float4 operator*(float f, const float4 &a);
-ccl_device_inline float4 operator/(const float4 &a, float f);
-ccl_device_inline float4 operator/(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator+(const float4 &a, const float f);
-ccl_device_inline float4 operator+(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator-(const float4 &a, const float f);
-ccl_device_inline float4 operator-(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator+=(float4 &a, const float4 &b);
-ccl_device_inline float4 operator*=(float4 &a, const float4 &b);
-ccl_device_inline float4 operator*=(float4 &a, float f);
-ccl_device_inline float4 operator/=(float4 &a, float f);
-
-ccl_device_inline int4 operator<(const float4 &a, const float4 &b);
-ccl_device_inline int4 operator>=(const float4 &a, const float4 &b);
-ccl_device_inline int4 operator<=(const float4 &a, const float4 &b);
-ccl_device_inline bool operator==(const float4 &a, const float4 &b);
-
-ccl_device_inline float distance(const float4 &a, const float4 &b);
-ccl_device_inline float dot(const float4 &a, const float4 &b);
-ccl_device_inline float len_squared(const float4 &a);
-ccl_device_inline float4 rcp(const float4 &a);
-ccl_device_inline float4 sqrt(const float4 &a);
-ccl_device_inline float4 sqr(const float4 &a);
-ccl_device_inline float4 cross(const float4 &a, const float4 &b);
-ccl_device_inline bool is_zero(const float4 &a);
-ccl_device_inline float average(const float4 &a);
-ccl_device_inline float len(const float4 &a);
-ccl_device_inline float4 normalize(const float4 &a);
-ccl_device_inline float4 safe_normalize(const float4 &a);
-ccl_device_inline float4 min(const float4 &a, const float4 &b);
-ccl_device_inline float4 max(const float4 &a, const float4 &b);
-ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx);
-ccl_device_inline float4 fabs(const float4 &a);
-ccl_device_inline float4 floor(const float4 &a);
-ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
-#endif /* !__KERNEL_METAL__*/
-
-ccl_device_inline float4 safe_divide(const float4 a, const float4 b);
-ccl_device_inline float4 safe_divide(const float4 a, const float b);
-
-#ifdef __KERNEL_SSE__
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &b);
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &a, const float4 &b);
-
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b);
-
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &a, const float4 &b);
-template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 &a, const float4 &b);
-
-# ifdef __KERNEL_SSE3__
-template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b);
-template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b);
-# endif
-#endif /* __KERNEL_SSE__ */
-
-ccl_device_inline float reduce_min(const float4 a);
-ccl_device_inline float reduce_max(const float4 a);
-ccl_device_inline float reduce_add(const float4 a);
-
-ccl_device_inline bool isequal(const float4 a, const float4 b);
-
-#ifndef __KERNEL_GPU__
-ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b);
-#endif /* !__KERNEL_GPU__ */
-
-/*******************************************************************************
- * Definition.
- */
-
ccl_device_inline float4 zero_float4()
{
#ifdef __KERNEL_SSE__
@@ -103,6 +25,16 @@ ccl_device_inline float4 one_float4()
return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
}
+ccl_device_inline int4 cast(const float4 a)
+{
+#ifdef __KERNEL_SSE__
+ return int4(_mm_castps_si128(a));
+#else
+ return make_int4(
+ __float_as_int(a.x), __float_as_int(a.y), __float_as_int(a.z), __float_as_int(a.w));
+#endif
+}
+
#if !defined(__KERNEL_METAL__)
ccl_device_inline float4 operator-(const float4 &a)
{
@@ -114,7 +46,7 @@ ccl_device_inline float4 operator-(const float4 &a)
# endif
}
-ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator*(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_mul_ps(a.m128, b.m128));
@@ -123,7 +55,7 @@ ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator*(const float4 &a, float f)
+ccl_device_inline float4 operator*(const float4 a, float f)
{
# if defined(__KERNEL_SSE__)
return a * make_float4(f);
@@ -132,17 +64,17 @@ ccl_device_inline float4 operator*(const float4 &a, float f)
# endif
}
-ccl_device_inline float4 operator*(float f, const float4 &a)
+ccl_device_inline float4 operator*(float f, const float4 a)
{
return a * f;
}
-ccl_device_inline float4 operator/(const float4 &a, float f)
+ccl_device_inline float4 operator/(const float4 a, float f)
{
return a * (1.0f / f);
}
-ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator/(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_div_ps(a.m128, b.m128));
@@ -151,12 +83,7 @@ ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator+(const float4 &a, const float f)
-{
- return a + make_float4(f, f, f, f);
-}
-
-ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator+(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_add_ps(a.m128, b.m128));
@@ -165,12 +92,12 @@ ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator-(const float4 &a, const float f)
+ccl_device_inline float4 operator+(const float4 a, const float f)
{
- return a - make_float4(f, f, f, f);
+ return a + make_float4(f);
}
-ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator-(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_sub_ps(a.m128, b.m128));
@@ -179,17 +106,22 @@ ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
+ccl_device_inline float4 operator-(const float4 a, const float f)
+{
+ return a - make_float4(f);
+}
+
+ccl_device_inline float4 operator+=(float4 &a, const float4 b)
{
return a = a + b;
}
-ccl_device_inline float4 operator-=(float4 &a, const float4 &b)
+ccl_device_inline float4 operator-=(float4 &a, const float4 b)
{
return a = a - b;
}
-ccl_device_inline float4 operator*=(float4 &a, const float4 &b)
+ccl_device_inline float4 operator*=(float4 &a, const float4 b)
{
return a = a * b;
}
@@ -204,7 +136,7 @@ ccl_device_inline float4 operator/=(float4 &a, float f)
return a = a / f;
}
-ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator<(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
@@ -213,7 +145,7 @@ ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator>=(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
@@ -222,7 +154,7 @@ ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator<=(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
@@ -231,7 +163,7 @@ ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline bool operator==(const float4 &a, const float4 &b)
+ccl_device_inline bool operator==(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
@@ -240,160 +172,148 @@ ccl_device_inline bool operator==(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float distance(const float4 &a, const float4 &b)
-{
- return len(a - b);
-}
-
-ccl_device_inline float dot(const float4 &a, const float4 &b)
+ccl_device_inline const float4 operator^(const float4 a, const float4 b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
- __m128 t = vmulq_f32(a, b);
- return vaddvq_f32(t);
-# else
- return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
-# endif
+# ifdef __KERNEL_SSE__
+ return float4(_mm_xor_ps(a.m128, b.m128));
# else
- return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
+ return make_float4(__uint_as_float(__float_as_uint(a.x) ^ __float_as_uint(b.x)),
+ __uint_as_float(__float_as_uint(a.y) ^ __float_as_uint(b.y)),
+ __uint_as_float(__float_as_uint(a.z) ^ __float_as_uint(b.z)),
+ __uint_as_float(__float_as_uint(a.w) ^ __float_as_uint(b.w)));
# endif
}
-ccl_device_inline float len_squared(const float4 &a)
-{
- return dot(a, a);
-}
-
-ccl_device_inline float4 rcp(const float4 &a)
+ccl_device_inline float4 min(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
- /* Don't use _mm_rcp_ps due to poor precision. */
- return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
+ return float4(_mm_min_ps(a.m128, b.m128));
# else
- return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
+ return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
# endif
}
-ccl_device_inline float4 sqrt(const float4 &a)
+ccl_device_inline float4 max(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
- return float4(_mm_sqrt_ps(a.m128));
+ return float4(_mm_max_ps(a.m128, b.m128));
# else
- return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
+ return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
# endif
}
-ccl_device_inline float4 sqr(const float4 &a)
+ccl_device_inline float4 clamp(const float4 a, const float4 mn, const float4 mx)
{
- return a * a;
+ return min(max(a, mn), mx);
}
+#endif /* !__KERNEL_METAL__*/
-ccl_device_inline float4 cross(const float4 &a, const float4 &b)
+ccl_device_inline const float4 madd(const float4 a, const float4 b, const float4 c)
{
-# ifdef __KERNEL_SSE__
- return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
- (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
+#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_NEON__
+ return float4(vfmaq_f32(c, a, b));
+# elif defined(__KERNEL_AVX2__)
+ return float4(_mm_fmadd_ps(a, b, c));
# else
- return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
+ return a * b + c;
# endif
+#else
+ return a * b + c;
+#endif
}
-ccl_device_inline bool is_zero(const float4 &a)
+ccl_device_inline float4 msub(const float4 a, const float4 b, const float4 c)
{
-# ifdef __KERNEL_SSE__
- return a == zero_float4();
+#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_NEON__
+ return float4(vfmaq_f32(vnegq_f32(c), a, b));
+# elif defined(__KERNEL_AVX2__)
+ return float4(_mm_fmsub_ps(a, b, c));
# else
- return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+ return a * b - c;
# endif
+#else
+ return a * b - c;
+#endif
}
-ccl_device_inline float average(const float4 &a)
+#ifdef __KERNEL_SSE__
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const float4 shuffle(const float4 b)
{
- return reduce_add(a) * 0.25f;
+# ifdef __KERNEL_NEON__
+ return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128));
+# else
+ return float4(
+ _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))));
+# endif
}
-ccl_device_inline float len(const float4 &a)
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a)
{
- return sqrtf(dot(a, a));
+ return float4(_mm_movelh_ps(a, a));
}
-ccl_device_inline float4 normalize(const float4 &a)
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a)
{
- return a / len(a);
+ return float4(_mm_movehl_ps(a, a));
}
-ccl_device_inline float4 safe_normalize(const float4 &a)
+# ifdef __KERNEL_SSE3__
+template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 b)
{
- float t = len(a);
- return (t != 0.0f) ? a / t : a;
+ return float4(_mm_moveldup_ps(b));
}
-ccl_device_inline float4 min(const float4 &a, const float4 &b)
+template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 b)
{
-# ifdef __KERNEL_SSE__
- return float4(_mm_min_ps(a.m128, b.m128));
-# else
- return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-# endif
+ return float4(_mm_movehdup_ps(b));
}
+# endif /* __KERNEL_SSE3__ */
-ccl_device_inline float4 max(const float4 &a, const float4 &b)
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const float4 shuffle(const float4 a, const float4 b)
{
-# ifdef __KERNEL_SSE__
- return float4(_mm_max_ps(a.m128, b.m128));
+# ifdef __KERNEL_NEON__
+ return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b));
# else
- return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+ return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
-ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
+template<size_t i0> __forceinline const float4 shuffle(const float4 b)
{
- return min(max(a, mn), mx);
+ return shuffle<i0, i0, i0, i0>(b);
}
-
-ccl_device_inline float4 fabs(const float4 &a)
+template<size_t i0> __forceinline const float4 shuffle(const float4 a, const float4 b)
{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
- return float4(vabsq_f32(a));
-# else
- return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
-# endif
-# else
- return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
-# endif
-}
-
-ccl_device_inline float4 floor(const float4 &a)
-{
-# ifdef __KERNEL_SSE__
- return float4(_mm_floor_ps(a));
+# ifdef __KERNEL_NEON__
+ return float4(shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b));
# else
- return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+ return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0)));
# endif
}
-ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a, const float4 b)
{
- return a + t * (b - a);
+ return float4(_mm_movelh_ps(a, b));
}
-ccl_device_inline float4 saturate(const float4 &a)
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a, const float4 b)
{
- return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
+ return float4(_mm_movehl_ps(b, a));
}
-ccl_device_inline float4 exp(float4 v)
+template<size_t i> __forceinline float extract(const float4 a)
{
- return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z));
+ return _mm_cvtss_f32(shuffle<i, i, i, i>(a));
}
-
-ccl_device_inline float4 log(float4 v)
+template<> __forceinline float extract<0>(const float4 a)
{
- return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z));
+ return _mm_cvtss_f32(a);
}
-
-#endif /* !__KERNEL_METAL__*/
+#endif
ccl_device_inline float reduce_add(const float4 a)
{
@@ -440,77 +360,192 @@ ccl_device_inline float reduce_max(const float4 a)
#endif
}
-ccl_device_inline bool isequal(const float4 a, const float4 b)
+#if !defined(__KERNEL_METAL__)
+ccl_device_inline float dot(const float4 a, const float4 b)
{
-#if defined(__KERNEL_METAL__)
- return all(a == b);
-#else
- return a == b;
-#endif
+# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ __m128 t = vmulq_f32(a, b);
+ return vaddvq_f32(t);
+# else
+ return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
+# endif
+# else
+ return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
+# endif
}
+#endif /* !defined(__KERNEL_METAL__) */
-#ifdef __KERNEL_SSE__
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &b)
+ccl_device_inline float len(const float4 a)
{
-# if defined(__KERNEL_NEON__)
- return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128));
+ return sqrtf(dot(a, a));
+}
+
+ccl_device_inline float len_squared(const float4 a)
+{
+ return dot(a, a);
+}
+
+#if !defined(__KERNEL_METAL__)
+ccl_device_inline float distance(const float4 a, const float4 b)
+{
+ return len(a - b);
+}
+
+ccl_device_inline float4 rcp(const float4 a)
+{
+# ifdef __KERNEL_SSE__
+ /* Don't use _mm_rcp_ps due to poor precision. */
+ return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
# else
- return float4(_mm_castsi128_ps(
- _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
+ return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
# endif
}
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &a, const float4 &b)
+ccl_device_inline float4 sqrt(const float4 a)
{
-# if defined(__KERNEL_NEON__)
- return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(a.m128, b.m128));
+# ifdef __KERNEL_SSE__
+ return float4(_mm_sqrt_ps(a.m128));
# else
- return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+ return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
# endif
}
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b)
+ccl_device_inline float4 sqr(const float4 a)
{
- return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))));
+ return a * a;
}
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &a, const float4 &b)
+ccl_device_inline float4 cross(const float4 a, const float4 b)
{
- return float4(_mm_movelh_ps(a.m128, b.m128));
+# ifdef __KERNEL_SSE__
+ return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
+ (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
+# else
+ return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
+# endif
}
-template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 &a, const float4 &b)
+ccl_device_inline bool is_zero(const float4 a)
{
- return float4(_mm_movehl_ps(b.m128, a.m128));
+# ifdef __KERNEL_SSE__
+ return a == zero_float4();
+# else
+ return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+# endif
}
-# ifdef __KERNEL_SSE3__
-template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b)
+ccl_device_inline float average(const float4 a)
{
- return float4(_mm_moveldup_ps(b));
+ return reduce_add(a) * 0.25f;
}
-template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b)
+ccl_device_inline float4 normalize(const float4 a)
{
- return float4(_mm_movehdup_ps(b));
+ return a / len(a);
+}
+
+ccl_device_inline float4 safe_normalize(const float4 a)
+{
+ float t = len(a);
+ return (t != 0.0f) ? a / t : a;
+}
+
+ccl_device_inline float4 fabs(const float4 a)
+{
+# if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return float4(vabsq_f32(a));
+# else
+ return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
+# endif
+# else
+ return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+# endif
+}
+
+ccl_device_inline float4 floor(const float4 a)
+{
+# ifdef __KERNEL_SSE__
+# if defined(__KERNEL_NEON__)
+ return float4(vrndmq_f32(a));
+# else
+ return float4(_mm_floor_ps(a));
+# endif
+# else
+ return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+# endif
+}
+
+ccl_device_inline float4 floorfrac(const float4 x, ccl_private int4 *i)
+{
+# ifdef __KERNEL_SSE__
+ const float4 f = floor(x);
+ *i = int4(_mm_cvttps_epi32(f.m128));
+ return x - f;
+# else
+ float4 r;
+ r.x = floorfrac(x.x, &i->x);
+ r.y = floorfrac(x.y, &i->y);
+ r.z = floorfrac(x.z, &i->z);
+ r.w = floorfrac(x.w, &i->w);
+ return r;
+# endif
+}
+
+ccl_device_inline float4 mix(const float4 a, const float4 b, float t)
+{
+ return a + t * (b - a);
+}
+
+ccl_device_inline float4 mix(const float4 a, const float4 b, const float4 t)
+{
+ return a + t * (b - a);
+}
+
+ccl_device_inline float4 saturate(const float4 a)
+{
+ return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
+}
+
+ccl_device_inline float4 exp(float4 v)
+{
+ return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z));
+}
+
+ccl_device_inline float4 log(float4 v)
+{
+ return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z));
+}
+
+#endif /* !__KERNEL_METAL__*/
+
+ccl_device_inline bool isequal(const float4 a, const float4 b)
+{
+#if defined(__KERNEL_METAL__)
+ return all(a == b);
+#else
+ return a == b;
+#endif
}
-# endif /* __KERNEL_SSE3__ */
-#endif /* __KERNEL_SSE__ */
#ifndef __KERNEL_GPU__
-ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b)
+ccl_device_inline float4 select(const int4 mask, const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE41__
return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
+# else
+ return float4(
+ _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), a), _mm_andnot_ps(_mm_castsi128_ps(mask), b)));
+# endif
# else
return make_float4(
(mask.x) ? a.x : b.x, (mask.y) ? a.y : b.y, (mask.z) ? a.z : b.z, (mask.w) ? a.w : b.w);
# endif
}
-ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
+ccl_device_inline float4 mask(const int4 mask, const float4 a)
{
/* Replace elements of x with zero where mask isn't set. */
return select(mask, a, zero_float4());
diff --git a/intern/cycles/util/math_float8.h b/intern/cycles/util/math_float8.h
index b538cfbe70b..755a720a10b 100644
--- a/intern/cycles/util/math_float8.h
+++ b/intern/cycles/util/math_float8.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
* Copyright 2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT8_H__
@@ -10,193 +11,138 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-ccl_device_inline float8_t operator+(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator+(const float8_t a, const float f);
-ccl_device_inline float8_t operator+(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator-(const float8_t a);
-ccl_device_inline float8_t operator-(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator-(const float8_t a, const float f);
-ccl_device_inline float8_t operator-(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator*(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator*(const float8_t a, const float f);
-ccl_device_inline float8_t operator*(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator/(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator/(const float8_t a, float f);
-ccl_device_inline float8_t operator/(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator+=(float8_t a, const float8_t b);
-
-ccl_device_inline float8_t operator*=(float8_t a, const float8_t b);
-ccl_device_inline float8_t operator*=(float8_t a, float f);
-
-ccl_device_inline float8_t operator/=(float8_t a, float f);
-
-ccl_device_inline bool operator==(const float8_t a, const float8_t b);
-
-ccl_device_inline float8_t rcp(const float8_t a);
-ccl_device_inline float8_t sqrt(const float8_t a);
-ccl_device_inline float8_t sqr(const float8_t a);
-ccl_device_inline bool is_zero(const float8_t a);
-ccl_device_inline float average(const float8_t a);
-ccl_device_inline float8_t min(const float8_t a, const float8_t b);
-ccl_device_inline float8_t max(const float8_t a, const float8_t b);
-ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx);
-ccl_device_inline float8_t fabs(const float8_t a);
-ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t);
-ccl_device_inline float8_t saturate(const float8_t a);
-
-ccl_device_inline float8_t safe_divide(const float8_t a, const float b);
-ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b);
-
-ccl_device_inline float reduce_min(const float8_t a);
-ccl_device_inline float reduce_max(const float8_t a);
-ccl_device_inline float reduce_add(const float8_t a);
-
-ccl_device_inline bool isequal(const float8_t a, const float8_t b);
-
-/*******************************************************************************
- * Definition.
- */
-
-ccl_device_inline float8_t zero_float8_t()
+ccl_device_inline vfloat8 zero_vfloat8()
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_setzero_ps());
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_setzero_ps());
#else
- return make_float8_t(0.0f);
+ return make_vfloat8(0.0f);
#endif
}
-ccl_device_inline float8_t one_float8_t()
+ccl_device_inline vfloat8 one_vfloat8()
{
- return make_float8_t(1.0f);
+ return make_vfloat8(1.0f);
}
-ccl_device_inline float8_t operator+(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator+(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_add_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_add_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
#endif
}
-ccl_device_inline float8_t operator+(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator+(const vfloat8 a, const float f)
{
- return a + make_float8_t(f);
+ return a + make_vfloat8(f);
}
-ccl_device_inline float8_t operator+(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator+(const float f, const vfloat8 a)
{
- return make_float8_t(f) + a;
+ return make_vfloat8(f) + a;
}
-ccl_device_inline float8_t operator-(const float8_t a)
+ccl_device_inline vfloat8 operator-(const vfloat8 a)
{
-#ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX__
__m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
- return float8_t(_mm256_xor_ps(a.m256, mask));
+ return vfloat8(_mm256_xor_ps(a.m256, mask));
#else
- return make_float8_t(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
+ return make_vfloat8(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
#endif
}
-ccl_device_inline float8_t operator-(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator-(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_sub_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_sub_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
#endif
}
-ccl_device_inline float8_t operator-(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator-(const vfloat8 a, const float f)
{
- return a - make_float8_t(f);
+ return a - make_vfloat8(f);
}
-ccl_device_inline float8_t operator-(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator-(const float f, const vfloat8 a)
{
- return make_float8_t(f) - a;
+ return make_vfloat8(f) - a;
}
-ccl_device_inline float8_t operator*(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator*(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_mul_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_mul_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a * b.a, a.b * b.b, a.c * b.c, a.d * b.d, a.e * b.e, a.f * b.f, a.g * b.g, a.h * b.h);
#endif
}
-ccl_device_inline float8_t operator*(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator*(const vfloat8 a, const float f)
{
- return a * make_float8_t(f);
+ return a * make_vfloat8(f);
}
-ccl_device_inline float8_t operator*(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator*(const float f, const vfloat8 a)
{
- return make_float8_t(f) * a;
+ return make_vfloat8(f) * a;
}
-ccl_device_inline float8_t operator/(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator/(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_div_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_div_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a / b.a, a.b / b.b, a.c / b.c, a.d / b.d, a.e / b.e, a.f / b.f, a.g / b.g, a.h / b.h);
#endif
}
-ccl_device_inline float8_t operator/(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator/(const vfloat8 a, const float f)
{
- return a / make_float8_t(f);
+ return a / make_vfloat8(f);
}
-ccl_device_inline float8_t operator/(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator/(const float f, const vfloat8 a)
{
- return make_float8_t(f) / a;
+ return make_vfloat8(f) / a;
}
-ccl_device_inline float8_t operator+=(float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator+=(vfloat8 a, const vfloat8 b)
{
return a = a + b;
}
-ccl_device_inline float8_t operator-=(float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator-=(vfloat8 a, const vfloat8 b)
{
return a = a - b;
}
-ccl_device_inline float8_t operator*=(float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator*=(vfloat8 a, const vfloat8 b)
{
return a = a * b;
}
-ccl_device_inline float8_t operator*=(float8_t a, float f)
+ccl_device_inline vfloat8 operator*=(vfloat8 a, float f)
{
return a = a * f;
}
-ccl_device_inline float8_t operator/=(float8_t a, float f)
+ccl_device_inline vfloat8 operator/=(vfloat8 a, float f)
{
return a = a / f;
}
-ccl_device_inline bool operator==(const float8_t a, const float8_t b)
+ccl_device_inline bool operator==(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX__
return (_mm256_movemask_ps(_mm256_castsi256_ps(
_mm256_cmpeq_epi32(_mm256_castps_si256(a.m256), _mm256_castps_si256(b.m256)))) &
0b11111111) == 0b11111111;
@@ -206,132 +152,180 @@ ccl_device_inline bool operator==(const float8_t a, const float8_t b)
#endif
}
-ccl_device_inline float8_t rcp(const float8_t a)
+ccl_device_inline const vfloat8 operator^(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_rcp_ps(a.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_xor_ps(a.m256, b.m256));
#else
- return make_float8_t(1.0f / a.a,
- 1.0f / a.b,
- 1.0f / a.c,
- 1.0f / a.d,
- 1.0f / a.e,
- 1.0f / a.f,
- 1.0f / a.g,
- 1.0f / a.h);
+ return make_vfloat8(__uint_as_float(__float_as_uint(a.a) ^ __float_as_uint(b.a)),
+ __uint_as_float(__float_as_uint(a.b) ^ __float_as_uint(b.b)),
+ __uint_as_float(__float_as_uint(a.c) ^ __float_as_uint(b.c)),
+ __uint_as_float(__float_as_uint(a.d) ^ __float_as_uint(b.d)),
+ __uint_as_float(__float_as_uint(a.e) ^ __float_as_uint(b.e)),
+ __uint_as_float(__float_as_uint(a.f) ^ __float_as_uint(b.f)),
+ __uint_as_float(__float_as_uint(a.g) ^ __float_as_uint(b.g)),
+ __uint_as_float(__float_as_uint(a.h) ^ __float_as_uint(b.h)));
#endif
}
-ccl_device_inline float8_t sqrt(const float8_t a)
+ccl_device_inline vfloat8 rcp(const vfloat8 a)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_sqrt_ps(a.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_rcp_ps(a.m256));
#else
- return make_float8_t(sqrtf(a.a),
- sqrtf(a.b),
- sqrtf(a.c),
- sqrtf(a.d),
- sqrtf(a.e),
- sqrtf(a.f),
- sqrtf(a.g),
- sqrtf(a.h));
+ return make_vfloat8(1.0f / a.a,
+ 1.0f / a.b,
+ 1.0f / a.c,
+ 1.0f / a.d,
+ 1.0f / a.e,
+ 1.0f / a.f,
+ 1.0f / a.g,
+ 1.0f / a.h);
#endif
}
-ccl_device_inline float8_t sqr(const float8_t a)
+ccl_device_inline vfloat8 sqrt(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_sqrt_ps(a.m256));
+#else
+ return make_vfloat8(sqrtf(a.a),
+ sqrtf(a.b),
+ sqrtf(a.c),
+ sqrtf(a.d),
+ sqrtf(a.e),
+ sqrtf(a.f),
+ sqrtf(a.g),
+ sqrtf(a.h));
+#endif
+}
+
+ccl_device_inline vfloat8 sqr(const vfloat8 a)
{
return a * a;
}
-ccl_device_inline bool is_zero(const float8_t a)
+ccl_device_inline bool is_zero(const vfloat8 a)
{
- return a == make_float8_t(0.0f);
+ return a == make_vfloat8(0.0f);
}
-ccl_device_inline float average(const float8_t a)
+ccl_device_inline float reduce_add(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ vfloat8 b(_mm256_hadd_ps(a.m256, a.m256));
+ vfloat8 h(_mm256_hadd_ps(b.m256, b.m256));
+ return h[0] + h[4];
+#else
+ return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
+#endif
+}
+
+ccl_device_inline float average(const vfloat8 a)
{
return reduce_add(a) / 8.0f;
}
-ccl_device_inline float8_t min(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 min(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_min_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_min_ps(a.m256, b.m256));
#else
- return make_float8_t(min(a.a, b.a),
- min(a.b, b.b),
- min(a.c, b.c),
- min(a.d, b.d),
- min(a.e, b.e),
- min(a.f, b.f),
- min(a.g, b.g),
- min(a.h, b.h));
+ return make_vfloat8(min(a.a, b.a),
+ min(a.b, b.b),
+ min(a.c, b.c),
+ min(a.d, b.d),
+ min(a.e, b.e),
+ min(a.f, b.f),
+ min(a.g, b.g),
+ min(a.h, b.h));
#endif
}
-ccl_device_inline float8_t max(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 max(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_max_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_max_ps(a.m256, b.m256));
#else
- return make_float8_t(max(a.a, b.a),
- max(a.b, b.b),
- max(a.c, b.c),
- max(a.d, b.d),
- max(a.e, b.e),
- max(a.f, b.f),
- max(a.g, b.g),
- max(a.h, b.h));
+ return make_vfloat8(max(a.a, b.a),
+ max(a.b, b.b),
+ max(a.c, b.c),
+ max(a.d, b.d),
+ max(a.e, b.e),
+ max(a.f, b.f),
+ max(a.g, b.g),
+ max(a.h, b.h));
#endif
}
-ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx)
+ccl_device_inline vfloat8 clamp(const vfloat8 a, const vfloat8 mn, const vfloat8 mx)
{
return min(max(a, mn), mx);
}
-ccl_device_inline float8_t fabs(const float8_t a)
+ccl_device_inline vfloat8 select(const vint8 mask, const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_blendv_ps(b, a, _mm256_castsi256_ps(mask)));
#else
- return make_float8_t(fabsf(a.a),
- fabsf(a.b),
- fabsf(a.c),
- fabsf(a.d),
- fabsf(a.e),
- fabsf(a.f),
- fabsf(a.g),
- fabsf(a.h));
+ return make_vfloat8((mask.a) ? a.a : b.a,
+ (mask.b) ? a.b : b.b,
+ (mask.c) ? a.c : b.c,
+ (mask.d) ? a.d : b.d,
+ (mask.e) ? a.e : b.e,
+ (mask.f) ? a.f : b.f,
+ (mask.g) ? a.g : b.g,
+ (mask.h) ? a.h : b.h);
#endif
}
-ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t)
+ccl_device_inline vfloat8 fabs(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+#else
+ return make_vfloat8(fabsf(a.a),
+ fabsf(a.b),
+ fabsf(a.c),
+ fabsf(a.d),
+ fabsf(a.e),
+ fabsf(a.f),
+ fabsf(a.g),
+ fabsf(a.h));
+#endif
+}
+
+ccl_device_inline vfloat8 mix(const vfloat8 a, const vfloat8 b, float t)
+{
+ return a + t * (b - a);
+}
+
+ccl_device_inline vfloat8 mix(const vfloat8 a, const vfloat8 b, vfloat8 t)
{
return a + t * (b - a);
}
-ccl_device_inline float8_t saturate(const float8_t a)
+ccl_device_inline vfloat8 saturate(const vfloat8 a)
{
- return clamp(a, make_float8_t(0.0f), make_float8_t(1.0f));
+ return clamp(a, make_vfloat8(0.0f), make_vfloat8(1.0f));
}
-ccl_device_inline float8_t exp(float8_t v)
+ccl_device_inline vfloat8 exp(vfloat8 v)
{
- return make_float8_t(
+ return make_vfloat8(
expf(v.a), expf(v.b), expf(v.c), expf(v.d), expf(v.e), expf(v.f), expf(v.g), expf(v.h));
}
-ccl_device_inline float8_t log(float8_t v)
+ccl_device_inline vfloat8 log(vfloat8 v)
{
- return make_float8_t(
+ return make_vfloat8(
logf(v.a), logf(v.b), logf(v.c), logf(v.d), logf(v.e), logf(v.f), logf(v.g), logf(v.h));
}
-ccl_device_inline float dot(const float8_t a, const float8_t b)
+ccl_device_inline float dot(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- float8_t t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
+#ifdef __KERNEL_AVX__
+ vfloat8 t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
return t[0] + t[4];
#else
return (a.a * b.a) + (a.b * b.b) + (a.c * b.c) + (a.d * b.d) + (a.e * b.e) + (a.f * b.f) +
@@ -339,62 +333,51 @@ ccl_device_inline float dot(const float8_t a, const float8_t b)
#endif
}
-ccl_device_inline float8_t pow(float8_t v, float e)
+ccl_device_inline vfloat8 pow(vfloat8 v, float e)
{
- return make_float8_t(powf(v.a, e),
- powf(v.b, e),
- powf(v.c, e),
- powf(v.d, e),
- powf(v.e, e),
- powf(v.f, e),
- powf(v.g, e),
- powf(v.h, e));
+ return make_vfloat8(powf(v.a, e),
+ powf(v.b, e),
+ powf(v.c, e),
+ powf(v.d, e),
+ powf(v.e, e),
+ powf(v.f, e),
+ powf(v.g, e),
+ powf(v.h, e));
}
-ccl_device_inline float reduce_min(const float8_t a)
+ccl_device_inline float reduce_min(const vfloat8 a)
{
return min(min(min(a.a, a.b), min(a.c, a.d)), min(min(a.e, a.f), min(a.g, a.h)));
}
-ccl_device_inline float reduce_max(const float8_t a)
+ccl_device_inline float reduce_max(const vfloat8 a)
{
return max(max(max(a.a, a.b), max(a.c, a.d)), max(max(a.e, a.f), max(a.g, a.h)));
}
-ccl_device_inline float reduce_add(const float8_t a)
-{
-#ifdef __KERNEL_AVX2__
- float8_t b(_mm256_hadd_ps(a.m256, a.m256));
- float8_t h(_mm256_hadd_ps(b.m256, b.m256));
- return h[0] + h[4];
-#else
- return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
-#endif
-}
-
-ccl_device_inline bool isequal(const float8_t a, const float8_t b)
+ccl_device_inline bool isequal(const vfloat8 a, const vfloat8 b)
{
return a == b;
}
-ccl_device_inline float8_t safe_divide(const float8_t a, const float b)
+ccl_device_inline vfloat8 safe_divide(const vfloat8 a, const float b)
{
- return (b != 0.0f) ? a / b : make_float8_t(0.0f);
+ return (b != 0.0f) ? a / b : make_vfloat8(0.0f);
}
-ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 safe_divide(const vfloat8 a, const vfloat8 b)
{
- return make_float8_t((b.a != 0.0f) ? a.a / b.a : 0.0f,
- (b.b != 0.0f) ? a.b / b.b : 0.0f,
- (b.c != 0.0f) ? a.c / b.c : 0.0f,
- (b.d != 0.0f) ? a.d / b.d : 0.0f,
- (b.e != 0.0f) ? a.e / b.e : 0.0f,
- (b.f != 0.0f) ? a.f / b.f : 0.0f,
- (b.g != 0.0f) ? a.g / b.g : 0.0f,
- (b.h != 0.0f) ? a.h / b.h : 0.0f);
+ return make_vfloat8((b.a != 0.0f) ? a.a / b.a : 0.0f,
+ (b.b != 0.0f) ? a.b / b.b : 0.0f,
+ (b.c != 0.0f) ? a.c / b.c : 0.0f,
+ (b.d != 0.0f) ? a.d / b.d : 0.0f,
+ (b.e != 0.0f) ? a.e / b.e : 0.0f,
+ (b.f != 0.0f) ? a.f / b.f : 0.0f,
+ (b.g != 0.0f) ? a.g / b.g : 0.0f,
+ (b.h != 0.0f) ? a.h / b.h : 0.0f);
}
-ccl_device_inline float8_t ensure_finite(float8_t v)
+ccl_device_inline vfloat8 ensure_finite(vfloat8 v)
{
v.a = ensure_finite(v.a);
v.b = ensure_finite(v.b);
@@ -408,12 +391,92 @@ ccl_device_inline float8_t ensure_finite(float8_t v)
return v;
}
-ccl_device_inline bool isfinite_safe(float8_t v)
+ccl_device_inline bool isfinite_safe(vfloat8 v)
{
return isfinite_safe(v.a) && isfinite_safe(v.b) && isfinite_safe(v.c) && isfinite_safe(v.d) &&
isfinite_safe(v.e) && isfinite_safe(v.f) && isfinite_safe(v.g) && isfinite_safe(v.h);
}
+ccl_device_inline vint8 cast(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_castps_si256(a));
+#else
+ return make_vint8(__float_as_int(a.a),
+ __float_as_int(a.b),
+ __float_as_int(a.c),
+ __float_as_int(a.d),
+ __float_as_int(a.e),
+ __float_as_int(a.f),
+ __float_as_int(a.g),
+ __float_as_int(a.h));
+#endif
+}
+
+#ifdef __KERNEL_SSE__
+ccl_device_forceinline float4 low(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ return float4(_mm256_extractf128_ps(a.m256, 0));
+# else
+ return make_float4(a.e, a.f, a.g, a.h);
+# endif
+}
+ccl_device_forceinline float4 high(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ return float4(_mm256_extractf128_ps(a.m256, 1));
+# else
+ return make_float4(a.a, a.b, a.c, a.d);
+# endif
+}
+
+template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0)));
+# else
+ return make_vfloat8(a[i0], a[i1], a[i2], a[i3], a[i4 + 4], a[i5 + 4], a[i6 + 4], a[i7 + 4]);
+# endif
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a, const vfloat8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)));
+# else
+ return make_vfloat8(shuffle<i0, i1, i2, i3>(high(a), high(b)),
+ shuffle<i0, i1, i2, i3>(low(a), low(b)));
+# endif
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
+{
+ return shuffle<i0, i1, i2, i3>(a, a);
+}
+template<size_t i0> ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a, const vfloat8 b)
+{
+ return shuffle<i0, i0, i0, i0>(a, b);
+}
+template<size_t i0> ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
+{
+ return shuffle<i0>(a, a);
+}
+
+template<size_t i> ccl_device_forceinline float extract(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ __m256 b = shuffle<i, i, i, i>(a).m256;
+ return _mm256_cvtss_f32(b);
+# else
+ return a[i];
+# endif
+}
+#endif
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_FLOAT8_H__ */
diff --git a/intern/cycles/util/math_int2.h b/intern/cycles/util/math_int2.h
index f4d8a71221a..2df2ec5505b 100644
--- a/intern/cycles/util/math_int2.h
+++ b/intern/cycles/util/math_int2.h
@@ -10,23 +10,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline bool operator==(const int2 a, const int2 b);
-ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
-ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
-ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
-ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
-ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
-#endif /* !__KERNEL_METAL__ */
-
-/*******************************************************************************
- * Definition.
- */
-
#if !defined(__KERNEL_METAL__)
ccl_device_inline bool operator==(const int2 a, const int2 b)
{
diff --git a/intern/cycles/util/math_int3.h b/intern/cycles/util/math_int3.h
index 48bffeaf553..b5b972ddfb5 100644
--- a/intern/cycles/util/math_int3.h
+++ b/intern/cycles/util/math_int3.h
@@ -10,21 +10,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline int3 min(int3 a, int3 b);
-ccl_device_inline int3 max(int3 a, int3 b);
-ccl_device_inline int3 clamp(const int3 &a, int mn, int mx);
-ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx);
-#endif /* !defined(__KERNEL_METAL__) */
-
-/*******************************************************************************
- * Definition.
- */
-
#if !defined(__KERNEL_METAL__)
ccl_device_inline int3 min(int3 a, int3 b)
{
@@ -44,7 +29,7 @@ ccl_device_inline int3 max(int3 a, int3 b)
# endif
}
-ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
+ccl_device_inline int3 clamp(const int3 a, int mn, int mx)
{
# ifdef __KERNEL_SSE__
return min(max(a, make_int3(mn)), make_int3(mx));
@@ -53,7 +38,7 @@ ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
# endif
}
-ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
+ccl_device_inline int3 clamp(const int3 a, int3 &mn, int mx)
{
# ifdef __KERNEL_SSE__
return min(max(a, mn), make_int3(mx));
@@ -62,22 +47,22 @@ ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
# endif
}
-ccl_device_inline bool operator==(const int3 &a, const int3 &b)
+ccl_device_inline bool operator==(const int3 a, const int3 b)
{
return a.x == b.x && a.y == b.y && a.z == b.z;
}
-ccl_device_inline bool operator!=(const int3 &a, const int3 &b)
+ccl_device_inline bool operator!=(const int3 a, const int3 b)
{
return !(a == b);
}
-ccl_device_inline bool operator<(const int3 &a, const int3 &b)
+ccl_device_inline bool operator<(const int3 a, const int3 b)
{
return a.x < b.x && a.y < b.y && a.z < b.z;
}
-ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
+ccl_device_inline int3 operator+(const int3 a, const int3 b)
{
# ifdef __KERNEL_SSE__
return int3(_mm_add_epi32(a.m128, b.m128));
@@ -86,7 +71,7 @@ ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
# endif
}
-ccl_device_inline int3 operator-(const int3 &a, const int3 &b)
+ccl_device_inline int3 operator-(const int3 a, const int3 b)
{
# ifdef __KERNEL_SSE__
return int3(_mm_sub_epi32(a.m128, b.m128));
diff --git a/intern/cycles/util/math_int4.h b/intern/cycles/util/math_int4.h
index fbdada223cb..c6d767d7587 100644
--- a/intern/cycles/util/math_int4.h
+++ b/intern/cycles/util/math_int4.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_MATH_INT4_H__
@@ -10,30 +11,8 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
#ifndef __KERNEL_GPU__
-ccl_device_inline int4 operator+(const int4 &a, const int4 &b);
-ccl_device_inline int4 operator+=(int4 &a, const int4 &b);
-ccl_device_inline int4 operator>>(const int4 &a, int i);
-ccl_device_inline int4 operator<<(const int4 &a, int i);
-ccl_device_inline int4 operator<(const int4 &a, const int4 &b);
-ccl_device_inline int4 operator>=(const int4 &a, const int4 &b);
-ccl_device_inline int4 operator&(const int4 &a, const int4 &b);
-ccl_device_inline int4 min(int4 a, int4 b);
-ccl_device_inline int4 max(int4 a, int4 b);
-ccl_device_inline int4 clamp(const int4 &a, const int4 &mn, const int4 &mx);
-ccl_device_inline int4 select(const int4 &mask, const int4 &a, const int4 &b);
-#endif /* __KERNEL_GPU__ */
-
-/*******************************************************************************
- * Definition.
- */
-
-#ifndef __KERNEL_GPU__
-ccl_device_inline int4 operator+(const int4 &a, const int4 &b)
+ccl_device_inline int4 operator+(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_add_epi32(a.m128, b.m128));
@@ -42,12 +21,26 @@ ccl_device_inline int4 operator+(const int4 &a, const int4 &b)
# endif
}
-ccl_device_inline int4 operator+=(int4 &a, const int4 &b)
+ccl_device_inline int4 operator+=(int4 &a, const int4 b)
{
return a = a + b;
}
-ccl_device_inline int4 operator>>(const int4 &a, int i)
+ccl_device_inline int4 operator-(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_SSE__
+ return int4(_mm_sub_epi32(a.m128, b.m128));
+# else
+ return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+# endif
+}
+
+ccl_device_inline int4 operator-=(int4 &a, const int4 b)
+{
+ return a = a - b;
+}
+
+ccl_device_inline int4 operator>>(const int4 a, int i)
{
# ifdef __KERNEL_SSE__
return int4(_mm_srai_epi32(a.m128, i));
@@ -56,7 +49,7 @@ ccl_device_inline int4 operator>>(const int4 &a, int i)
# endif
}
-ccl_device_inline int4 operator<<(const int4 &a, int i)
+ccl_device_inline int4 operator<<(const int4 a, int i)
{
# ifdef __KERNEL_SSE__
return int4(_mm_slli_epi32(a.m128, i));
@@ -65,7 +58,7 @@ ccl_device_inline int4 operator<<(const int4 &a, int i)
# endif
}
-ccl_device_inline int4 operator<(const int4 &a, const int4 &b)
+ccl_device_inline int4 operator<(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_cmplt_epi32(a.m128, b.m128));
@@ -74,7 +67,26 @@ ccl_device_inline int4 operator<(const int4 &a, const int4 &b)
# endif
}
-ccl_device_inline int4 operator>=(const int4 &a, const int4 &b)
+ccl_device_inline int4 operator<(const int4 a, const int b)
+{
+ return a < make_int4(b);
+}
+
+ccl_device_inline int4 operator==(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_SSE__
+ return int4(_mm_cmpeq_epi32(a.m128, b.m128));
+# else
+ return make_int4(a.x == b.x, a.y == b.y, a.z == b.z, a.w == b.w);
+# endif
+}
+
+ccl_device_inline int4 operator==(const int4 a, const int b)
+{
+ return a == make_int4(b);
+}
+
+ccl_device_inline int4 operator>=(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_xor_si128(_mm_set1_epi32(0xffffffff), _mm_cmplt_epi32(a.m128, b.m128)));
@@ -83,7 +95,12 @@ ccl_device_inline int4 operator>=(const int4 &a, const int4 &b)
# endif
}
-ccl_device_inline int4 operator&(const int4 &a, const int4 &b)
+ccl_device_inline int4 operator>=(const int4 a, const int b)
+{
+ return a >= make_int4(b);
+}
+
+ccl_device_inline int4 operator&(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_and_si128(a.m128, b.m128));
@@ -92,6 +109,97 @@ ccl_device_inline int4 operator&(const int4 &a, const int4 &b)
# endif
}
+ccl_device_inline int4 operator|(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_SSE__
+ return int4(_mm_or_si128(a.m128, b.m128));
+# else
+ return make_int4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w);
+# endif
+}
+
+ccl_device_inline int4 operator^(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_SSE__
+ return int4(_mm_xor_si128(a.m128, b.m128));
+# else
+ return make_int4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
+# endif
+}
+
+ccl_device_inline int4 operator&(const int32_t a, const int4 b)
+{
+ return make_int4(a) & b;
+}
+
+ccl_device_inline int4 operator&(const int4 a, const int32_t b)
+{
+ return a & make_int4(b);
+}
+
+ccl_device_inline int4 operator|(const int32_t a, const int4 b)
+{
+ return make_int4(a) | b;
+}
+
+ccl_device_inline int4 operator|(const int4 a, const int32_t b)
+{
+ return a | make_int4(b);
+}
+
+ccl_device_inline int4 operator^(const int32_t a, const int4 b)
+{
+ return make_int4(a) ^ b;
+}
+
+ccl_device_inline int4 operator^(const int4 a, const int32_t b)
+{
+ return a ^ make_int4(b);
+}
+
+ccl_device_inline int4 &operator&=(int4 &a, const int4 b)
+{
+ return a = a & b;
+}
+ccl_device_inline int4 &operator&=(int4 &a, const int32_t b)
+{
+ return a = a & b;
+}
+
+ccl_device_inline int4 &operator|=(int4 &a, const int4 b)
+{
+ return a = a | b;
+}
+ccl_device_inline int4 &operator|=(int4 &a, const int32_t b)
+{
+ return a = a | b;
+}
+
+ccl_device_inline int4 &operator^=(int4 &a, const int4 b)
+{
+ return a = a ^ b;
+}
+ccl_device_inline int4 &operator^=(int4 &a, const int32_t b)
+{
+ return a = a ^ b;
+}
+
+ccl_device_inline int4 &operator<<=(int4 &a, const int32_t b)
+{
+ return a = a << b;
+}
+ccl_device_inline int4 &operator>>=(int4 &a, const int32_t b)
+{
+ return a = a >> b;
+}
+
+# ifdef __KERNEL_SSE__
+ccl_device_forceinline const int4 srl(const int4 a, const int32_t b)
+{
+ return int4(_mm_srli_epi32(a.m128, b));
+}
+# endif
+
ccl_device_inline int4 min(int4 a, int4 b)
{
# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
@@ -110,12 +218,12 @@ ccl_device_inline int4 max(int4 a, int4 b)
# endif
}
-ccl_device_inline int4 clamp(const int4 &a, const int4 &mn, const int4 &mx)
+ccl_device_inline int4 clamp(const int4 a, const int4 mn, const int4 mx)
{
return min(max(a, mn), mx);
}
-ccl_device_inline int4 select(const int4 &mask, const int4 &a, const int4 &b)
+ccl_device_inline int4 select(const int4 mask, const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b)));
@@ -135,6 +243,52 @@ ccl_device_inline int4 load_int4(const int *v)
}
#endif /* __KERNEL_GPU__ */
+ccl_device_inline float4 cast(const int4 a)
+{
+#ifdef __KERNEL_SSE__
+ return float4(_mm_castsi128_ps(a));
+#else
+ return make_float4(
+ __int_as_float(a.x), __int_as_float(a.y), __int_as_float(a.z), __int_as_float(a.w));
+#endif
+}
+
+#ifdef __KERNEL_SSE__
+ccl_device_forceinline int4 andnot(const int4 a, const int4 b)
+{
+ return int4(_mm_andnot_si128(a.m128, b.m128));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline int4 shuffle(const int4 a)
+{
+# ifdef __KERNEL_NEON__
+ int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a));
+ return int4(vreinterpretq_m128i_s32(result));
+# else
+ return int4(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
+# endif
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline int4 shuffle(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_NEON__
+ int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a),
+ vreinterpretq_s32_m128i(b));
+ return int4(vreinterpretq_m128i_s32(result));
+# else
+ return int4(_mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))));
+# endif
+}
+
+template<size_t i0> ccl_device_forceinline int4 shuffle(const int4 b)
+{
+ return shuffle<i0, i0, i0, i0>(b);
+}
+#endif
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_INT4_H__ */
diff --git a/intern/cycles/util/math_int8.h b/intern/cycles/util/math_int8.h
new file mode 100644
index 00000000000..d150b0b74ec
--- /dev/null
+++ b/intern/cycles/util/math_int8.h
@@ -0,0 +1,355 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
+ * Copyright 2011-2022 Blender Foundation */
+
+#ifndef __UTIL_MATH_INT8_H__
+#define __UTIL_MATH_INT8_H__
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+ccl_device_inline vint8 operator+(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_add_epi32(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator+=(vint8 &a, const vint8 b)
+{
+ return a = a + b;
+}
+
+ccl_device_inline vint8 operator-(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_sub_epi32(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator-=(vint8 &a, const vint8 b)
+{
+ return a = a - b;
+}
+
+ccl_device_inline vint8 operator>>(const vint8 a, int i)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_srai_epi32(a.m256, i));
+# else
+ return make_vint8(
+ a.a >> i, a.b >> i, a.c >> i, a.d >> i, a.e >> i, a.f >> i, a.g >> i, a.h >> i);
+# endif
+}
+
+ccl_device_inline vint8 operator<<(const vint8 a, int i)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_slli_epi32(a.m256, i));
+# else
+ return make_vint8(
+ a.a << i, a.b << i, a.c << i, a.d << i, a.e << i, a.f << i, a.g << i, a.h << i);
+# endif
+}
+
+ccl_device_inline vint8 operator<(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_cmpgt_epi32(b.m256, a.m256));
+# else
+ return make_vint8(
+ a.a < b.a, a.b < b.b, a.c < b.c, a.d < b.d, a.e < b.e, a.f < b.f, a.g < b.g, a.h < b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator<(const vint8 a, const int b)
+{
+ return a < make_vint8(b);
+}
+
+ccl_device_inline vint8 operator==(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_cmpeq_epi32(a.m256, b.m256));
+# else
+ return make_vint8(a.a == b.a,
+ a.b == b.b,
+ a.c == b.c,
+ a.d == b.d,
+ a.e == b.e,
+ a.f == b.f,
+ a.g == b.g,
+ a.h == b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator==(const vint8 a, const int b)
+{
+ return a == make_vint8(b);
+}
+
+ccl_device_inline vint8 operator>=(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(
+ _mm256_xor_si256(_mm256_set1_epi32(0xffffffff), _mm256_cmpgt_epi32(b.m256, a.m256)));
+# else
+ return make_vint8(a.a >= b.a,
+ a.b >= b.b,
+ a.c >= b.c,
+ a.d >= b.d,
+ a.e >= b.e,
+ a.f >= b.f,
+ a.g >= b.g,
+ a.h >= b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator>=(const vint8 a, const int b)
+{
+ return a >= make_vint8(b);
+}
+
+ccl_device_inline vint8 operator&(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_and_si256(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a & b.a, a.b & b.b, a.c & b.c, a.d & b.d, a.e & b.e, a.f & b.f, a.g & b.g, a.h & b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator|(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_or_si256(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a | b.a, a.b | b.b, a.c | b.c, a.d | b.d, a.e | b.e, a.f | b.f, a.g | b.g, a.h | b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator^(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_xor_si256(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a ^ b.a, a.b ^ b.b, a.c ^ b.c, a.d ^ b.d, a.e ^ b.e, a.f ^ b.f, a.g ^ b.g, a.h ^ b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator&(const int32_t a, const vint8 b)
+{
+ return make_vint8(a) & b;
+}
+
+ccl_device_inline vint8 operator&(const vint8 a, const int32_t b)
+{
+ return a & make_vint8(b);
+}
+
+ccl_device_inline vint8 operator|(const int32_t a, const vint8 b)
+{
+ return make_vint8(a) | b;
+}
+
+ccl_device_inline vint8 operator|(const vint8 a, const int32_t b)
+{
+ return a | make_vint8(b);
+}
+
+ccl_device_inline vint8 operator^(const int32_t a, const vint8 b)
+{
+ return make_vint8(a) ^ b;
+}
+
+ccl_device_inline vint8 operator^(const vint8 a, const int32_t b)
+{
+ return a ^ make_vint8(b);
+}
+
+ccl_device_inline vint8 &operator&=(vint8 &a, const vint8 b)
+{
+ return a = a & b;
+}
+ccl_device_inline vint8 &operator&=(vint8 &a, const int32_t b)
+{
+ return a = a & b;
+}
+
+ccl_device_inline vint8 &operator|=(vint8 &a, const vint8 b)
+{
+ return a = a | b;
+}
+ccl_device_inline vint8 &operator|=(vint8 &a, const int32_t b)
+{
+ return a = a | b;
+}
+
+ccl_device_inline vint8 &operator^=(vint8 &a, const vint8 b)
+{
+ return a = a ^ b;
+}
+ccl_device_inline vint8 &operator^=(vint8 &a, const int32_t b)
+{
+ return a = a ^ b;
+}
+
+ccl_device_inline vint8 &operator<<=(vint8 &a, const int32_t b)
+{
+ return a = a << b;
+}
+ccl_device_inline vint8 &operator>>=(vint8 &a, const int32_t b)
+{
+ return a = a >> b;
+}
+
+# ifdef __KERNEL_AVX__
+ccl_device_forceinline const vint8 srl(const vint8 a, const int32_t b)
+{
+ return vint8(_mm256_srli_epi32(a.m256, b));
+}
+# endif
+
+ccl_device_inline vint8 min(vint8 a, vint8 b)
+{
+# if defined(__KERNEL_AVX__) && defined(__KERNEL_AVX41__)
+ return vint8(_mm256_min_epi32(a.m256, b.m256));
+# else
+ return make_vint8(min(a.a, b.a),
+ min(a.b, b.b),
+ min(a.c, b.c),
+ min(a.d, b.d),
+ min(a.e, b.e),
+ min(a.f, b.f),
+ min(a.g, b.g),
+ min(a.h, b.h));
+# endif
+}
+
+ccl_device_inline vint8 max(vint8 a, vint8 b)
+{
+# if defined(__KERNEL_AVX__) && defined(__KERNEL_AVX41__)
+ return vint8(_mm256_max_epi32(a.m256, b.m256));
+# else
+ return make_vint8(max(a.a, b.a),
+ max(a.b, b.b),
+ max(a.c, b.c),
+ max(a.d, b.d),
+ max(a.e, b.e),
+ max(a.f, b.f),
+ max(a.g, b.g),
+ max(a.h, b.h));
+# endif
+}
+
+ccl_device_inline vint8 clamp(const vint8 a, const vint8 mn, const vint8 mx)
+{
+ return min(max(a, mn), mx);
+}
+
+ccl_device_inline vint8 select(const vint8 mask, const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_castps_si256(_mm256_blendv_ps(
+ _mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask))));
+# else
+ return make_vint8((mask.a) ? a.a : b.a,
+ (mask.b) ? a.b : b.b,
+ (mask.c) ? a.c : b.c,
+ (mask.d) ? a.d : b.d,
+ (mask.e) ? a.e : b.e,
+ (mask.f) ? a.f : b.f,
+ (mask.g) ? a.g : b.g,
+ (mask.h) ? a.h : b.h);
+# endif
+}
+
+ccl_device_inline vint8 load_vint8(const int *v)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_loadu_si256((__m256i *)v));
+# else
+ return make_vint8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+# endif
+}
+#endif /* __KERNEL_GPU__ */
+
+ccl_device_inline vfloat8 cast(const vint8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_castsi256_ps(a));
+#else
+ return make_vfloat8(__int_as_float(a.a),
+ __int_as_float(a.b),
+ __int_as_float(a.c),
+ __int_as_float(a.d),
+ __int_as_float(a.e),
+ __int_as_float(a.f),
+ __int_as_float(a.g),
+ __int_as_float(a.h));
+#endif
+}
+
+#ifdef __KERNEL_AVX__
+template<size_t i> ccl_device_forceinline const vint8 shuffle(const vint8 a)
+{
+ return vint8(
+ _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i))));
+}
+
+template<size_t i0, size_t i1> ccl_device_forceinline const vint8 shuffle(const vint8 a)
+{
+ return vint8(_mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0)));
+}
+
+template<size_t i0, size_t i1>
+ccl_device_forceinline const vint8 shuffle(const vint8 a, const vint8 b)
+{
+ return vint8(_mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline const vint8 shuffle(const vint8 a)
+{
+ return vint8(
+ _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0))));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline const vint8 shuffle(const vint8 a, const vint8 b)
+{
+ return vint8(_mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))));
+}
+
+template<> __forceinline const vint8 shuffle<0, 0, 2, 2>(const vint8 b)
+{
+ return vint8(_mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b))));
+}
+template<> __forceinline const vint8 shuffle<1, 1, 3, 3>(const vint8 b)
+{
+ return vint8(_mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b))));
+}
+template<> __forceinline const vint8 shuffle<0, 1, 0, 1>(const vint8 b)
+{
+ return vint8(_mm256_castps_si256(
+ _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b))))));
+}
+#endif
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_INT8_H__ */
diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h
index aa28682f8c1..0727debf775 100644
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -133,7 +133,9 @@ ccl_device_forceinline float ray_triangle_rcp(const float x)
ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
- return madd(ssef(a.x), ssef(b.x), madd(ssef(a.y), ssef(b.y), ssef(a.z) * ssef(b.z)))[0];
+ return madd(make_float4(a.x),
+ make_float4(b.x),
+ madd(make_float4(a.y), make_float4(b.y), make_float4(a.z) * make_float4(b.z)))[0];
#else
return a.x * b.x + a.y * b.y + a.z * b.z;
#endif
@@ -142,9 +144,10 @@ ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
- return make_float3(msub(ssef(a.y), ssef(b.z), ssef(a.z) * ssef(b.y))[0],
- msub(ssef(a.z), ssef(b.x), ssef(a.x) * ssef(b.z))[0],
- msub(ssef(a.x), ssef(b.y), ssef(a.y) * ssef(b.x))[0]);
+ return make_float3(
+ msub(make_float4(a.y), make_float4(b.z), make_float4(a.z) * make_float4(b.y))[0],
+ msub(make_float4(a.z), make_float4(b.x), make_float4(a.x) * make_float4(b.z))[0],
+ msub(make_float4(a.x), make_float4(b.y), make_float4(a.y) * make_float4(b.x))[0]);
#else
return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
#endif
diff --git a/intern/cycles/util/path.cpp b/intern/cycles/util/path.cpp
index 17cff2f2977..cb6b8d7a740 100644
--- a/intern/cycles/util/path.cpp
+++ b/intern/cycles/util/path.cpp
@@ -2,8 +2,11 @@
* Copyright 2011-2022 Blender Foundation */
#include "util/path.h"
+#include "util/algorithm.h"
+#include "util/map.h"
#include "util/md5.h"
#include "util/string.h"
+#include "util/vector.h"
#include <OpenImageIO/filesystem.h>
#include <OpenImageIO/strutil.h>
@@ -898,19 +901,54 @@ FILE *path_fopen(const string &path, const string &mode)
#endif
}
-void path_cache_clear_except(const string &name, const set<string> &except)
+/* LRU Cache for Kernels */
+
+static void path_cache_kernel_mark_used(const string &path)
{
- string dir = path_user_get("cache");
+ std::time_t current_time = std::time(nullptr);
+ OIIO::Filesystem::last_write_time(path, current_time);
+}
- if (path_exists(dir)) {
- directory_iterator it(dir), it_end;
+bool path_cache_kernel_exists_and_mark_used(const string &path)
+{
+ if (path_exists(path)) {
+ path_cache_kernel_mark_used(path);
+ return true;
+ }
+ else {
+ return false;
+ }
+}
- for (; it != it_end; ++it) {
- string filename = path_filename(it->path());
+void path_cache_kernel_mark_added_and_clear_old(const string &new_path,
+ const size_t max_old_kernel_of_same_type)
+{
+ path_cache_kernel_mark_used(new_path);
+
+ string dir = path_dirname(new_path);
+ if (!path_exists(dir)) {
+ return;
+ }
+
+ /* Remove older kernels within the same directory. */
+ directory_iterator it(dir), it_end;
+ vector<pair<std::time_t, string>> same_kernel_types;
+
+ for (; it != it_end; ++it) {
+ const string &path = it->path();
+ if (path == new_path) {
+ continue;
+ }
+
+ std::time_t last_time = OIIO::Filesystem::last_write_time(path);
+ same_kernel_types.emplace_back(last_time, path);
+ }
+
+ if (same_kernel_types.size() > max_old_kernel_of_same_type) {
+ sort(same_kernel_types.begin(), same_kernel_types.end());
- if (string_startswith(filename, name.c_str()))
- if (except.find(filename) == except.end())
- path_remove(it->path());
+ for (int i = 0; i < same_kernel_types.size() - max_old_kernel_of_same_type; i++) {
+ path_remove(same_kernel_types[i].second);
}
}
}
diff --git a/intern/cycles/util/path.h b/intern/cycles/util/path.h
index 48b1fb65919..6d02267e182 100644
--- a/intern/cycles/util/path.h
+++ b/intern/cycles/util/path.h
@@ -55,8 +55,15 @@ bool path_remove(const string &path);
/* source code utility */
string path_source_replace_includes(const string &source, const string &path);
-/* cache utility */
-void path_cache_clear_except(const string &name, const set<string> &except);
+/* Simple least-recently-used cache for kernels.
+ *
+ * Kernels of same type are cached in the same directory.
+ * Whenever a kernel is used, its last modified time is updated.
+ * When a new kernel is added to the cache, clear old entries of the same type (i.e. in the same
+ * directory). */
+bool path_cache_kernel_exists_and_mark_used(const string &path);
+void path_cache_kernel_mark_added_and_clear_old(const string &path,
+ const size_t max_old_kernel_of_same_type = 5);
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/sseb.h b/intern/cycles/util/sseb.h
deleted file mode 100644
index 6f78299711e..00000000000
--- a/intern/cycles/util/sseb.h
+++ /dev/null
@@ -1,345 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2013 Intel Corporation
- * Modifications Copyright 2014-2022 Blender Foundation. */
-
-#ifndef __UTIL_SSEB_H__
-#define __UTIL_SSEB_H__
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __KERNEL_SSE2__
-
-struct ssei;
-struct ssef;
-
-/*! 4-wide SSE bool type. */
-struct sseb {
- typedef sseb Mask; // mask type
- typedef ssei Int; // int type
- typedef ssef Float; // float type
-
- enum { size = 4 }; // number of SIMD elements
- union {
- __m128 m128;
- int32_t v[4];
- }; // data
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline sseb()
- {
- }
- __forceinline sseb(const sseb &other)
- {
- m128 = other.m128;
- }
- __forceinline sseb &operator=(const sseb &other)
- {
- m128 = other.m128;
- return *this;
- }
-
- __forceinline sseb(const __m128 input) : m128(input)
- {
- }
- __forceinline operator const __m128 &(void) const
- {
- return m128;
- }
- __forceinline operator const __m128i(void) const
- {
- return _mm_castps_si128(m128);
- }
- __forceinline operator const __m128d(void) const
- {
- return _mm_castps_pd(m128);
- }
-
- __forceinline sseb(bool a)
- : m128(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)])
- {
- }
- __forceinline sseb(bool a, bool b)
- : m128(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)])
- {
- }
- __forceinline sseb(bool a, bool b, bool c, bool d)
- : m128(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)])
- {
- }
- __forceinline sseb(int mask)
- {
- assert(mask >= 0 && mask < 16);
- m128 = _mm_lookupmask_ps[mask];
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constants
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline sseb(FalseTy) : m128(_mm_setzero_ps())
- {
- }
- __forceinline sseb(TrueTy)
- : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline bool operator[](const size_t i) const
- {
- assert(i < 4);
- return (_mm_movemask_ps(m128) >> i) & 1;
- }
- __forceinline int32_t &operator[](const size_t i)
- {
- assert(i < 4);
- return v[i];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator!(const sseb &a)
-{
- return _mm_xor_ps(a, sseb(True));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator&(const sseb &a, const sseb &b)
-{
- return _mm_and_ps(a, b);
-}
-__forceinline const sseb operator|(const sseb &a, const sseb &b)
-{
- return _mm_or_ps(a, b);
-}
-__forceinline const sseb operator^(const sseb &a, const sseb &b)
-{
- return _mm_xor_ps(a, b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator&=(sseb &a, const sseb &b)
-{
- return a = a & b;
-}
-__forceinline const sseb operator|=(sseb &a, const sseb &b)
-{
- return a = a | b;
-}
-__forceinline const sseb operator^=(sseb &a, const sseb &b)
-{
- return a = a ^ b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator!=(const sseb &a, const sseb &b)
-{
- return _mm_xor_ps(a, b);
-}
-__forceinline const sseb operator==(const sseb &a, const sseb &b)
-{
- return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b));
-}
-
-__forceinline const sseb select(const sseb &m, const sseb &t, const sseb &f)
-{
-# if defined(__KERNEL_SSE41__)
- return _mm_blendv_ps(f, t, m);
-# else
- return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb unpacklo(const sseb &a, const sseb &b)
-{
- return _mm_unpacklo_ps(a, b);
-}
-__forceinline const sseb unpackhi(const sseb &a, const sseb &b)
-{
- return _mm_unpackhi_ps(a, b);
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const sseb shuffle(const sseb &a)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a);
-# else
- return _mm_castsi128_ps(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
-# endif
-}
-
-# ifndef __KERNEL_NEON__
-template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a)
-{
- return _mm_movelh_ps(a, a);
-}
-
-template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a)
-{
- return _mm_movehl_ps(a, a);
-}
-# endif
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const sseb shuffle(const sseb &a, const sseb &b)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a, b);
-# else
- return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
-# endif
-}
-
-# ifndef __KERNEL_NEON__
-template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a, const sseb &b)
-{
- return _mm_movelh_ps(a, b);
-}
-
-template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a, const sseb &b)
-{
- return _mm_movehl_ps(b, a);
-}
-# endif
-
-# if defined(__KERNEL_SSE3__) && !defined(__KERNEL_NEON__)
-template<> __forceinline const sseb shuffle<0, 0, 2, 2>(const sseb &a)
-{
- return _mm_moveldup_ps(a);
-}
-template<> __forceinline const sseb shuffle<1, 1, 3, 3>(const sseb &a)
-{
- return _mm_movehdup_ps(a);
-}
-# endif
-
-# if defined(__KERNEL_SSE41__)
-template<size_t dst, size_t src, size_t clr>
-__forceinline const sseb insert(const sseb &a, const sseb &b)
-{
-# ifdef __KERNEL_NEON__
- sseb res = a;
- if (clr)
- res[dst] = 0;
- else
- res[dst] = b[src];
- return res;
-# else
- return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
-# endif
-}
-template<size_t dst, size_t src> __forceinline const sseb insert(const sseb &a, const sseb &b)
-{
- return insert<dst, src, 0>(a, b);
-}
-template<size_t dst> __forceinline const sseb insert(const sseb &a, const bool b)
-{
- return insert<dst, 0>(a, sseb(b));
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reduction Operations
-////////////////////////////////////////////////////////////////////////////////
-
-# if defined(__KERNEL_SSE41__)
-__forceinline uint32_t popcnt(const sseb &a)
-{
-# if defined(__KERNEL_NEON__)
- const int32x4_t mask = {1, 1, 1, 1};
- int32x4_t t = vandq_s32(vreinterpretq_s32_m128(a.m128), mask);
- return vaddvq_s32(t);
-# else
- return _mm_popcnt_u32(_mm_movemask_ps(a));
-# endif
-}
-# else
-__forceinline uint32_t popcnt(const sseb &a)
-{
- return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]);
-}
-# endif
-
-__forceinline bool reduce_and(const sseb &a)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(a.m128)) == -4;
-# else
- return _mm_movemask_ps(a) == 0xf;
-# endif
-}
-__forceinline bool reduce_or(const sseb &a)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(a.m128)) != 0x0;
-# else
- return _mm_movemask_ps(a) != 0x0;
-# endif
-}
-__forceinline bool all(const sseb &b)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) == -4;
-# else
- return _mm_movemask_ps(b) == 0xf;
-# endif
-}
-__forceinline bool any(const sseb &b)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) != 0x0;
-# else
- return _mm_movemask_ps(b) != 0x0;
-# endif
-}
-__forceinline bool none(const sseb &b)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) == 0x0;
-# else
- return _mm_movemask_ps(b) == 0x0;
-# endif
-}
-
-__forceinline uint32_t movemask(const sseb &a)
-{
- return _mm_movemask_ps(a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Debug Functions
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_sseb(const char *label, const sseb &a)
-{
- printf("%s: %d %d %d %d\n", label, a[0], a[1], a[2], a[3]);
-}
-
-#endif
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/util/ssef.h b/intern/cycles/util/ssef.h
deleted file mode 100644
index 1e2bfa90354..00000000000
--- a/intern/cycles/util/ssef.h
+++ /dev/null
@@ -1,1090 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2013 Intel Corporation
- * Modifications Copyright 2014-2022 Blender Foundation. */
-
-#ifndef __UTIL_SSEF_H__
-#define __UTIL_SSEF_H__
-
-#include <math.h>
-
-#include "util/ssei.h"
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __KERNEL_SSE2__
-
-struct sseb;
-struct ssef;
-
-/*! 4-wide SSE float type. */
-struct ssef {
- typedef sseb Mask; // mask type
- typedef ssei Int; // int type
- typedef ssef Float; // float type
-
- enum { size = 4 }; // number of SIMD elements
- union {
- __m128 m128;
- float f[4];
- int i[4];
- }; // data
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline ssef()
- {
- }
- __forceinline ssef(const ssef &other)
- {
- m128 = other.m128;
- }
- __forceinline ssef &operator=(const ssef &other)
- {
- m128 = other.m128;
- return *this;
- }
-
- __forceinline ssef(const __m128 a) : m128(a)
- {
- }
- __forceinline operator const __m128 &() const
- {
- return m128;
- }
- __forceinline operator __m128 &()
- {
- return m128;
- }
-
- __forceinline ssef(float a) : m128(_mm_set1_ps(a))
- {
- }
- __forceinline ssef(float a, float b, float c, float d) : m128(_mm_setr_ps(a, b, c, d))
- {
- }
-
- __forceinline explicit ssef(const __m128i a) : m128(_mm_cvtepi32_ps(a))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Loads and Stores
- ////////////////////////////////////////////////////////////////////////////////
-
-# if defined(__KERNEL_AVX__)
- static __forceinline ssef broadcast(const void *const a)
- {
- return _mm_broadcast_ss((float *)a);
- }
-# else
- static __forceinline ssef broadcast(const void *const a)
- {
- return _mm_set1_ps(*(float *)a);
- }
-# endif
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline const float &operator[](const size_t i) const
- {
- assert(i < 4);
- return f[i];
- }
- __forceinline float &operator[](const size_t i)
- {
- assert(i < 4);
- return f[i];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssef cast(const __m128i &a)
-{
- return _mm_castsi128_ps(a);
-}
-__forceinline const ssef operator+(const ssef &a)
-{
- return a;
-}
-__forceinline const ssef operator-(const ssef &a)
-{
- return _mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
-}
-__forceinline const ssef abs(const ssef &a)
-{
- return _mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
-}
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssef sign(const ssef &a)
-{
- return _mm_blendv_ps(ssef(1.0f), -ssef(1.0f), _mm_cmplt_ps(a, ssef(0.0f)));
-}
-# endif
-__forceinline const ssef signmsk(const ssef &a)
-{
- return _mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
-}
-
-__forceinline const ssef rcp(const ssef &a)
-{
- const ssef r = _mm_rcp_ps(a.m128);
- return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
-}
-__forceinline const ssef sqr(const ssef &a)
-{
- return _mm_mul_ps(a, a);
-}
-__forceinline const ssef mm_sqrt(const ssef &a)
-{
- return _mm_sqrt_ps(a.m128);
-}
-__forceinline const ssef rsqrt(const ssef &a)
-{
- const ssef r = _mm_rsqrt_ps(a.m128);
- return _mm_add_ps(
- _mm_mul_ps(_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f), r),
- _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set_ps(-0.5f, -0.5f, -0.5f, -0.5f)), r),
- _mm_mul_ps(r, r)));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssef operator+(const ssef &a, const ssef &b)
-{
- return _mm_add_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator+(const ssef &a, const float &b)
-{
- return a + ssef(b);
-}
-__forceinline const ssef operator+(const float &a, const ssef &b)
-{
- return ssef(a) + b;
-}
-
-__forceinline const ssef operator-(const ssef &a, const ssef &b)
-{
- return _mm_sub_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator-(const ssef &a, const float &b)
-{
- return a - ssef(b);
-}
-__forceinline const ssef operator-(const float &a, const ssef &b)
-{
- return ssef(a) - b;
-}
-
-__forceinline const ssef operator*(const ssef &a, const ssef &b)
-{
- return _mm_mul_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator*(const ssef &a, const float &b)
-{
- return a * ssef(b);
-}
-__forceinline const ssef operator*(const float &a, const ssef &b)
-{
- return ssef(a) * b;
-}
-
-__forceinline const ssef operator/(const ssef &a, const ssef &b)
-{
- return _mm_div_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator/(const ssef &a, const float &b)
-{
- return a / ssef(b);
-}
-__forceinline const ssef operator/(const float &a, const ssef &b)
-{
- return ssef(a) / b;
-}
-
-__forceinline const ssef operator^(const ssef &a, const ssef &b)
-{
- return _mm_xor_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator^(const ssef &a, const ssei &b)
-{
- return _mm_xor_ps(a.m128, _mm_castsi128_ps(b.m128));
-}
-
-__forceinline const ssef operator&(const ssef &a, const ssef &b)
-{
- return _mm_and_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator&(const ssef &a, const ssei &b)
-{
- return _mm_and_ps(a.m128, _mm_castsi128_ps(b.m128));
-}
-
-__forceinline const ssef operator|(const ssef &a, const ssef &b)
-{
- return _mm_or_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator|(const ssef &a, const ssei &b)
-{
- return _mm_or_ps(a.m128, _mm_castsi128_ps(b.m128));
-}
-
-__forceinline const ssef andnot(const ssef &a, const ssef &b)
-{
- return _mm_andnot_ps(a.m128, b.m128);
-}
-
-__forceinline const ssef min(const ssef &a, const ssef &b)
-{
- return _mm_min_ps(a.m128, b.m128);
-}
-__forceinline const ssef min(const ssef &a, const float &b)
-{
- return _mm_min_ps(a.m128, ssef(b));
-}
-__forceinline const ssef min(const float &a, const ssef &b)
-{
- return _mm_min_ps(ssef(a), b.m128);
-}
-
-__forceinline const ssef max(const ssef &a, const ssef &b)
-{
- return _mm_max_ps(a.m128, b.m128);
-}
-__forceinline const ssef max(const ssef &a, const float &b)
-{
- return _mm_max_ps(a.m128, ssef(b));
-}
-__forceinline const ssef max(const float &a, const ssef &b)
-{
- return _mm_max_ps(ssef(a), b.m128);
-}
-
-# if defined(__KERNEL_SSE41__)
-__forceinline ssef mini(const ssef &a, const ssef &b)
-{
- const ssei ai = _mm_castps_si128(a);
- const ssei bi = _mm_castps_si128(b);
- const ssei ci = _mm_min_epi32(ai, bi);
- return _mm_castsi128_ps(ci);
-}
-# endif
-
-# if defined(__KERNEL_SSE41__)
-__forceinline ssef maxi(const ssef &a, const ssef &b)
-{
- const ssei ai = _mm_castps_si128(a);
- const ssei bi = _mm_castps_si128(b);
- const ssei ci = _mm_max_epi32(ai, bi);
- return _mm_castsi128_ps(ci);
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Ternary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
-{
-# if defined(__KERNEL_NEON__)
- return vfmaq_f32(c, a, b);
-# elif defined(__KERNEL_AVX2__)
- return _mm_fmadd_ps(a, b, c);
-# else
- return a * b + c;
-# endif
-}
-__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
-{
-# if defined(__KERNEL_NEON__)
- return vfmaq_f32(vnegq_f32(c), a, b);
-# elif defined(__KERNEL_AVX2__)
- return _mm_fmsub_ps(a, b, c);
-# else
- return a * b - c;
-# endif
-}
-__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
-{
-# if defined(__KERNEL_NEON__)
- return vfmsq_f32(c, a, b);
-# elif defined(__KERNEL_AVX2__)
- return _mm_fnmadd_ps(a, b, c);
-# else
- return c - a * b;
-# endif
-}
-__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
-{
-# if defined(__KERNEL_NEON__)
- return vfmsq_f32(vnegq_f32(c), a, b);
-# elif defined(__KERNEL_AVX2__)
- return _mm_fnmsub_ps(a, b, c);
-# else
- return -a * b - c;
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssef &operator+=(ssef &a, const ssef &b)
-{
- return a = a + b;
-}
-__forceinline ssef &operator+=(ssef &a, const float &b)
-{
- return a = a + b;
-}
-
-__forceinline ssef &operator-=(ssef &a, const ssef &b)
-{
- return a = a - b;
-}
-__forceinline ssef &operator-=(ssef &a, const float &b)
-{
- return a = a - b;
-}
-
-__forceinline ssef &operator*=(ssef &a, const ssef &b)
-{
- return a = a * b;
-}
-__forceinline ssef &operator*=(ssef &a, const float &b)
-{
- return a = a * b;
-}
-
-__forceinline ssef &operator/=(ssef &a, const ssef &b)
-{
- return a = a / b;
-}
-__forceinline ssef &operator/=(ssef &a, const float &b)
-{
- return a = a / b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator==(const ssef &a, const ssef &b)
-{
- return _mm_cmpeq_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator==(const ssef &a, const float &b)
-{
- return a == ssef(b);
-}
-__forceinline const sseb operator==(const float &a, const ssef &b)
-{
- return ssef(a) == b;
-}
-
-__forceinline const sseb operator!=(const ssef &a, const ssef &b)
-{
- return _mm_cmpneq_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator!=(const ssef &a, const float &b)
-{
- return a != ssef(b);
-}
-__forceinline const sseb operator!=(const float &a, const ssef &b)
-{
- return ssef(a) != b;
-}
-
-__forceinline const sseb operator<(const ssef &a, const ssef &b)
-{
- return _mm_cmplt_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator<(const ssef &a, const float &b)
-{
- return a < ssef(b);
-}
-__forceinline const sseb operator<(const float &a, const ssef &b)
-{
- return ssef(a) < b;
-}
-
-__forceinline const sseb operator>=(const ssef &a, const ssef &b)
-{
- return _mm_cmpnlt_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator>=(const ssef &a, const float &b)
-{
- return a >= ssef(b);
-}
-__forceinline const sseb operator>=(const float &a, const ssef &b)
-{
- return ssef(a) >= b;
-}
-
-__forceinline const sseb operator>(const ssef &a, const ssef &b)
-{
- return _mm_cmpnle_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator>(const ssef &a, const float &b)
-{
- return a > ssef(b);
-}
-__forceinline const sseb operator>(const float &a, const ssef &b)
-{
- return ssef(a) > b;
-}
-
-__forceinline const sseb operator<=(const ssef &a, const ssef &b)
-{
- return _mm_cmple_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator<=(const ssef &a, const float &b)
-{
- return a <= ssef(b);
-}
-__forceinline const sseb operator<=(const float &a, const ssef &b)
-{
- return ssef(a) <= b;
-}
-
-__forceinline const ssef select(const sseb &m, const ssef &t, const ssef &f)
-{
-# ifdef __KERNEL_SSE41__
- return _mm_blendv_ps(f, t, m);
-# else
- return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
-# endif
-}
-
-__forceinline const ssef select(const ssef &m, const ssef &t, const ssef &f)
-{
-# ifdef __KERNEL_SSE41__
- return _mm_blendv_ps(f, t, m);
-# else
- return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
-# endif
-}
-
-__forceinline const ssef select(const int mask, const ssef &t, const ssef &f)
-{
-# if defined(__KERNEL_SSE41__) && \
- ((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER))
- return _mm_blend_ps(f, t, mask);
-# else
- return select(sseb(mask), t, f);
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Rounding Functions
-////////////////////////////////////////////////////////////////////////////////
-
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssef round_even(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndnq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
-# endif
-}
-__forceinline const ssef round_down(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndmq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
-# endif
-}
-__forceinline const ssef round_up(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndpq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
-# endif
-}
-__forceinline const ssef round_zero(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_ZERO);
-# endif
-}
-__forceinline const ssef floor(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndmq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
-# endif
-}
-__forceinline const ssef ceil(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndpq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
-# endif
-}
-# else
-/* Non-SSE4.1 fallback, needed for floorfrac. */
-__forceinline const ssef floor(const ssef &a)
-{
- return _mm_set_ps(floorf(a.f[3]), floorf(a.f[2]), floorf(a.f[1]), floorf(a.f[0]));
-}
-# endif
-
-__forceinline ssei truncatei(const ssef &a)
-{
- return _mm_cvttps_epi32(a.m128);
-}
-
-__forceinline ssef floorfrac(const ssef &x, ssei *i)
-{
- ssef f = floor(x);
- *i = truncatei(f);
- return x - f;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Common Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssef mix(const ssef &a, const ssef &b, const ssef &t)
-{
- return madd(t, b, (ssef(1.0f) - t) * a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssef unpacklo(const ssef &a, const ssef &b)
-{
- return _mm_unpacklo_ps(a.m128, b.m128);
-}
-__forceinline ssef unpackhi(const ssef &a, const ssef &b)
-{
- return _mm_unpackhi_ps(a.m128, b.m128);
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const ssef shuffle(const ssef &b)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128);
-# else
- return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
-# endif
-}
-
-template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a)
-{
- return _mm_movelh_ps(a, a);
-}
-
-template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a)
-{
- return _mm_movehl_ps(a, a);
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const ssef shuffle(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b);
-# else
- return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
-# endif
-}
-
-template<size_t i0> __forceinline const ssef shuffle(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b);
-# else
- return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0));
-# endif
-}
-
-# ifndef __KERNEL_NEON__
-template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a, const ssef &b)
-{
- return _mm_movelh_ps(a, b);
-}
-
-template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a, const ssef &b)
-{
- return _mm_movehl_ps(b, a);
-}
-# endif
-
-# if defined(__KERNEL_SSSE3__)
-__forceinline const ssef shuffle8(const ssef &a, const ssei &shuf)
-{
- return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf));
-}
-# endif
-
-# if defined(__KERNEL_SSE3__)
-template<> __forceinline const ssef shuffle<0, 0, 2, 2>(const ssef &b)
-{
- return _mm_moveldup_ps(b);
-}
-template<> __forceinline const ssef shuffle<1, 1, 3, 3>(const ssef &b)
-{
- return _mm_movehdup_ps(b);
-}
-# endif
-
-template<size_t i0> __forceinline const ssef shuffle(const ssef &b)
-{
- return shuffle<i0, i0, i0, i0>(b);
-}
-
-# if defined(__KERNEL_AVX__)
-__forceinline const ssef shuffle(const ssef &a, const ssei &shuf)
-{
- return _mm_permutevar_ps(a, shuf);
-}
-# endif
-
-template<size_t i> __forceinline float extract(const ssef &a)
-{
- return _mm_cvtss_f32(shuffle<i, i, i, i>(a));
-}
-template<> __forceinline float extract<0>(const ssef &a)
-{
- return _mm_cvtss_f32(a);
-}
-
-# if defined(__KERNEL_SSE41__)
-template<size_t dst, size_t src, size_t clr>
-__forceinline const ssef insert(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_NEON__
- ssef res = a;
- if (clr)
- res[dst] = 0;
- else
- res[dst] = b[src];
- return res;
-# else
- return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
-# endif
-}
-template<size_t dst, size_t src> __forceinline const ssef insert(const ssef &a, const ssef &b)
-{
- return insert<dst, src, 0>(a, b);
-}
-template<size_t dst> __forceinline const ssef insert(const ssef &a, const float b)
-{
- return insert<dst, 0>(a, _mm_set_ss(b));
-}
-# else
-template<size_t dst> __forceinline const ssef insert(const ssef &a, const float b)
-{
- ssef c = a;
- c[dst] = b;
- return c;
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Transpose
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline void transpose(const ssef &r0,
- const ssef &r1,
- const ssef &r2,
- const ssef &r3,
- ssef &c0,
- ssef &c1,
- ssef &c2,
- ssef &c3)
-{
- ssef l02 = unpacklo(r0, r2);
- ssef h02 = unpackhi(r0, r2);
- ssef l13 = unpacklo(r1, r3);
- ssef h13 = unpackhi(r1, r3);
- c0 = unpacklo(l02, l13);
- c1 = unpackhi(l02, l13);
- c2 = unpacklo(h02, h13);
- c3 = unpackhi(h02, h13);
-}
-
-__forceinline void transpose(
- const ssef &r0, const ssef &r1, const ssef &r2, const ssef &r3, ssef &c0, ssef &c1, ssef &c2)
-{
- ssef l02 = unpacklo(r0, r2);
- ssef h02 = unpackhi(r0, r2);
- ssef l13 = unpacklo(r1, r3);
- ssef h13 = unpackhi(r1, r3);
- c0 = unpacklo(l02, l13);
- c1 = unpackhi(l02, l13);
- c2 = unpacklo(h02, h13);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reductions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssef vreduce_min(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vdupq_n_f32(vminvq_f32(v));
-# else
- ssef h = min(shuffle<1, 0, 3, 2>(v), v);
- return min(shuffle<2, 3, 0, 1>(h), h);
-# endif
-}
-__forceinline const ssef vreduce_max(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vdupq_n_f32(vmaxvq_f32(v));
-# else
- ssef h = max(shuffle<1, 0, 3, 2>(v), v);
- return max(shuffle<2, 3, 0, 1>(h), h);
-# endif
-}
-__forceinline const ssef vreduce_add(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vdupq_n_f32(vaddvq_f32(v));
-# else
- ssef h = shuffle<1, 0, 3, 2>(v) + v;
- return shuffle<2, 3, 0, 1>(h) + h;
-# endif
-}
-
-__forceinline float reduce_min(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vminvq_f32(v);
-# else
- return _mm_cvtss_f32(vreduce_min(v));
-# endif
-}
-__forceinline float reduce_max(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vmaxvq_f32(v);
-# else
- return _mm_cvtss_f32(vreduce_max(v));
-# endif
-}
-__forceinline float reduce_add(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vaddvq_f32(v);
-# else
- return _mm_cvtss_f32(vreduce_add(v));
-# endif
-}
-
-__forceinline uint32_t select_min(const ssef &v)
-{
- return __bsf(movemask(v == vreduce_min(v)));
-}
-__forceinline uint32_t select_max(const ssef &v)
-{
- return __bsf(movemask(v == vreduce_max(v)));
-}
-
-__forceinline uint32_t select_min(const sseb &valid, const ssef &v)
-{
- const ssef a = select(valid, v, ssef(pos_inf));
- return __bsf(movemask(valid & (a == vreduce_min(a))));
-}
-__forceinline uint32_t select_max(const sseb &valid, const ssef &v)
-{
- const ssef a = select(valid, v, ssef(neg_inf));
- return __bsf(movemask(valid & (a == vreduce_max(a))));
-}
-
-__forceinline uint32_t movemask(const ssef &a)
-{
- return _mm_movemask_ps(a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Memory load and store operations
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssef load4f(const float4 &a)
-{
-# ifdef __KERNEL_WITH_SSE_ALIGN__
- return _mm_load_ps(&a.x);
-# else
- return _mm_loadu_ps(&a.x);
-# endif
-}
-
-__forceinline ssef load4f(const float3 &a)
-{
-# ifdef __KERNEL_WITH_SSE_ALIGN__
- return _mm_load_ps(&a.x);
-# else
- return _mm_loadu_ps(&a.x);
-# endif
-}
-
-__forceinline ssef load4f(const void *const a)
-{
- return _mm_load_ps((float *)a);
-}
-
-__forceinline ssef load1f_first(const float a)
-{
- return _mm_set_ss(a);
-}
-
-__forceinline void store4f(void *ptr, const ssef &v)
-{
- _mm_store_ps((float *)ptr, v);
-}
-
-__forceinline ssef loadu4f(const void *const a)
-{
- return _mm_loadu_ps((float *)a);
-}
-
-__forceinline void storeu4f(void *ptr, const ssef &v)
-{
- _mm_storeu_ps((float *)ptr, v);
-}
-
-__forceinline void store4f(const sseb &mask, void *ptr, const ssef &f)
-{
-# if defined(__KERNEL_AVX__)
- _mm_maskstore_ps((float *)ptr, (__m128i)mask, f);
-# else
- *(ssef *)ptr = select(mask, f, *(ssef *)ptr);
-# endif
-}
-
-__forceinline ssef load4f_nt(void *ptr)
-{
-# if defined(__KERNEL_SSE41__)
- return _mm_castsi128_ps(_mm_stream_load_si128((__m128i *)ptr));
-# else
- return _mm_load_ps((float *)ptr);
-# endif
-}
-
-__forceinline void store4f_nt(void *ptr, const ssef &v)
-{
-# if defined(__KERNEL_SSE41__)
- _mm_stream_ps((float *)ptr, v);
-# else
- _mm_store_ps((float *)ptr, v);
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Euclidean Space Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline float dot(const ssef &a, const ssef &b)
-{
- return reduce_add(a * b);
-}
-
-/* calculate shuffled cross product, useful when order of components does not matter */
-__forceinline ssef cross_zxy(const ssef &a, const ssef &b)
-{
- const ssef a0 = a;
- const ssef b0 = shuffle<1, 2, 0, 3>(b);
- const ssef a1 = shuffle<1, 2, 0, 3>(a);
- const ssef b1 = b;
- return msub(a0, b0, a1 * b1);
-}
-
-__forceinline ssef cross(const ssef &a, const ssef &b)
-{
- return shuffle<1, 2, 0, 3>(cross_zxy(a, b));
-}
-
-ccl_device_inline const ssef dot3_splat(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_SSE41__
- return _mm_dp_ps(a.m128, b.m128, 0x7f);
-# else
- ssef t = a * b;
- return ssef(((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2]);
-# endif
-}
-
-/* squared length taking only specified axes into account */
-template<size_t X, size_t Y, size_t Z, size_t W> ccl_device_inline float len_squared(const ssef &a)
-{
-# ifndef __KERNEL_SSE41__
- float4 &t = (float4 &)a;
- return (X ? t.x * t.x : 0.0f) + (Y ? t.y * t.y : 0.0f) + (Z ? t.z * t.z : 0.0f) +
- (W ? t.w * t.w : 0.0f);
-# else
- return extract<0>(
- ssef(_mm_dp_ps(a.m128, a.m128, (X << 4) | (Y << 5) | (Z << 6) | (W << 7) | 0xf)));
-# endif
-}
-
-ccl_device_inline float dot3(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_SSE41__
- return extract<0>(ssef(_mm_dp_ps(a.m128, b.m128, 0x7f)));
-# else
- ssef t = a * b;
- return ((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2];
-# endif
-}
-
-ccl_device_inline const ssef len3_squared_splat(const ssef &a)
-{
- return dot3_splat(a, a);
-}
-
-ccl_device_inline float len3_squared(const ssef &a)
-{
- return dot3(a, a);
-}
-
-ccl_device_inline float len3(const ssef &a)
-{
- return extract<0>(mm_sqrt(dot3_splat(a, a)));
-}
-
-/* SSE shuffle utility functions */
-
-# ifdef __KERNEL_SSSE3__
-
-/* faster version for SSSE3 */
-typedef ssei shuffle_swap_t;
-
-ccl_device_inline shuffle_swap_t shuffle_swap_identity()
-{
- return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-}
-
-ccl_device_inline shuffle_swap_t shuffle_swap_swap()
-{
- return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
-}
-
-ccl_device_inline const ssef shuffle_swap(const ssef &a, const shuffle_swap_t &shuf)
-{
- return cast(_mm_shuffle_epi8(cast(a), shuf));
-}
-
-# else
-
-/* somewhat slower version for SSE2 */
-typedef int shuffle_swap_t;
-
-ccl_device_inline shuffle_swap_t shuffle_swap_identity()
-{
- return 0;
-}
-
-ccl_device_inline shuffle_swap_t shuffle_swap_swap()
-{
- return 1;
-}
-
-ccl_device_inline const ssef shuffle_swap(const ssef &a, shuffle_swap_t shuf)
-{
- /* shuffle value must be a constant, so we need to branch */
- if (shuf)
- return shuffle<1, 0, 3, 2>(a);
- else
- return shuffle<3, 2, 1, 0>(a);
-}
-
-# endif
-
-# if defined(__KERNEL_SSE41__) && !defined(__KERNEL_NEON__)
-
-ccl_device_inline void gen_idirsplat_swap(const ssef &pn,
- const shuffle_swap_t &shuf_identity,
- const shuffle_swap_t &shuf_swap,
- const float3 &idir,
- ssef idirsplat[3],
- shuffle_swap_t shufflexyz[3])
-{
- const __m128 idirsplat_raw[] = {_mm_set_ps1(idir.x), _mm_set_ps1(idir.y), _mm_set_ps1(idir.z)};
- idirsplat[0] = _mm_xor_ps(idirsplat_raw[0], pn);
- idirsplat[1] = _mm_xor_ps(idirsplat_raw[1], pn);
- idirsplat[2] = _mm_xor_ps(idirsplat_raw[2], pn);
-
- const ssef signmask = cast(ssei(0x80000000));
- const ssef shuf_identity_f = cast(shuf_identity);
- const ssef shuf_swap_f = cast(shuf_swap);
-
- shufflexyz[0] = _mm_castps_si128(
- _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[0], signmask)));
- shufflexyz[1] = _mm_castps_si128(
- _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[1], signmask)));
- shufflexyz[2] = _mm_castps_si128(
- _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[2], signmask)));
-}
-
-# else
-
-ccl_device_inline void gen_idirsplat_swap(const ssef &pn,
- const shuffle_swap_t &shuf_identity,
- const shuffle_swap_t &shuf_swap,
- const float3 &idir,
- ssef idirsplat[3],
- shuffle_swap_t shufflexyz[3])
-{
- idirsplat[0] = ssef(idir.x) ^ pn;
- idirsplat[1] = ssef(idir.y) ^ pn;
- idirsplat[2] = ssef(idir.z) ^ pn;
-
- shufflexyz[0] = (idir.x >= 0) ? shuf_identity : shuf_swap;
- shufflexyz[1] = (idir.y >= 0) ? shuf_identity : shuf_swap;
- shufflexyz[2] = (idir.z >= 0) ? shuf_identity : shuf_swap;
-}
-
-# endif
-
-ccl_device_inline const ssef uint32_to_float(const ssei &in)
-{
- ssei a = _mm_srli_epi32(in, 16);
- ssei b = _mm_and_si128(in, _mm_set1_epi32(0x0000ffff));
- ssei c = _mm_or_si128(a, _mm_set1_epi32(0x53000000));
- ssef d = _mm_cvtepi32_ps(b);
- ssef e = _mm_sub_ps(_mm_castsi128_ps(c), _mm_castsi128_ps(_mm_set1_epi32(0x53000000)));
- return _mm_add_ps(e, d);
-}
-
-template<size_t S1, size_t S2, size_t S3, size_t S4>
-ccl_device_inline const ssef set_sign_bit(const ssef &a)
-{
- return cast(cast(a) ^ ssei(S1 << 31, S2 << 31, S3 << 31, S4 << 31));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Debug Functions
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_ssef(const char *label, const ssef &a)
-{
- printf(
- "%s: %.8f %.8f %.8f %.8f\n", label, (double)a[0], (double)a[1], (double)a[2], (double)a[3]);
-}
-
-#endif
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/util/ssei.h b/intern/cycles/util/ssei.h
deleted file mode 100644
index 5caf44c967f..00000000000
--- a/intern/cycles/util/ssei.h
+++ /dev/null
@@ -1,633 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2013 Intel Corporation
- * Modifications Copyright 2014-2022 Blender Foundation. */
-
-#ifndef __UTIL_SSEI_H__
-#define __UTIL_SSEI_H__
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __KERNEL_SSE2__
-
-struct sseb;
-struct ssef;
-
-/*! 4-wide SSE integer type. */
-struct ssei {
- typedef sseb Mask; // mask type
- typedef ssei Int; // int type
- typedef ssef Float; // float type
-
- enum { size = 4 }; // number of SIMD elements
- union {
- __m128i m128;
- int32_t i[4];
- }; // data
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline ssei()
- {
- }
- __forceinline ssei(const ssei &a)
- {
- m128 = a.m128;
- }
- __forceinline ssei &operator=(const ssei &a)
- {
- m128 = a.m128;
- return *this;
- }
-
- __forceinline ssei(const __m128i a) : m128(a)
- {
- }
- __forceinline operator const __m128i &(void) const
- {
- return m128;
- }
- __forceinline operator __m128i &(void)
- {
- return m128;
- }
-
- __forceinline ssei(const int a) : m128(_mm_set1_epi32(a))
- {
- }
- __forceinline ssei(int a, int b, int c, int d) : m128(_mm_setr_epi32(a, b, c, d))
- {
- }
-
- __forceinline explicit ssei(const __m128 a) : m128(_mm_cvtps_epi32(a))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline const int32_t &operator[](const size_t index) const
- {
- assert(index < 4);
- return i[index];
- }
- __forceinline int32_t &operator[](const size_t index)
- {
- assert(index < 4);
- return i[index];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssei cast(const __m128 &a)
-{
- return _mm_castps_si128(a);
-}
-__forceinline const ssei operator+(const ssei &a)
-{
- return a;
-}
-__forceinline const ssei operator-(const ssei &a)
-{
- return _mm_sub_epi32(_mm_setzero_si128(), a.m128);
-}
-# if defined(__KERNEL_SSSE3__)
-__forceinline const ssei abs(const ssei &a)
-{
- return _mm_abs_epi32(a.m128);
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssei operator+(const ssei &a, const ssei &b)
-{
- return _mm_add_epi32(a.m128, b.m128);
-}
-__forceinline const ssei operator+(const ssei &a, const int32_t &b)
-{
- return a + ssei(b);
-}
-__forceinline const ssei operator+(const int32_t &a, const ssei &b)
-{
- return ssei(a) + b;
-}
-
-__forceinline const ssei operator-(const ssei &a, const ssei &b)
-{
- return _mm_sub_epi32(a.m128, b.m128);
-}
-__forceinline const ssei operator-(const ssei &a, const int32_t &b)
-{
- return a - ssei(b);
-}
-__forceinline const ssei operator-(const int32_t &a, const ssei &b)
-{
- return ssei(a) - b;
-}
-
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssei operator*(const ssei &a, const ssei &b)
-{
- return _mm_mullo_epi32(a.m128, b.m128);
-}
-__forceinline const ssei operator*(const ssei &a, const int32_t &b)
-{
- return a * ssei(b);
-}
-__forceinline const ssei operator*(const int32_t &a, const ssei &b)
-{
- return ssei(a) * b;
-}
-# endif
-
-__forceinline const ssei operator&(const ssei &a, const ssei &b)
-{
- return _mm_and_si128(a.m128, b.m128);
-}
-__forceinline const ssei operator&(const ssei &a, const int32_t &b)
-{
- return a & ssei(b);
-}
-__forceinline const ssei operator&(const int32_t &a, const ssei &b)
-{
- return ssei(a) & b;
-}
-
-__forceinline const ssei operator|(const ssei &a, const ssei &b)
-{
- return _mm_or_si128(a.m128, b.m128);
-}
-__forceinline const ssei operator|(const ssei &a, const int32_t &b)
-{
- return a | ssei(b);
-}
-__forceinline const ssei operator|(const int32_t &a, const ssei &b)
-{
- return ssei(a) | b;
-}
-
-__forceinline const ssei operator^(const ssei &a, const ssei &b)
-{
- return _mm_xor_si128(a.m128, b.m128);
-}
-__forceinline const ssei operator^(const ssei &a, const int32_t &b)
-{
- return a ^ ssei(b);
-}
-__forceinline const ssei operator^(const int32_t &a, const ssei &b)
-{
- return ssei(a) ^ b;
-}
-
-__forceinline const ssei operator<<(const ssei &a, const int32_t &n)
-{
- return _mm_slli_epi32(a.m128, n);
-}
-__forceinline const ssei operator>>(const ssei &a, const int32_t &n)
-{
- return _mm_srai_epi32(a.m128, n);
-}
-
-__forceinline const ssei andnot(const ssei &a, const ssei &b)
-{
- return _mm_andnot_si128(a.m128, b.m128);
-}
-__forceinline const ssei andnot(const sseb &a, const ssei &b)
-{
- return _mm_andnot_si128(cast(a.m128), b.m128);
-}
-__forceinline const ssei andnot(const ssei &a, const sseb &b)
-{
- return _mm_andnot_si128(a.m128, cast(b.m128));
-}
-
-__forceinline const ssei sra(const ssei &a, const int32_t &b)
-{
- return _mm_srai_epi32(a.m128, b);
-}
-__forceinline const ssei srl(const ssei &a, const int32_t &b)
-{
- return _mm_srli_epi32(a.m128, b);
-}
-
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssei min(const ssei &a, const ssei &b)
-{
- return _mm_min_epi32(a.m128, b.m128);
-}
-__forceinline const ssei min(const ssei &a, const int32_t &b)
-{
- return min(a, ssei(b));
-}
-__forceinline const ssei min(const int32_t &a, const ssei &b)
-{
- return min(ssei(a), b);
-}
-
-__forceinline const ssei max(const ssei &a, const ssei &b)
-{
- return _mm_max_epi32(a.m128, b.m128);
-}
-__forceinline const ssei max(const ssei &a, const int32_t &b)
-{
- return max(a, ssei(b));
-}
-__forceinline const ssei max(const int32_t &a, const ssei &b)
-{
- return max(ssei(a), b);
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssei &operator+=(ssei &a, const ssei &b)
-{
- return a = a + b;
-}
-__forceinline ssei &operator+=(ssei &a, const int32_t &b)
-{
- return a = a + b;
-}
-
-__forceinline ssei &operator-=(ssei &a, const ssei &b)
-{
- return a = a - b;
-}
-__forceinline ssei &operator-=(ssei &a, const int32_t &b)
-{
- return a = a - b;
-}
-
-# if defined(__KERNEL_SSE41__)
-__forceinline ssei &operator*=(ssei &a, const ssei &b)
-{
- return a = a * b;
-}
-__forceinline ssei &operator*=(ssei &a, const int32_t &b)
-{
- return a = a * b;
-}
-# endif
-
-__forceinline ssei &operator&=(ssei &a, const ssei &b)
-{
- return a = a & b;
-}
-__forceinline ssei &operator&=(ssei &a, const int32_t &b)
-{
- return a = a & b;
-}
-
-__forceinline ssei &operator|=(ssei &a, const ssei &b)
-{
- return a = a | b;
-}
-__forceinline ssei &operator|=(ssei &a, const int32_t &b)
-{
- return a = a | b;
-}
-
-__forceinline ssei &operator^=(ssei &a, const ssei &b)
-{
- return a = a ^ b;
-}
-__forceinline ssei &operator^=(ssei &a, const int32_t &b)
-{
- return a = a ^ b;
-}
-
-__forceinline ssei &operator<<=(ssei &a, const int32_t &b)
-{
- return a = a << b;
-}
-__forceinline ssei &operator>>=(ssei &a, const int32_t &b)
-{
- return a = a >> b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator==(const ssei &a, const ssei &b)
-{
- return _mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128));
-}
-__forceinline const sseb operator==(const ssei &a, const int32_t &b)
-{
- return a == ssei(b);
-}
-__forceinline const sseb operator==(const int32_t &a, const ssei &b)
-{
- return ssei(a) == b;
-}
-
-__forceinline const sseb operator!=(const ssei &a, const ssei &b)
-{
- return !(a == b);
-}
-__forceinline const sseb operator!=(const ssei &a, const int32_t &b)
-{
- return a != ssei(b);
-}
-__forceinline const sseb operator!=(const int32_t &a, const ssei &b)
-{
- return ssei(a) != b;
-}
-
-__forceinline const sseb operator<(const ssei &a, const ssei &b)
-{
- return _mm_castsi128_ps(_mm_cmplt_epi32(a.m128, b.m128));
-}
-__forceinline const sseb operator<(const ssei &a, const int32_t &b)
-{
- return a < ssei(b);
-}
-__forceinline const sseb operator<(const int32_t &a, const ssei &b)
-{
- return ssei(a) < b;
-}
-
-__forceinline const sseb operator>=(const ssei &a, const ssei &b)
-{
- return !(a < b);
-}
-__forceinline const sseb operator>=(const ssei &a, const int32_t &b)
-{
- return a >= ssei(b);
-}
-__forceinline const sseb operator>=(const int32_t &a, const ssei &b)
-{
- return ssei(a) >= b;
-}
-
-__forceinline const sseb operator>(const ssei &a, const ssei &b)
-{
- return _mm_castsi128_ps(_mm_cmpgt_epi32(a.m128, b.m128));
-}
-__forceinline const sseb operator>(const ssei &a, const int32_t &b)
-{
- return a > ssei(b);
-}
-__forceinline const sseb operator>(const int32_t &a, const ssei &b)
-{
- return ssei(a) > b;
-}
-
-__forceinline const sseb operator<=(const ssei &a, const ssei &b)
-{
- return !(a > b);
-}
-__forceinline const sseb operator<=(const ssei &a, const int32_t &b)
-{
- return a <= ssei(b);
-}
-__forceinline const sseb operator<=(const int32_t &a, const ssei &b)
-{
- return ssei(a) <= b;
-}
-
-__forceinline const ssei select(const sseb &m, const ssei &t, const ssei &f)
-{
-# ifdef __KERNEL_SSE41__
- return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
-# else
- return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f));
-# endif
-}
-
-__forceinline const ssei select(const int mask, const ssei &t, const ssei &f)
-{
-# if defined(__KERNEL_SSE41__) && \
- ((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER))
- return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
-# else
- return select(sseb(mask), t, f);
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssei unpacklo(const ssei &a, const ssei &b)
-{
- return _mm_unpacklo_epi32(a, b);
-}
-__forceinline ssei unpackhi(const ssei &a, const ssei &b)
-{
- return _mm_unpackhi_epi32(a, b);
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const ssei shuffle(const ssei &a)
-{
-# ifdef __KERNEL_NEON__
- int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a));
- return vreinterpretq_m128i_s32(result);
-# else
- return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
-# endif
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const ssei shuffle(const ssei &a, const ssei &b)
-{
-# ifdef __KERNEL_NEON__
- int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a),
- vreinterpretq_s32_m128i(b));
- return vreinterpretq_m128i_s32(result);
-# else
- return _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
-# endif
-}
-
-template<size_t i0> __forceinline const ssei shuffle(const ssei &b)
-{
- return shuffle<i0, i0, i0, i0>(b);
-}
-
-# if defined(__KERNEL_SSE41__)
-template<size_t src> __forceinline int extract(const ssei &b)
-{
- return _mm_extract_epi32(b, src);
-}
-template<size_t dst> __forceinline const ssei insert(const ssei &a, const int32_t b)
-{
- return _mm_insert_epi32(a, b, dst);
-}
-# else
-template<size_t src> __forceinline int extract(const ssei &b)
-{
- return b[src];
-}
-template<size_t dst> __forceinline const ssei insert(const ssei &a, const int32_t b)
-{
- ssei c = a;
- c[dst] = b;
- return c;
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reductions
-////////////////////////////////////////////////////////////////////////////////
-
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssei vreduce_min(const ssei &v)
-{
- ssei h = min(shuffle<1, 0, 3, 2>(v), v);
- return min(shuffle<2, 3, 0, 1>(h), h);
-}
-__forceinline const ssei vreduce_max(const ssei &v)
-{
- ssei h = max(shuffle<1, 0, 3, 2>(v), v);
- return max(shuffle<2, 3, 0, 1>(h), h);
-}
-__forceinline const ssei vreduce_add(const ssei &v)
-{
- ssei h = shuffle<1, 0, 3, 2>(v) + v;
- return shuffle<2, 3, 0, 1>(h) + h;
-}
-
-__forceinline int reduce_min(const ssei &v)
-{
-# ifdef __KERNEL_NEON__
- return vminvq_s32(vreinterpretq_s32_m128i(v));
-# else
- return extract<0>(vreduce_min(v));
-# endif
-}
-__forceinline int reduce_max(const ssei &v)
-{
-# ifdef __KERNEL_NEON__
- return vmaxvq_s32(vreinterpretq_s32_m128i(v));
-# else
- return extract<0>(vreduce_max(v));
-# endif
-}
-__forceinline int reduce_add(const ssei &v)
-{
-# ifdef __KERNEL_NEON__
- return vaddvq_s32(vreinterpretq_s32_m128i(v));
-# else
- return extract<0>(vreduce_add(v));
-# endif
-}
-
-__forceinline uint32_t select_min(const ssei &v)
-{
- return __bsf(movemask(v == vreduce_min(v)));
-}
-__forceinline uint32_t select_max(const ssei &v)
-{
- return __bsf(movemask(v == vreduce_max(v)));
-}
-
-__forceinline uint32_t select_min(const sseb &valid, const ssei &v)
-{
- const ssei a = select(valid, v, ssei((int)pos_inf));
- return __bsf(movemask(valid & (a == vreduce_min(a))));
-}
-__forceinline uint32_t select_max(const sseb &valid, const ssei &v)
-{
- const ssei a = select(valid, v, ssei((int)neg_inf));
- return __bsf(movemask(valid & (a == vreduce_max(a))));
-}
-
-# else
-
-__forceinline int ssei_min(int a, int b)
-{
- return (a < b) ? a : b;
-}
-__forceinline int ssei_max(int a, int b)
-{
- return (a > b) ? a : b;
-}
-__forceinline int reduce_min(const ssei &v)
-{
- return ssei_min(ssei_min(v[0], v[1]), ssei_min(v[2], v[3]));
-}
-__forceinline int reduce_max(const ssei &v)
-{
- return ssei_max(ssei_max(v[0], v[1]), ssei_max(v[2], v[3]));
-}
-__forceinline int reduce_add(const ssei &v)
-{
- return v[0] + v[1] + v[2] + v[3];
-}
-
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Memory load and store operations
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssei load4i(const void *const a)
-{
- return _mm_load_si128((__m128i *)a);
-}
-
-__forceinline void store4i(void *ptr, const ssei &v)
-{
- _mm_store_si128((__m128i *)ptr, v);
-}
-
-__forceinline void storeu4i(void *ptr, const ssei &v)
-{
- _mm_storeu_si128((__m128i *)ptr, v);
-}
-
-__forceinline void store4i(const sseb &mask, void *ptr, const ssei &i)
-{
-# if defined(__KERNEL_AVX__)
- _mm_maskstore_ps((float *)ptr, (__m128i)mask, _mm_castsi128_ps(i));
-# else
- *(ssei *)ptr = select(mask, i, *(ssei *)ptr);
-# endif
-}
-
-__forceinline ssei load4i_nt(void *ptr)
-{
-# if defined(__KERNEL_SSE41__)
- return _mm_stream_load_si128((__m128i *)ptr);
-# else
- return _mm_load_si128((__m128i *)ptr);
-# endif
-}
-
-__forceinline void store4i_nt(void *ptr, const ssei &v)
-{
-# if defined(__KERNEL_SSE41__)
- _mm_stream_ps((float *)ptr, _mm_castsi128_ps(v));
-# else
- _mm_store_si128((__m128i *)ptr, v);
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Debug Functions
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_ssei(const char *label, const ssei &a)
-{
- printf("%s: %df %df %df %d\n", label, a[0], a[1], a[2], a[3]);
-}
-
-#endif
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp
index cb985c65dd8..84116262437 100644
--- a/intern/cycles/util/transform.cpp
+++ b/intern/cycles/util/transform.cpp
@@ -102,7 +102,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
return projection_identity();
}
- memcpy(&tfmR, R, sizeof(R));
+ memcpy(&tfmR.x[0], R, sizeof(R));
return tfmR;
}
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index 24184dc7074..0c39901a63c 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -63,17 +63,16 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
{
/* TODO(sergey): Disabled for now, causes crashes in certain cases. */
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
- ssef x, y, z, w, aa;
- aa = a.m128;
+ const float4 aa(a.m128);
- x = _mm_loadu_ps(&t->x.x);
- y = _mm_loadu_ps(&t->y.x);
- z = _mm_loadu_ps(&t->z.x);
- w = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
+ float4 x(_mm_loadu_ps(&t->x.x));
+ float4 y(_mm_loadu_ps(&t->y.x));
+ float4 z(_mm_loadu_ps(&t->z.x));
+ float4 w(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f));
- _MM_TRANSPOSE4_PS(x, y, z, w);
+ _MM_TRANSPOSE4_PS(x.m128, y.m128, z.m128, w.m128);
- ssef tmp = w;
+ float4 tmp = w;
tmp = madd(shuffle<2>(aa), z, tmp);
tmp = madd(shuffle<1>(aa), y, tmp);
tmp = madd(shuffle<0>(aa), x, tmp);
@@ -94,16 +93,16 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
ccl_device_inline float3 transform_direction(ccl_private const Transform *t, const float3 a)
{
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
- ssef x, y, z, w, aa;
- aa = a.m128;
- x = _mm_loadu_ps(&t->x.x);
- y = _mm_loadu_ps(&t->y.x);
- z = _mm_loadu_ps(&t->z.x);
- w = _mm_setzero_ps();
+ const float4 aa(a.m128);
- _MM_TRANSPOSE4_PS(x, y, z, w);
+ float4 x(_mm_loadu_ps(&t->x.x));
+ float4 y(_mm_loadu_ps(&t->y.x));
+ float4 z(_mm_loadu_ps(&t->z.x));
+ float4 w(_mm_setzero_ps());
- ssef tmp = shuffle<2>(aa) * z;
+ _MM_TRANSPOSE4_PS(x.m128, y.m128, z.m128, w.m128);
+
+ float4 tmp = shuffle<2>(aa) * z;
tmp = madd(shuffle<1>(aa), y, tmp);
tmp = madd(shuffle<0>(aa), x, tmp);
@@ -197,14 +196,7 @@ ccl_device_inline Transform make_transform_frame(float3 N)
return make_transform(dx.x, dx.y, dx.z, 0.0f, dy.x, dy.y, dy.z, 0.0f, N.x, N.y, N.z, 0.0f);
}
-#ifndef __KERNEL_GPU__
-
-ccl_device_inline Transform transform_zero()
-{
- Transform zero = {zero_float4(), zero_float4(), zero_float4()};
- return zero;
-}
-
+#if !defined(__KERNEL_METAL__)
ccl_device_inline Transform operator*(const Transform a, const Transform b)
{
float4 c_x = make_float4(b.x.x, b.y.x, b.z.x, 0.0f);
@@ -219,6 +211,15 @@ ccl_device_inline Transform operator*(const Transform a, const Transform b)
return t;
}
+#endif
+
+#ifndef __KERNEL_GPU__
+
+ccl_device_inline Transform transform_zero()
+{
+ Transform zero = {zero_float4(), zero_float4(), zero_float4()};
+ return zero;
+}
ccl_device_inline void print_transform(const char *label, const Transform &t)
{
diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h
index bb410a6daef..2faac576d82 100644
--- a/intern/cycles/util/transform_inverse.h
+++ b/intern/cycles/util/transform_inverse.h
@@ -9,26 +9,33 @@ CCL_NAMESPACE_BEGIN
* Normally we don't use SSE41/AVX outside the kernel, but for this it's
* important to match exactly for ray tracing precision. */
-ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
+ccl_device_forceinline float3 transform_inverse_cross(const float3 a_, const float3 b_)
{
#if defined(__AVX2__) && defined(__KERNEL_SSE2__)
- const ssef sse_a = (const __m128 &)a;
- const ssef sse_b = (const __m128 &)b;
- const ssef r = shuffle<1, 2, 0, 3>(
- ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
+ const __m128 a = (const __m128 &)a_;
+ const __m128 b = (const __m128 &)b_;
+ const __m128 a_shuffle = _mm_castsi128_ps(
+ _mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(3, 0, 2, 1)));
+ const __m128 b_shuffle = _mm_castsi128_ps(
+ _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(3, 0, 2, 1)));
+ const __m128 r = _mm_castsi128_ps(
+ _mm_shuffle_epi32(_mm_castps_si128(_mm_fmsub_ps(a, b_shuffle, _mm_mul_ps(a_shuffle, b))),
+ _MM_SHUFFLE(3, 0, 2, 1)));
return (const float3 &)r;
#endif
- return cross(a, b);
+ return cross(a_, b_);
}
-ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
+ccl_device_forceinline float transform_inverse_dot(const float3 a_, const float3 b_)
{
-#ifdef __SSE4_1__
- return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+ const __m128 a = (const __m128 &)a_;
+ const __m128 b = (const __m128 &)b_;
+ return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
#endif
- return dot(a, b);
+ return dot(a_, b_);
}
ccl_device_forceinline Transform transform_inverse_impl(const Transform tfm)
diff --git a/intern/cycles/util/types.h b/intern/cycles/util/types.h
index 1ab6f76f9bc..cf7f35c4116 100644
--- a/intern/cycles/util/types.h
+++ b/intern/cycles/util/types.h
@@ -97,6 +97,7 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a)
#include "util/types_int2.h"
#include "util/types_int3.h"
#include "util/types_int4.h"
+#include "util/types_int8.h"
#include "util/types_uint2.h"
#include "util/types_uint3.h"
@@ -119,6 +120,7 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a)
#include "util/types_int2_impl.h"
#include "util/types_int3_impl.h"
#include "util/types_int4_impl.h"
+#include "util/types_int8_impl.h"
#include "util/types_uint2_impl.h"
#include "util/types_uint3_impl.h"
@@ -129,16 +131,4 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a)
#include "util/types_float4_impl.h"
#include "util/types_float8_impl.h"
-/* SSE types. */
-#ifndef __KERNEL_GPU__
-# include "util/sseb.h"
-# include "util/ssef.h"
-# include "util/ssei.h"
-# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
-# include "util/avxb.h"
-# include "util/avxf.h"
-# include "util/avxi.h"
-# endif
-#endif
-
#endif /* __UTIL_TYPES_H__ */
diff --git a/intern/cycles/util/types_float8.h b/intern/cycles/util/types_float8.h
index 29fd632f08e..121141ddfd9 100644
--- a/intern/cycles/util/types_float8.h
+++ b/intern/cycles/util/types_float8.h
@@ -11,15 +11,15 @@
CCL_NAMESPACE_BEGIN
/* float8 is a reserved type in Metal that has not been implemented. For
- * that reason this is named float8_t and not using native vector types. */
+ * that reason this is named vfloat8 and not using native vector types. */
#ifdef __KERNEL_GPU__
-struct float8_t
+struct vfloat8
#else
-struct ccl_try_align(32) float8_t
+struct ccl_try_align(32) vfloat8
#endif
{
-#ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX__
union {
__m256 m256;
struct {
@@ -27,18 +27,18 @@ struct ccl_try_align(32) float8_t
};
};
- __forceinline float8_t();
- __forceinline float8_t(const float8_t &a);
- __forceinline explicit float8_t(const __m256 &a);
+ __forceinline vfloat8();
+ __forceinline vfloat8(const vfloat8 &a);
+ __forceinline explicit vfloat8(const __m256 &a);
__forceinline operator const __m256 &() const;
__forceinline operator __m256 &();
- __forceinline float8_t &operator=(const float8_t &a);
+ __forceinline vfloat8 &operator=(const vfloat8 &a);
-#else /* __KERNEL_AVX2__ */
+#else /* __KERNEL_AVX__ */
float a, b, c, d, e, f, g, h;
-#endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX__ */
#ifndef __KERNEL_GPU__
__forceinline float operator[](int i) const;
@@ -46,8 +46,11 @@ struct ccl_try_align(32) float8_t
#endif
};
-ccl_device_inline float8_t make_float8_t(float f);
-ccl_device_inline float8_t
-make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h);
+ccl_device_inline vfloat8 make_vfloat8(float f);
+ccl_device_inline vfloat8
+make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h);
+ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b);
+
+ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a);
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_float8_impl.h b/intern/cycles/util/types_float8_impl.h
index e8576cdaf70..9f42e0f663c 100644
--- a/intern/cycles/util/types_float8_impl.h
+++ b/intern/cycles/util/types_float8_impl.h
@@ -10,45 +10,45 @@
CCL_NAMESPACE_BEGIN
-#ifdef __KERNEL_AVX2__
-__forceinline float8_t::float8_t()
+#ifdef __KERNEL_AVX__
+__forceinline vfloat8::vfloat8()
{
}
-__forceinline float8_t::float8_t(const float8_t &f) : m256(f.m256)
+__forceinline vfloat8::vfloat8(const vfloat8 &f) : m256(f.m256)
{
}
-__forceinline float8_t::float8_t(const __m256 &f) : m256(f)
+__forceinline vfloat8::vfloat8(const __m256 &f) : m256(f)
{
}
-__forceinline float8_t::operator const __m256 &() const
+__forceinline vfloat8::operator const __m256 &() const
{
return m256;
}
-__forceinline float8_t::operator __m256 &()
+__forceinline vfloat8::operator __m256 &()
{
return m256;
}
-__forceinline float8_t &float8_t::operator=(const float8_t &f)
+__forceinline vfloat8 &vfloat8::operator=(const vfloat8 &f)
{
m256 = f.m256;
return *this;
}
-#endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX__ */
#ifndef __KERNEL_GPU__
-__forceinline float float8_t::operator[](int i) const
+__forceinline float vfloat8::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
-__forceinline float &float8_t::operator[](int i)
+__forceinline float &vfloat8::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 8);
@@ -56,25 +56,50 @@ __forceinline float &float8_t::operator[](int i)
}
#endif
-ccl_device_inline float8_t make_float8_t(float f)
+ccl_device_inline vfloat8 make_vfloat8(float f)
{
-#ifdef __KERNEL_AVX2__
- float8_t r(_mm256_set1_ps(f));
+#ifdef __KERNEL_AVX__
+ vfloat8 r(_mm256_set1_ps(f));
#else
- float8_t r = {f, f, f, f, f, f, f, f};
+ vfloat8 r = {f, f, f, f, f, f, f, f};
#endif
return r;
}
-ccl_device_inline float8_t
-make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h)
+ccl_device_inline vfloat8
+make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h)
{
-#ifdef __KERNEL_AVX2__
- float8_t r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
+#ifdef __KERNEL_AVX__
+ vfloat8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
#else
- float8_t r = {a, b, c, d, e, f, g, h};
+ vfloat8 r = {a, b, c, d, e, f, g, h};
#endif
return r;
}
+ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b)
+{
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1));
+#else
+ return make_vfloat8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
+#endif
+}
+
+ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a)
+{
+#ifdef __KERNEL_PRINTF__
+ printf("%s: %.8f %.8f %.8f %.8f %.8f %.8f %.8f %.8f\n",
+ label,
+ (double)a.a,
+ (double)a.b,
+ (double)a.c,
+ (double)a.d,
+ (double)a.e,
+ (double)a.f,
+ (double)a.g,
+ (double)a.h);
+#endif
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_int8.h b/intern/cycles/util/types_int8.h
new file mode 100644
index 00000000000..8643ebe96ad
--- /dev/null
+++ b/intern/cycles/util/types_int8.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+struct vfloat8;
+
+#ifdef __KERNEL_GPU__
+struct vint8
+#else
+struct ccl_try_align(32) vint8
+#endif
+{
+#ifdef __KERNEL_AVX__
+ union {
+ __m256i m256;
+ struct {
+ int a, b, c, d, e, f, g, h;
+ };
+ };
+
+ __forceinline vint8();
+ __forceinline vint8(const vint8 &a);
+ __forceinline explicit vint8(const __m256i &a);
+
+ __forceinline operator const __m256i &() const;
+ __forceinline operator __m256i &();
+
+ __forceinline vint8 &operator=(const vint8 &a);
+#else /* __KERNEL_AVX__ */
+ int a, b, c, d, e, f, g, h;
+#endif /* __KERNEL_AVX__ */
+
+#ifndef __KERNEL_GPU__
+ __forceinline int operator[](int i) const;
+ __forceinline int &operator[](int i);
+#endif
+};
+
+ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h);
+ccl_device_inline vint8 make_vint8(int i);
+ccl_device_inline vint8 make_vint8(const vfloat8 f);
+ccl_device_inline vint8 make_vint8(const int4 a, const int4 b);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_int8_impl.h b/intern/cycles/util/types_int8_impl.h
new file mode 100644
index 00000000000..080bcaa6a2b
--- /dev/null
+++ b/intern/cycles/util/types_int8_impl.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_AVX__
+__forceinline vint8::vint8()
+{
+}
+
+__forceinline vint8::vint8(const vint8 &a) : m256(a.m256)
+{
+}
+
+__forceinline vint8::vint8(const __m256i &a) : m256(a)
+{
+}
+
+__forceinline vint8::operator const __m256i &() const
+{
+ return m256;
+}
+
+__forceinline vint8::operator __m256i &()
+{
+ return m256;
+}
+
+__forceinline vint8 &vint8::operator=(const vint8 &a)
+{
+ m256 = a.m256;
+ return *this;
+}
+#endif /* __KERNEL_AVX__ */
+
+#ifndef __KERNEL_GPU__
+__forceinline int vint8::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 8);
+ return *(&a + i);
+}
+
+__forceinline int &vint8::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 8);
+ return *(&a + i);
+}
+#endif
+
+ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_set_epi32(h, g, f, e, d, c, b, a));
+#else
+ return {a, b, c, d, e, f, g, h};
+#endif
+}
+
+ccl_device_inline vint8 make_vint8(int i)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_set1_epi32(i));
+#else
+ return make_vint8(i, i, i, i, i, i, i, i);
+#endif
+}
+
+ccl_device_inline vint8 make_vint8(const vfloat8 f)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_cvtps_epi32(f.m256));
+#else
+ return make_vint8(
+ (int)f.a, (int)f.b, (int)f.c, (int)f.d, (int)f.e, (int)f.f, (int)f.g, (int)f.h);
+#endif
+}
+
+ccl_device_inline vint8 make_vint8(const int4 a, const int4 b)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128), b.m128, 1));
+#else
+ return make_vint8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
+#endif
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/ffmpeg/CMakeLists.txt b/intern/ffmpeg/CMakeLists.txt
index 0de8496f3f3..4fb5df9d4cd 100644
--- a/intern/ffmpeg/CMakeLists.txt
+++ b/intern/ffmpeg/CMakeLists.txt
@@ -6,6 +6,7 @@ if(WITH_GTESTS)
tests/ffmpeg_codecs.cc
)
set(TEST_INC
+ .
)
set(TEST_INC_SYS
${FFMPEG_INCLUDE_DIRS}
diff --git a/intern/ffmpeg/ffmpeg_compat.h b/intern/ffmpeg/ffmpeg_compat.h
index f311e04d8e0..f7d87af8bca 100644
--- a/intern/ffmpeg/ffmpeg_compat.h
+++ b/intern/ffmpeg/ffmpeg_compat.h
@@ -36,6 +36,14 @@
# define FFMPEG_INLINE static inline
#endif
+#if (LIBAVFORMAT_VERSION_MAJOR < 59)
+/* For versions older than ffmpeg 5.0, use the old channel layout variables.
+ * We intend to only keep this workaround for around two releases (3.5, 3.6).
+ * If it sticks around any longer, then we should consider refactoring this.
+ */
+# define FFMPEG_USE_OLD_CHANNEL_VARS
+#endif
+
#if (LIBAVFORMAT_VERSION_MAJOR < 58) || \
((LIBAVFORMAT_VERSION_MAJOR == 58) && (LIBAVFORMAT_VERSION_MINOR < 76))
# define FFMPEG_USE_DURATION_WORKAROUND 1
diff --git a/intern/ffmpeg/tests/ffmpeg_codecs.cc b/intern/ffmpeg/tests/ffmpeg_codecs.cc
index 10cbe4b938b..cd06917f59b 100644
--- a/intern/ffmpeg/tests/ffmpeg_codecs.cc
+++ b/intern/ffmpeg/tests/ffmpeg_codecs.cc
@@ -3,6 +3,8 @@
#include "testing/testing.h"
extern "C" {
+#include "ffmpeg_compat.h"
+
#include <libavcodec/avcodec.h>
#include <libavutil/channel_layout.h>
#include <libavutil/log.h>
@@ -40,7 +42,11 @@ bool test_acodec(const AVCodec *codec, AVSampleFormat fmt)
if (ctx) {
ctx->sample_fmt = fmt;
ctx->sample_rate = 48000;
+#ifdef FFMPEG_USE_OLD_CHANNEL_VARS
+ ctx->channel_layout = AV_CH_LAYOUT_MONO;
+#else
av_channel_layout_from_mask(&ctx->ch_layout, AV_CH_LAYOUT_MONO);
+#endif
ctx->bit_rate = 128000;
int open = avcodec_open2(ctx, codec, NULL);
if (open >= 0) {
diff --git a/intern/ghost/intern/GHOST_SystemWayland.cpp b/intern/ghost/intern/GHOST_SystemWayland.cpp
index 3a0ba5cd21a..528aa6e1884 100644
--- a/intern/ghost/intern/GHOST_SystemWayland.cpp
+++ b/intern/ghost/intern/GHOST_SystemWayland.cpp
@@ -1157,6 +1157,18 @@ static void gwl_registry_entry_update_all(GWL_Display *display, const int interf
/** \name Private Utility Functions
* \{ */
+static void ghost_wl_display_report_error(struct wl_display *display)
+{
+ int ecode = wl_display_get_error(display);
+ GHOST_ASSERT(ecode, "Error not set!");
+ if ((ecode == EPIPE || ecode == ECONNRESET)) {
+ fprintf(stderr, "The Wayland connection broke. Did the Wayland compositor die?\n");
+ }
+ else {
+ fprintf(stderr, "The Wayland connection experienced a fatal error: %s\n", strerror(ecode));
+ }
+}
+
/**
* Callback for WAYLAND to run when there is an error.
*
@@ -1264,6 +1276,12 @@ static GHOST_TKey xkb_map_gkey(const xkb_keysym_t sym)
GXMAP(gkey, XKB_KEY_XF86AudioStop, GHOST_kKeyMediaStop);
GXMAP(gkey, XKB_KEY_XF86AudioPrev, GHOST_kKeyMediaFirst);
GXMAP(gkey, XKB_KEY_XF86AudioNext, GHOST_kKeyMediaLast);
+
+ /* Additional keys for non US layouts. */
+
+ /* Uses the same physical key as #XKB_KEY_KP_Decimal for QWERTZ layout, see: T102287. */
+ GXMAP(gkey, XKB_KEY_KP_Separator, GHOST_kKeyNumpadPeriod);
+
default:
/* Rely on #xkb_map_gkey_or_scan_code to report when no key can be found. */
gkey = GHOST_kKeyUnknown;
@@ -4979,6 +4997,42 @@ static const struct wl_registry_listener registry_listener = {
/** \} */
/* -------------------------------------------------------------------- */
+/** \name Listener (Display), #wl_display_listener
+ * \{ */
+
+static CLG_LogRef LOG_WL_DISPLAY = {"ghost.wl.handle.display"};
+#define LOG (&LOG_WL_DISPLAY)
+
+static void display_handle_error(
+ void *data, struct wl_display *wl_display, void *object_id, uint32_t code, const char *message)
+{
+ GWL_Display *display = static_cast<GWL_Display *>(data);
+ GHOST_ASSERT(display->wl_display == wl_display, "Invalid internal state");
+ (void)display;
+
+ /* NOTE: code is #wl_display_error, there isn't a convenient way to convert to an ID. */
+ CLOG_INFO(LOG, 2, "error (code=%u, object_id=%p, message=%s)", code, object_id, message);
+}
+
+static void display_handle_delete_id(void *data, struct wl_display *wl_display, uint32_t id)
+{
+ GWL_Display *display = static_cast<GWL_Display *>(data);
+ GHOST_ASSERT(display->wl_display == wl_display, "Invalid internal state");
+ (void)display;
+
+ CLOG_INFO(LOG, 2, "delete_id (id=%u)", id);
+}
+
+static const struct wl_display_listener display_listener = {
+ display_handle_error,
+ display_handle_delete_id,
+};
+
+#undef LOG
+
+/** \} */
+
+/* -------------------------------------------------------------------- */
/** \name GHOST Implementation
*
* WAYLAND specific implementation of the #GHOST_System interface.
@@ -5000,6 +5054,8 @@ GHOST_SystemWayland::GHOST_SystemWayland(bool background)
/* This may be removed later if decorations are required, needed as part of registration. */
display_->xdg_decor = new GWL_XDG_Decor_System;
+ wl_display_add_listener(display_->wl_display, &display_listener, display_);
+
/* Register interfaces. */
{
display_->registry_skip_update_all = true;
@@ -5059,6 +5115,8 @@ GHOST_SystemWayland::GHOST_SystemWayland(bool background)
}
}
else
+#else
+ (void)background;
#endif
{
GWL_XDG_Decor_System &decor = *display_->xdg_decor;
@@ -5106,10 +5164,14 @@ bool GHOST_SystemWayland::processEvents(bool waitForEvent)
#endif /* WITH_INPUT_NDOF */
if (waitForEvent) {
- wl_display_dispatch(display_->wl_display);
+ if (wl_display_dispatch(display_->wl_display) == -1) {
+ ghost_wl_display_report_error(display_->wl_display);
+ }
}
else {
- wl_display_roundtrip(display_->wl_display);
+ if (wl_display_roundtrip(display_->wl_display) == -1) {
+ ghost_wl_display_report_error(display_->wl_display);
+ }
}
if (getEventManager()->getNumEvents() > 0) {
diff --git a/intern/ghost/intern/GHOST_WindowWayland.cpp b/intern/ghost/intern/GHOST_WindowWayland.cpp
index ad94a02b514..9d62c69edef 100644
--- a/intern/ghost/intern/GHOST_WindowWayland.cpp
+++ b/intern/ghost/intern/GHOST_WindowWayland.cpp
@@ -219,7 +219,7 @@ static void xdg_toplevel_handle_close(void *data, xdg_toplevel * /*xdg_toplevel*
static_cast<GWL_Window *>(data)->ghost_window->close();
}
-static const xdg_toplevel_listener toplevel_listener = {
+static const xdg_toplevel_listener xdg_toplevel_listener = {
xdg_toplevel_handle_configure,
xdg_toplevel_handle_close,
};
@@ -322,7 +322,7 @@ static void xdg_toplevel_decoration_handle_configure(
static_cast<GWL_Window *>(data)->xdg_decor->mode = (zxdg_toplevel_decoration_v1_mode)mode;
}
-static const zxdg_toplevel_decoration_v1_listener toplevel_decoration_v1_listener = {
+static const zxdg_toplevel_decoration_v1_listener xdg_toplevel_decoration_v1_listener = {
xdg_toplevel_decoration_handle_configure,
};
@@ -418,7 +418,7 @@ static void surface_handle_leave(void *data,
}
}
-static struct wl_surface_listener wl_surface_listener = {
+static const struct wl_surface_listener wl_surface_listener = {
surface_handle_enter,
surface_handle_leave,
};
@@ -483,7 +483,7 @@ GHOST_WindowWayland::GHOST_WindowWayland(GHOST_SystemWayland *system,
wl_surface_set_buffer_scale(window_->wl_surface, window_->scale);
- wl_surface_add_listener(window_->wl_surface, &wl_surface_listener, this);
+ wl_surface_add_listener(window_->wl_surface, &wl_surface_listener, window_);
window_->egl_window = wl_egl_window_create(
window_->wl_surface, int(window_->size[0]), int(window_->size[1]));
@@ -537,13 +537,13 @@ GHOST_WindowWayland::GHOST_WindowWayland(GHOST_SystemWayland *system,
decor.toplevel_decor = zxdg_decoration_manager_v1_get_toplevel_decoration(
system_->xdg_decor_manager(), decor.toplevel);
zxdg_toplevel_decoration_v1_add_listener(
- decor.toplevel_decor, &toplevel_decoration_v1_listener, window_);
+ decor.toplevel_decor, &xdg_toplevel_decoration_v1_listener, window_);
zxdg_toplevel_decoration_v1_set_mode(decor.toplevel_decor,
ZXDG_TOPLEVEL_DECORATION_V1_MODE_SERVER_SIDE);
}
xdg_surface_add_listener(decor.surface, &xdg_surface_listener, window_);
- xdg_toplevel_add_listener(decor.toplevel, &toplevel_listener, window_);
+ xdg_toplevel_add_listener(decor.toplevel, &xdg_toplevel_listener, window_);
if (parentWindow && is_dialog) {
WGL_XDG_Decor_Window &decor_parent =
diff --git a/intern/mantaflow/intern/MANTA_main.cpp b/intern/mantaflow/intern/MANTA_main.cpp
index 5708cdc81aa..d94c8943f78 100644
--- a/intern/mantaflow/intern/MANTA_main.cpp
+++ b/intern/mantaflow/intern/MANTA_main.cpp
@@ -58,7 +58,6 @@ MANTA::MANTA(int *res, FluidModifierData *fmd)
mUsingDiffusion = (fds->flags & FLUID_DOMAIN_USE_DIFFUSION) && mUsingLiquid;
mUsingViscosity = (fds->flags & FLUID_DOMAIN_USE_VISCOSITY) && mUsingLiquid;
mUsingMVel = (fds->flags & FLUID_DOMAIN_USE_SPEED_VECTORS) && mUsingLiquid;
- mUsingGuiding = (fds->flags & FLUID_DOMAIN_USE_GUIDE);
mUsingDrops = (fds->particle_type & FLUID_DOMAIN_PARTICLE_SPRAY) && mUsingLiquid;
mUsingBubbles = (fds->particle_type & FLUID_DOMAIN_PARTICLE_BUBBLE) && mUsingLiquid;
mUsingFloats = (fds->particle_type & FLUID_DOMAIN_PARTICLE_FOAM) && mUsingLiquid;
@@ -68,6 +67,7 @@ MANTA::MANTA(int *res, FluidModifierData *fmd)
mUsingFire = (fds->active_fields & FLUID_DOMAIN_ACTIVE_FIRE) && mUsingSmoke;
mUsingColors = (fds->active_fields & FLUID_DOMAIN_ACTIVE_COLORS) && mUsingSmoke;
mUsingObstacle = (fds->active_fields & FLUID_DOMAIN_ACTIVE_OBSTACLE);
+ mUsingGuiding = (fds->active_fields & FLUID_DOMAIN_ACTIVE_GUIDE);
mUsingInvel = (fds->active_fields & FLUID_DOMAIN_ACTIVE_INVEL);
mUsingOutflow = (fds->active_fields & FLUID_DOMAIN_ACTIVE_OUTFLOW);
diff --git a/intern/wayland_dynload/extern/wayland_dynload_client.h b/intern/wayland_dynload/extern/wayland_dynload_client.h
index d80ef5c9f0c..bf1e2f89c18 100644
--- a/intern/wayland_dynload/extern/wayland_dynload_client.h
+++ b/intern/wayland_dynload/extern/wayland_dynload_client.h
@@ -16,6 +16,7 @@ WAYLAND_DYNLOAD_FN(wl_display_disconnect)
WAYLAND_DYNLOAD_FN(wl_display_dispatch)
WAYLAND_DYNLOAD_FN(wl_display_roundtrip)
WAYLAND_DYNLOAD_FN(wl_display_flush)
+WAYLAND_DYNLOAD_FN(wl_display_get_error)
WAYLAND_DYNLOAD_FN(wl_log_set_handler_client)
WAYLAND_DYNLOAD_FN(wl_proxy_add_listener)
WAYLAND_DYNLOAD_FN(wl_proxy_destroy)
@@ -68,6 +69,7 @@ struct WaylandDynload_Client {
int WL_DYN_FN(wl_display_dispatch)(struct wl_display *display);
int WL_DYN_FN(wl_display_roundtrip)(struct wl_display *display);
int WL_DYN_FN(wl_display_flush)(struct wl_display *display);
+ int WL_DYN_FN(wl_display_get_error)(struct wl_display *display);
void WL_DYN_FN(wl_log_set_handler_client)(wl_log_func_t);
int WL_DYN_FN(wl_proxy_add_listener)(struct wl_proxy *proxy,
void (**implementation)(void),
@@ -103,6 +105,7 @@ struct WaylandDynload_Client {
# define wl_display_dispatch(...) (*wayland_dynload_client.wl_display_dispatch)(__VA_ARGS__)
# define wl_display_roundtrip(...) (*wayland_dynload_client.wl_display_roundtrip)(__VA_ARGS__)
# define wl_display_flush(...) (*wayland_dynload_client.wl_display_flush)(__VA_ARGS__)
+# define wl_display_get_error(...) (*wayland_dynload_client.wl_display_get_error)(__VA_ARGS__)
# define wl_log_set_handler_client(...) \
(*wayland_dynload_client.wl_log_set_handler_client)(__VA_ARGS__)
# define wl_proxy_add_listener(...) \