diff options
author | Hans Goudey <h.goudey@me.com> | 2022-11-11 04:14:48 +0300 |
---|---|---|
committer | Hans Goudey <h.goudey@me.com> | 2022-11-11 04:41:13 +0300 |
commit | 9465b109af0b70244a36f9e68493e316d9f8b56f (patch) | |
tree | c4d933bf09f8313071556063156538ec3356b24c /intern/cycles | |
parent | 026d21a225521670c6b5083da9da61227da69e65 (diff) | |
parent | ca1642cd0c5cdf634fe2022c955d93983de95934 (diff) |
Merge branch 'master' into refactor-mesh-position-generic
Diffstat (limited to 'intern/cycles')
84 files changed, 5355 insertions, 5230 deletions
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py index 05f27bdbd4d..354c9c23a53 100644 --- a/intern/cycles/blender/addon/__init__.py +++ b/intern/cycles/blender/addon/__init__.py @@ -58,7 +58,7 @@ class CyclesRender(bpy.types.RenderEngine): if not self.session: if self.is_preview: cscene = bpy.context.scene.cycles - use_osl = cscene.shading_system and cscene.device == 'CPU' + use_osl = cscene.shading_system engine.create(self, data, preview_osl=use_osl) else: diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index e33891fa7a2..4ac078ed8a5 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -156,6 +156,11 @@ def with_osl(): return _cycles.with_osl +def osl_version(): + import _cycles + return _cycles.osl_version + + def with_path_guiding(): import _cycles return _cycles.with_path_guiding diff --git a/intern/cycles/blender/addon/operators.py b/intern/cycles/blender/addon/operators.py index ab474cda0ab..3680d11359e 100644 --- a/intern/cycles/blender/addon/operators.py +++ b/intern/cycles/blender/addon/operators.py @@ -114,7 +114,7 @@ class CYCLES_OT_denoise_animation(Operator): class CYCLES_OT_merge_images(Operator): - "Combine OpenEXR multilayer images rendered with different sample " \ + "Combine OpenEXR multi-layer images rendered with different sample " \ "ranges into one image with reduced noise" bl_idname = "cycles.merge_images" bl_label = "Merge Images" diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index f5cd88f6b6a..9d7c71417f2 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -290,7 +290,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): ) shading_system: BoolProperty( name="Open Shading Language", - description="Use Open Shading Language (CPU rendering only)", + description="Use Open Shading Language", ) 
preview_pause: BoolProperty( diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 305accc8f1a..10a37688f45 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -2305,7 +2305,10 @@ def draw_device(self, context): col.prop(cscene, "device") from . import engine - if engine.with_osl() and use_cpu(context): + if engine.with_osl() and ( + use_cpu(context) or + (use_optix(context) and (engine.osl_version()[1] >= 13 or engine.osl_version()[0] > 1)) + ): col.prop(cscene, "shading_system") diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp index 01c021551f3..c9764d1c21b 100644 --- a/intern/cycles/device/cuda/device_impl.cpp +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -232,7 +232,7 @@ string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features) return cflags; } -string CUDADevice::compile_kernel(const uint kernel_features, +string CUDADevice::compile_kernel(const string &common_cflags, const char *name, const char *base, bool force_ptx) @@ -281,7 +281,6 @@ string CUDADevice::compile_kernel(const uint kernel_features, /* We include cflags into md5 so changing cuda toolkit or changing other * compiler command line arguments makes sure cubin gets re-built. */ - string common_cflags = compile_kernel_get_common_cflags(kernel_features); const string kernel_md5 = util_md5_string(source_md5 + common_cflags); const char *const kernel_ext = force_ptx ? 
"ptx" : "cubin"; @@ -417,7 +416,8 @@ bool CUDADevice::load_kernels(const uint kernel_features) /* get kernel */ const char *kernel_name = "kernel"; - string cubin = compile_kernel(kernel_features, kernel_name); + string cflags = compile_kernel_get_common_cflags(kernel_features); + string cubin = compile_kernel(cflags, kernel_name); if (cubin.empty()) return false; diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h index a754c33f79d..c18f2811161 100644 --- a/intern/cycles/device/cuda/device_impl.h +++ b/intern/cycles/device/cuda/device_impl.h @@ -77,9 +77,9 @@ class CUDADevice : public Device { bool use_adaptive_compilation(); - virtual string compile_kernel_get_common_cflags(const uint kernel_features); + string compile_kernel_get_common_cflags(const uint kernel_features); - string compile_kernel(const uint kernel_features, + string compile_kernel(const string &cflags, const char *name, const char *base = "cuda", bool force_ptx = false); diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 2e4d18241cf..06a2f5c7b01 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -160,6 +160,11 @@ class Device { return true; } + virtual bool load_osl_kernels() + { + return true; + } + /* GPU device only functions. * These may not be used on CPU or multi-devices. 
*/ diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h index 9afef3789af..efdc15dca79 100644 --- a/intern/cycles/device/hip/device_impl.h +++ b/intern/cycles/device/hip/device_impl.h @@ -74,7 +74,7 @@ class HIPDevice : public Device { bool use_adaptive_compilation(); - virtual string compile_kernel_get_common_cflags(const uint kernel_features); + string compile_kernel_get_common_cflags(const uint kernel_features); string compile_kernel(const uint kernel_features, const char *name, const char *base = "hip"); diff --git a/intern/cycles/device/kernel.cpp b/intern/cycles/device/kernel.cpp index 96a99cd62cd..27ca0d81817 100644 --- a/intern/cycles/device/kernel.cpp +++ b/intern/cycles/device/kernel.cpp @@ -7,6 +7,30 @@ CCL_NAMESPACE_BEGIN +bool device_kernel_has_shading(DeviceKernel kernel) +{ + return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW || + kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE || + kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND || + kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY); +} + +bool device_kernel_has_intersection(DeviceKernel kernel) +{ + return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE); +} + const char *device_kernel_as_string(DeviceKernel kernel) { switch (kernel) { diff --git a/intern/cycles/device/kernel.h b/intern/cycles/device/kernel.h 
index 4ae461f1f67..b829a891260 100644 --- a/intern/cycles/device/kernel.h +++ b/intern/cycles/device/kernel.h @@ -11,6 +11,9 @@ CCL_NAMESPACE_BEGIN +bool device_kernel_has_shading(DeviceKernel kernel); +bool device_kernel_has_intersection(DeviceKernel kernel); + const char *device_kernel_as_string(DeviceKernel kernel); std::ostream &operator<<(std::ostream &os, DeviceKernel kernel); diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 55938d1a03a..dc8af9a5358 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -45,6 +45,36 @@ bool kernel_has_intersection(DeviceKernel device_kernel) struct ShaderCache { ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice) { + /* Initialize occupancy tuning LUT. */ + if (MetalInfo::get_device_vendor(mtlDevice) == METAL_GPU_APPLE) { + switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) { + default: + case APPLE_M2: + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024}; + break; + case APPLE_M1: + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128}; + 
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832}; + break; + } + } } ~ShaderCache(); @@ -73,6 +103,11 @@ struct ShaderCache { std::function<void(MetalKernelPipeline *)> completionHandler; }; + struct OccupancyTuningParameters { + int threads_per_threadgroup = 0; + int num_threads_per_block = 0; + } occupancy_tuning[DEVICE_KERNEL_NUM]; + std::mutex cache_mutex; PipelineCollection pipelines[DEVICE_KERNEL_NUM]; @@ -230,6 +265,13 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel, request.pipeline->device_kernel = device_kernel; request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup; + if (occupancy_tuning[device_kernel].threads_per_threadgroup) { + request.pipeline->threads_per_threadgroup = + occupancy_tuning[device_kernel].threads_per_threadgroup; + request.pipeline->num_threads_per_block = + occupancy_tuning[device_kernel].num_threads_per_block; + } + /* metalrt options */ request.pipeline->use_metalrt = device->use_metalrt; request.pipeline->metalrt_hair = device->use_metalrt && @@ -374,13 +416,6 @@ void MetalKernelPipeline::compile() const std::string function_name = std::string("cycles_metal_") + device_kernel_as_string(device_kernel); - int threads_per_threadgroup = this->threads_per_threadgroup; - if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && - device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) { - /* Always use 512 for the sorting kernels */ - threads_per_threadgroup = 512; - } - NSString *entryPoint = [@(function_name.c_str()) copy]; NSError 
*error = NULL; @@ -644,12 +679,14 @@ void MetalKernelPipeline::compile() return; } - int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup, - computePipelineState.threadExecutionWidth); - num_threads_per_block = std::max(num_threads_per_block, - (int)computePipelineState.threadExecutionWidth); + if (!num_threads_per_block) { + num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup, + computePipelineState.threadExecutionWidth); + num_threads_per_block = std::max(num_threads_per_block, + (int)computePipelineState.threadExecutionWidth); + } + this->pipeline = computePipelineState; - this->num_threads_per_block = num_threads_per_block; if (@available(macOS 11.0, *)) { if (creating_new_archive || recreate_archive) { diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp index 6904d2c2dc6..9605c6a7538 100644 --- a/intern/cycles/device/multi/device.cpp +++ b/intern/cycles/device/multi/device.cpp @@ -138,6 +138,15 @@ class MultiDevice : public Device { return true; } + bool load_osl_kernels() override + { + foreach (SubDevice &sub, devices) + if (!sub.device->load_osl_kernels()) + return false; + + return true; + } + void build_bvh(BVH *bvh, Progress &progress, bool refit) override { /* Try to build and share a single acceleration structure, if possible */ @@ -204,10 +213,12 @@ class MultiDevice : public Device { virtual void *get_cpu_osl_memory() override { - if (devices.size() > 1) { + /* Always return the OSL memory of the CPU device (this works since the constructor above + * guarantees that CPU devices are always added to the back). 
*/ + if (devices.size() > 1 && devices.back().device->info.type != DEVICE_CPU) { return NULL; } - return devices.front().device->get_cpu_osl_memory(); + return devices.back().device->get_cpu_osl_memory(); } bool is_resident(device_ptr key, Device *sub_device) override diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp index 68ca21374fd..58b72374a7d 100644 --- a/intern/cycles/device/optix/device.cpp +++ b/intern/cycles/device/optix/device.cpp @@ -9,6 +9,10 @@ #include "util/log.h" +#ifdef WITH_OSL +# include <OSL/oslversion.h> +#endif + #ifdef WITH_OPTIX # include <optix_function_table_definition.h> #endif @@ -65,6 +69,9 @@ void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo info.type = DEVICE_OPTIX; info.id += "_OptiX"; +# if defined(WITH_OSL) && (OSL_VERSION_MINOR >= 13 || OSL_VERSION_MAJOR > 1) + info.has_osl = true; +# endif info.denoisers |= DENOISER_OPTIX; devices.push_back(info); diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp index 6c64e7106d5..02f34bf3bd0 100644 --- a/intern/cycles/device/optix/device_impl.cpp +++ b/intern/cycles/device/optix/device_impl.cpp @@ -312,16 +312,34 @@ OptiXDevice::~OptiXDevice() if (optix_module != NULL) { optixModuleDestroy(optix_module); } - for (unsigned int i = 0; i < 2; ++i) { + for (int i = 0; i < 2; ++i) { if (builtin_modules[i] != NULL) { optixModuleDestroy(builtin_modules[i]); } } - for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { + for (int i = 0; i < NUM_PIPELINES; ++i) { if (pipelines[i] != NULL) { optixPipelineDestroy(pipelines[i]); } } + for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + if (groups[i] != NULL) { + optixProgramGroupDestroy(groups[i]); + } + } + +# ifdef WITH_OSL + for (const OptixModule &module : osl_modules) { + if (module != NULL) { + optixModuleDestroy(module); + } + } + for (const OptixProgramGroup &group : osl_groups) { + if (group != NULL) { + 
optixProgramGroupDestroy(group); + } + } +# endif /* Make sure denoiser is destroyed before device context! */ if (denoiser_.optix_denoiser != nullptr) { @@ -381,13 +399,51 @@ bool OptiXDevice::load_kernels(const uint kernel_features) return false; } +# ifdef WITH_OSL + const bool use_osl = (kernel_features & KERNEL_FEATURE_OSL); +# else + const bool use_osl = false; +# endif + + /* Skip creating OptiX module if only doing denoising. */ + const bool need_optix_kernels = (kernel_features & + (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING)); + + /* Detect existence of OptiX kernel and SDK here early. So we can error out + * before compiling the CUDA kernels, to avoid failing right after when + * compiling the OptiX kernel. */ + string suffix = use_osl ? "_osl" : + (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ? + "_shader_raytrace" : + ""; + string ptx_filename; + if (need_optix_kernels) { + ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx"); + if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { + std::string optix_include_dir = get_optix_include_dir(); + if (optix_include_dir.empty()) { + set_error( + "Unable to compile OptiX kernels at runtime. Set OPTIX_ROOT_DIR environment variable " + "to a directory containing the OptiX SDK."); + return false; + } + else if (!path_is_directory(optix_include_dir)) { + set_error(string_printf( + "OptiX headers not found at %s, unable to compile OptiX kernels at runtime. Install " + "OptiX SDK in the specified location, or set OPTIX_ROOT_DIR environment variable to a " + "directory containing the OptiX SDK.", + optix_include_dir.c_str())); + return false; + } + } + } + /* Load CUDA modules because we need some of the utility kernels. */ if (!CUDADevice::load_kernels(kernel_features)) { return false; } - /* Skip creating OptiX module if only doing denoising. 
*/ - if (!(kernel_features & (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING))) { + if (!need_optix_kernels) { return true; } @@ -398,18 +454,41 @@ bool OptiXDevice::load_kernels(const uint kernel_features) optixModuleDestroy(optix_module); optix_module = NULL; } - for (unsigned int i = 0; i < 2; ++i) { + for (int i = 0; i < 2; ++i) { if (builtin_modules[i] != NULL) { optixModuleDestroy(builtin_modules[i]); builtin_modules[i] = NULL; } } - for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { + for (int i = 0; i < NUM_PIPELINES; ++i) { if (pipelines[i] != NULL) { optixPipelineDestroy(pipelines[i]); pipelines[i] = NULL; } } + for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + if (groups[i] != NULL) { + optixProgramGroupDestroy(groups[i]); + groups[i] = NULL; + } + } + +# ifdef WITH_OSL + /* Recreating base OptiX module invalidates all OSL modules too, since they link against it. */ + for (const OptixModule &module : osl_modules) { + if (module != NULL) { + optixModuleDestroy(module); + } + } + osl_modules.clear(); + + for (const OptixProgramGroup &group : osl_groups) { + if (group != NULL) { + optixProgramGroupDestroy(group); + } + } + osl_groups.clear(); +# endif OptixModuleCompileOptions module_options = {}; module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */ @@ -430,7 +509,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features) module_options.numPayloadTypes = 0; # endif - OptixPipelineCompileOptions pipeline_options = {}; /* Default to no motion blur and two-level graph, since it is the fastest option. 
*/ pipeline_options.usesMotionBlur = false; pipeline_options.traversableGraphFlags = @@ -459,9 +537,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features) /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds * This is necessary since objects may be reported to have motion if the Vector pass is * active, but may still need to be rendered without motion blur if that isn't active as well. */ - motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0; - - if (motion_blur) { + if (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) { pipeline_options.usesMotionBlur = true; /* Motion blur can insert motion transforms into the traversal graph. * It is no longer a two-level graph then, so need to set flags to allow any configuration. */ @@ -469,33 +545,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features) } { /* Load and compile PTX module with OptiX kernels. */ - string ptx_data, ptx_filename = path_get( - (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ? - "lib/kernel_optix_shader_raytrace.ptx" : - "lib/kernel_optix.ptx"); + string ptx_data; if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { - std::string optix_include_dir = get_optix_include_dir(); - if (optix_include_dir.empty()) { - set_error( - "Unable to compile OptiX kernels at runtime. Set OPTIX_ROOT_DIR environment variable " - "to a directory containing the OptiX SDK."); - return false; - } - else if (!path_is_directory(optix_include_dir)) { - set_error(string_printf( - "OptiX headers not found at %s, unable to compile OptiX kernels at runtime. Install " - "OptiX SDK in the specified location, or set OPTIX_ROOT_DIR environment variable to a " - "directory containing the OptiX SDK.", - optix_include_dir.c_str())); - return false; - } - ptx_filename = compile_kernel( - kernel_features, - (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ? 
- "kernel_shader_raytrace" : - "kernel", - "optix", - true); + string cflags = compile_kernel_get_common_cflags(kernel_features); + ptx_filename = compile_kernel(cflags, ("kernel" + suffix).c_str(), "optix", true); } if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) { set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str())); @@ -537,7 +590,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features) } /* Create program groups. */ - OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {}; OptixProgramGroupOptions group_options = {}; /* There are no options currently. */ group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; @@ -595,7 +647,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features) group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0]; group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr; - if (motion_blur) { + if (pipeline_options.usesMotionBlur) { builtin_options.usesMotionBlur = true; optix_assert(optixBuiltinISModuleGet( @@ -616,7 +668,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features) } } - /* Pointclouds */ if (kernel_features & KERNEL_FEATURE_POINTCLOUD) { group_descs[PG_HITD_POINTCLOUD] = group_descs[PG_HITD]; group_descs[PG_HITD_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; @@ -628,8 +679,8 @@ bool OptiXDevice::load_kernels(const uint kernel_features) group_descs[PG_HITS_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point"; } + /* Add hit group for local intersections. */ if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) { - /* Add hit group for local intersections. 
*/ group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; group_descs[PG_HITL].hitgroup.moduleAH = optix_module; group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit"; @@ -641,16 +692,19 @@ bool OptiXDevice::load_kernels(const uint kernel_features) group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module; group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName = "__raygen__kernel_optix_integrator_shade_surface_raytrace"; - group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module; - group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao"; - group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module; - group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC = - "__direct_callable__svm_node_bevel"; + + /* Kernels with OSL support are built without SVM, so can skip those direct callables there. */ + if (!use_osl) { + group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module; + group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao"; + group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module; + group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC = + "__direct_callable__svm_node_bevel"; + } } - /* MNEE. 
*/ if (kernel_features & KERNEL_FEATURE_MNEE) { group_descs[PG_RGEN_SHADE_SURFACE_MNEE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.module = optix_module; @@ -658,6 +712,42 @@ bool OptiXDevice::load_kernels(const uint kernel_features) "__raygen__kernel_optix_integrator_shade_surface_mnee"; } + /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */ + if (use_osl) { + group_descs[PG_RGEN_SHADE_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_background"; + group_descs[PG_RGEN_SHADE_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_LIGHT].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_LIGHT].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_light"; + group_descs[PG_RGEN_SHADE_SURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_SURFACE].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_SURFACE].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_surface"; + group_descs[PG_RGEN_SHADE_VOLUME].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_VOLUME].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_VOLUME].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_volume"; + group_descs[PG_RGEN_SHADE_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_SHADOW].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_SHADOW].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_shadow"; + group_descs[PG_RGEN_EVAL_DISPLACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_EVAL_DISPLACE].raygen.module = optix_module; + group_descs[PG_RGEN_EVAL_DISPLACE].raygen.entryFunctionName = + 
"__raygen__kernel_optix_shader_eval_displace"; + group_descs[PG_RGEN_EVAL_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.module = optix_module; + group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.entryFunctionName = + "__raygen__kernel_optix_shader_eval_background"; + group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.module = optix_module; + group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.entryFunctionName = + "__raygen__kernel_optix_shader_eval_curve_shadow_transparency"; + } + optix_assert(optixProgramGroupCreate( context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); @@ -666,7 +756,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features) /* Set up SBT, which in this case is used only to select between different programs. */ sbt_data.alloc(NUM_PROGRAM_GROUPS); memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS); - for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i])); } @@ -690,25 +780,26 @@ bool OptiXDevice::load_kernels(const uint kernel_features) OptixPipelineLinkOptions link_options = {}; link_options.maxTraceDepth = 1; + link_options.debugLevel = module_options.debugLevel; - if (DebugFlags().optix.use_debug) { - link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; - } - else { - link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE; - } - - if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { - /* Create shader raytracing pipeline. */ + if (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE) && !use_osl) { + /* Create shader raytracing and MNEE pipeline. 
*/ vector<OptixProgramGroup> pipeline_groups; pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]); + if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]); + pipeline_groups.push_back(groups[PG_CALL_SVM_AO]); + pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]); + } + if (kernel_features & KERNEL_FEATURE_MNEE) { + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]); + } pipeline_groups.push_back(groups[PG_MISS]); pipeline_groups.push_back(groups[PG_HITD]); pipeline_groups.push_back(groups[PG_HITS]); pipeline_groups.push_back(groups[PG_HITL]); pipeline_groups.push_back(groups[PG_HITV]); - if (motion_blur) { + if (pipeline_options.usesMotionBlur) { pipeline_groups.push_back(groups[PG_HITD_MOTION]); pipeline_groups.push_back(groups[PG_HITS_MOTION]); } @@ -716,8 +807,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]); pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]); } - pipeline_groups.push_back(groups[PG_CALL_SVM_AO]); - pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]); optix_assert(optixPipelineCreate(context, &pipeline_options, @@ -726,30 +815,33 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_groups.size(), nullptr, 0, - &pipelines[PIP_SHADE_RAYTRACE])); + &pipelines[PIP_SHADE])); /* Combine ray generation and trace continuation stack size. */ - const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG + + const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG, + stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG) + link_options.maxTraceDepth * trace_css; const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC, stack_size[PG_CALL_SVM_BEVEL].dssDC); /* Set stack size depending on pipeline options. 
*/ optix_assert(optixPipelineSetStackSize( - pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2)); + pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2)); } - if (kernel_features & KERNEL_FEATURE_MNEE) { - /* Create MNEE pipeline. */ + { /* Create intersection-only pipeline. */ vector<OptixProgramGroup> pipeline_groups; pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]); pipeline_groups.push_back(groups[PG_MISS]); pipeline_groups.push_back(groups[PG_HITD]); pipeline_groups.push_back(groups[PG_HITS]); pipeline_groups.push_back(groups[PG_HITL]); pipeline_groups.push_back(groups[PG_HITV]); - if (motion_blur) { + if (pipeline_options.usesMotionBlur) { pipeline_groups.push_back(groups[PG_HITD_MOTION]); pipeline_groups.push_back(groups[PG_HITS_MOTION]); } @@ -757,8 +849,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]); pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]); } - pipeline_groups.push_back(groups[PG_CALL_SVM_AO]); - pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]); optix_assert(optixPipelineCreate(context, &pipeline_options, @@ -767,37 +857,234 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_groups.size(), nullptr, 0, - &pipelines[PIP_SHADE_MNEE])); + &pipelines[PIP_INTERSECT])); - /* Combine ray generation and trace continuation stack size. */ - const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG + - link_options.maxTraceDepth * trace_css; - const unsigned int dss = 0; + /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. 
*/ + const unsigned int css = + std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG, + std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG, + std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG, + stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) + + link_options.maxTraceDepth * trace_css; - /* Set stack size depending on pipeline options. */ - optix_assert( - optixPipelineSetStackSize(pipelines[PIP_SHADE_MNEE], 0, dss, css, motion_blur ? 3 : 2)); + optix_assert(optixPipelineSetStackSize( + pipelines[PIP_INTERSECT], 0, 0, css, pipeline_options.usesMotionBlur ? 3 : 2)); } - { /* Create intersection-only pipeline. */ + return !have_error(); +} + +bool OptiXDevice::load_osl_kernels() +{ +# ifdef WITH_OSL + if (have_error()) { + return false; + } + + struct OSLKernel { + string ptx; + string init_entry; + string exec_entry; + }; + + /* This has to be in the same order as the ShaderType enum, so that the index calculation in + * osl_eval_nodes checks out */ + vector<OSLKernel> osl_kernels; + + for (ShaderType type = SHADER_TYPE_SURFACE; type <= SHADER_TYPE_BUMP; + type = static_cast<ShaderType>(type + 1)) { + const vector<OSL::ShaderGroupRef> &groups = (type == SHADER_TYPE_SURFACE ? + osl_globals.surface_state : + type == SHADER_TYPE_VOLUME ? + osl_globals.volume_state : + type == SHADER_TYPE_DISPLACEMENT ? 
+ osl_globals.displacement_state : + osl_globals.bump_state); + for (const OSL::ShaderGroupRef &group : groups) { + if (group) { + string osl_ptx, init_name, entry_name; + osl_globals.ss->getattribute(group.get(), "group_init_name", init_name); + osl_globals.ss->getattribute(group.get(), "group_entry_name", entry_name); + osl_globals.ss->getattribute( + group.get(), "ptx_compiled_version", OSL::TypeDesc::PTR, &osl_ptx); + + int groupdata_size = 0; + osl_globals.ss->getattribute(group.get(), "groupdata_size", groupdata_size); + if (groupdata_size > 2048) { /* See 'group_data' array in kernel/osl/osl.h */ + set_error( + string_printf("Requested OSL group data size (%d) is greater than the maximum " + "supported with OptiX (2048)", + groupdata_size)); + return false; + } + + osl_kernels.push_back({std::move(osl_ptx), std::move(init_name), std::move(entry_name)}); + } + else { + /* Add empty entry for non-existent shader groups, so that the index stays stable. */ + osl_kernels.emplace_back(); + } + } + } + + const CUDAContextScope scope(this); + + if (pipelines[PIP_SHADE]) { + optixPipelineDestroy(pipelines[PIP_SHADE]); + } + + for (OptixModule &module : osl_modules) { + if (module != NULL) { + optixModuleDestroy(module); + module = NULL; + } + } + for (OptixProgramGroup &group : osl_groups) { + if (group != NULL) { + optixProgramGroupDestroy(group); + group = NULL; + } + } + + OptixProgramGroupOptions group_options = {}; /* There are no options currently. */ + OptixModuleCompileOptions module_options = {}; + module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE; + + osl_groups.resize(osl_kernels.size() * 2 + 1); + osl_modules.resize(osl_kernels.size() + 1); + + { /* Load and compile PTX module with OSL services. 
*/ + string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx"); + if (!path_read_text(ptx_filename, ptx_data)) { + set_error(string_printf("Failed to load OptiX OSL services kernel from '%s'", + ptx_filename.c_str())); + return false; + } + + const OptixResult result = optixModuleCreateFromPTX(context, + &module_options, + &pipeline_options, + ptx_data.data(), + ptx_data.size(), + nullptr, + 0, + &osl_modules.back()); + if (result != OPTIX_SUCCESS) { + set_error(string_printf("Failed to load OptiX OSL services kernel from '%s' (%s)", + ptx_filename.c_str(), + optixGetErrorName(result))); + return false; + } + + OptixProgramGroupDesc group_desc = {}; + group_desc.kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_desc.callables.entryFunctionNameDC = "__direct_callable__dummy_services"; + group_desc.callables.moduleDC = osl_modules.back(); + + optix_assert(optixProgramGroupCreate( + context, &group_desc, 1, &group_options, nullptr, 0, &osl_groups.back())); + } + + TaskPool pool; + vector<OptixResult> results(osl_kernels.size(), OPTIX_SUCCESS); + + for (size_t i = 0; i < osl_kernels.size(); ++i) { + if (osl_kernels[i].ptx.empty()) { + continue; + } + +# if OPTIX_ABI_VERSION >= 55 + OptixTask task = nullptr; + results[i] = optixModuleCreateFromPTXWithTasks(context, + &module_options, + &pipeline_options, + osl_kernels[i].ptx.data(), + osl_kernels[i].ptx.size(), + nullptr, + nullptr, + &osl_modules[i], + &task); + if (results[i] == OPTIX_SUCCESS) { + execute_optix_task(pool, task, results[i]); + } +# else + pool.push([this, &results, i, &module_options, &osl_kernels]() { + results[i] = optixModuleCreateFromPTX(context, + &module_options, + &pipeline_options, + osl_kernels[i].ptx.data(), + osl_kernels[i].ptx.size(), + nullptr, + 0, + &osl_modules[i]); + }); +# endif + } + + pool.wait_work(); + + for (size_t i = 0; i < osl_kernels.size(); ++i) { + if (osl_kernels[i].ptx.empty()) { + continue; + } + + if (results[i] != OPTIX_SUCCESS) { + 
set_error(string_printf("Failed to load OptiX OSL kernel for %s (%s)", + osl_kernels[i].init_entry.c_str(), + optixGetErrorName(results[i]))); + return false; + } + + OptixProgramGroupDesc group_descs[2] = {}; + group_descs[0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[0].callables.entryFunctionNameDC = osl_kernels[i].init_entry.c_str(); + group_descs[0].callables.moduleDC = osl_modules[i]; + group_descs[1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[1].callables.entryFunctionNameDC = osl_kernels[i].exec_entry.c_str(); + group_descs[1].callables.moduleDC = osl_modules[i]; + + optix_assert(optixProgramGroupCreate( + context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2])); + } + + vector<OptixStackSizes> osl_stack_size(osl_groups.size()); + + /* Update SBT with new entries. */ + sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size()); + for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); + } + for (size_t i = 0; i < osl_groups.size(); ++i) { + if (osl_groups[i] != NULL) { + optix_assert(optixSbtRecordPackHeader(osl_groups[i], &sbt_data[NUM_PROGRAM_GROUPS + i])); + optix_assert(optixProgramGroupGetStackSize(osl_groups[i], &osl_stack_size[i])); + } + } + sbt_data.copy_to_device(); /* Upload updated SBT to device. 
*/ + + OptixPipelineLinkOptions link_options = {}; + link_options.maxTraceDepth = 0; + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE; + + { vector<OptixProgramGroup> pipeline_groups; pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]); - pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]); - pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]); - pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]); - pipeline_groups.push_back(groups[PG_MISS]); - pipeline_groups.push_back(groups[PG_HITD]); - pipeline_groups.push_back(groups[PG_HITS]); - pipeline_groups.push_back(groups[PG_HITL]); - pipeline_groups.push_back(groups[PG_HITV]); - if (motion_blur) { - pipeline_groups.push_back(groups[PG_HITD_MOTION]); - pipeline_groups.push_back(groups[PG_HITS_MOTION]); - } - if (kernel_features & KERNEL_FEATURE_POINTCLOUD) { - pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]); - pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_BACKGROUND]); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_LIGHT]); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE]); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_VOLUME]); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SHADOW]); + pipeline_groups.push_back(groups[PG_RGEN_EVAL_DISPLACE]); + pipeline_groups.push_back(groups[PG_RGEN_EVAL_BACKGROUND]); + pipeline_groups.push_back(groups[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY]); + + for (const OptixProgramGroup &group : osl_groups) { + if (group != NULL) { + pipeline_groups.push_back(group); + } } optix_assert(optixPipelineCreate(context, @@ -807,26 +1094,30 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_groups.size(), nullptr, 0, - &pipelines[PIP_INTERSECT])); + 
&pipelines[PIP_SHADE])); - /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */ - const unsigned int css = - std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG, - std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG, - std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG, - stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) + - link_options.maxTraceDepth * trace_css; + unsigned int dss = 0; + for (unsigned int i = 0; i < osl_stack_size.size(); ++i) { + dss = std::max(dss, osl_stack_size[i].dssDC); + } - optix_assert( - optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2)); + optix_assert(optixPipelineSetStackSize( + pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2)); } - /* Clean up program group objects. */ - for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { - optixProgramGroupDestroy(groups[i]); - } + return !have_error(); +# else + return false; +# endif +} - return true; +void *OptiXDevice::get_cpu_osl_memory() +{ +# ifdef WITH_OSL + return &osl_globals; +# else + return NULL; +# endif } /* -------------------------------------------------------------------- @@ -1553,7 +1844,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) size_t num_motion_steps = 1; Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && hair->get_use_motion_blur() && motion_keys) { + if (pipeline_options.usesMotionBlur && hair->get_use_motion_blur() && motion_keys) { num_motion_steps = hair->get_motion_steps(); } @@ -1707,7 +1998,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) size_t num_motion_steps = 1; Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && mesh->get_use_motion_blur() && motion_keys) { + if (pipeline_options.usesMotionBlur && mesh->get_use_motion_blur() && motion_keys) { num_motion_steps = mesh->get_motion_steps(); } @@ 
-1774,7 +2065,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) size_t num_motion_steps = 1; Attribute *motion_points = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && pointcloud->get_use_motion_blur() && motion_points) { + if (pipeline_options.usesMotionBlur && pointcloud->get_use_motion_blur() && motion_points) { num_motion_steps = pointcloud->get_motion_steps(); } @@ -1871,7 +2162,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) /* Calculate total motion transform size and allocate memory for them. */ size_t motion_transform_offset = 0; - if (motion_blur) { + if (pipeline_options.usesMotionBlur) { size_t total_motion_transform_size = 0; for (Object *const ob : bvh->objects) { if (ob->is_traceable() && ob->use_motion()) { @@ -1922,7 +2213,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) if (ob->get_geometry()->geometry_type == Geometry::HAIR && static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { - if (motion_blur && ob->get_geometry()->has_motion_blur()) { + if (pipeline_options.usesMotionBlur && ob->get_geometry()->has_motion_blur()) { /* Select between motion blur and non-motion blur built-in intersection module. */ instance.sbtOffset = PG_HITD_MOTION - PG_HITD; } @@ -1950,7 +2241,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) } /* Insert motion traversable if object has motion. 
*/ - if (motion_blur && ob->use_motion()) { + if (pipeline_options.usesMotionBlur && ob->use_motion()) { size_t motion_keys = max(ob->get_motion().size(), (size_t)2) - 2; size_t motion_transform_size = sizeof(OptixSRTMotionTransform) + motion_keys * sizeof(OptixSRTData); diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h index 817afdc8384..ad0e7b93454 100644 --- a/intern/cycles/device/optix/device_impl.h +++ b/intern/cycles/device/optix/device_impl.h @@ -9,6 +9,7 @@ # include "device/cuda/device_impl.h" # include "device/optix/queue.h" # include "device/optix/util.h" +# include "kernel/osl/globals.h" # include "kernel/types.h" # include "util/unique_ptr.h" @@ -23,8 +24,16 @@ enum { PG_RGEN_INTERSECT_SHADOW, PG_RGEN_INTERSECT_SUBSURFACE, PG_RGEN_INTERSECT_VOLUME_STACK, + PG_RGEN_SHADE_BACKGROUND, + PG_RGEN_SHADE_LIGHT, + PG_RGEN_SHADE_SURFACE, PG_RGEN_SHADE_SURFACE_RAYTRACE, PG_RGEN_SHADE_SURFACE_MNEE, + PG_RGEN_SHADE_VOLUME, + PG_RGEN_SHADE_SHADOW, + PG_RGEN_EVAL_DISPLACE, + PG_RGEN_EVAL_BACKGROUND, + PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY, PG_MISS, PG_HITD, /* Default hit group. */ PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */ @@ -40,14 +49,14 @@ enum { }; static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS; -static const int NUM_MIS_PROGRAM_GROUPS = 1; +static const int NUM_MISS_PROGRAM_GROUPS = 1; static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD; static const int NUM_HIT_PROGRAM_GROUPS = 8; static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO; static const int NUM_CALLABLE_PROGRAM_GROUPS = 2; /* List of OptiX pipelines. */ -enum { PIP_SHADE_RAYTRACE, PIP_SHADE_MNEE, PIP_INTERSECT, NUM_PIPELINES }; +enum { PIP_SHADE, PIP_INTERSECT, NUM_PIPELINES }; /* A single shader binding table entry. */ struct SbtRecord { @@ -61,12 +70,20 @@ class OptiXDevice : public CUDADevice { OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. 
*/ OptixModule builtin_modules[2] = {}; OptixPipeline pipelines[NUM_PIPELINES] = {}; + OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; + OptixPipelineCompileOptions pipeline_options = {}; - bool motion_blur = false; device_vector<SbtRecord> sbt_data; device_only_memory<KernelParamsOptiX> launch_params; - OptixTraversableHandle tlas_handle = 0; +# ifdef WITH_OSL + OSLGlobals osl_globals; + vector<OptixModule> osl_modules; + vector<OptixProgramGroup> osl_groups; +# endif + + private: + OptixTraversableHandle tlas_handle = 0; vector<unique_ptr<device_only_memory<char>>> delayed_free_bvh_memory; thread_mutex delayed_free_bvh_mutex; @@ -100,13 +117,14 @@ class OptiXDevice : public CUDADevice { OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); ~OptiXDevice(); - private: BVHLayoutMask get_bvh_layout_mask() const override; - string compile_kernel_get_common_cflags(const uint kernel_features) override; + string compile_kernel_get_common_cflags(const uint kernel_features); bool load_kernels(const uint kernel_features) override; + bool load_osl_kernels() override; + bool build_optix_bvh(BVHOptiX *bvh, OptixBuildOperation operation, const OptixBuildInput &build_input, @@ -123,6 +141,8 @@ class OptiXDevice : public CUDADevice { virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + void *get_cpu_osl_memory() override; + /* -------------------------------------------------------------------- * Denoising. 
*/ diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp index 3bc547ed11d..1bfd154d449 100644 --- a/intern/cycles/device/optix/queue.cpp +++ b/intern/cycles/device/optix/queue.cpp @@ -24,21 +24,33 @@ void OptiXDeviceQueue::init_execution() CUDADeviceQueue::init_execution(); } -static bool is_optix_specific_kernel(DeviceKernel kernel) +static bool is_optix_specific_kernel(DeviceKernel kernel, bool use_osl) { - return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || - kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE || - kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || - kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || - kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || - kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK); +# ifdef WITH_OSL + /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */ + if (use_osl && device_kernel_has_shading(kernel)) { + return true; + } +# else + (void)use_osl; +# endif + + return device_kernel_has_intersection(kernel); } bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, DeviceKernelArguments const &args) { - if (!is_optix_specific_kernel(kernel)) { + OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_); + +# ifdef WITH_OSL + const bool use_osl = static_cast<OSLGlobals *>(optix_device->get_cpu_osl_memory())->use; +# else + const bool use_osl = false; +# endif + + if (!is_optix_specific_kernel(kernel, use_osl)) { return CUDADeviceQueue::enqueue(kernel, work_size, args); } @@ -50,8 +62,6 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const CUDAContextScope scope(cuda_device_); - OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_); - const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer; const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer; @@ -62,9 +72,7 @@ bool 
OptiXDeviceQueue::enqueue(DeviceKernel kernel, sizeof(device_ptr), cuda_stream_)); - if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || - kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || - kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) { + if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || device_kernel_has_shading(kernel)) { cuda_device_assert( cuda_device_, cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer), @@ -72,6 +80,15 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, sizeof(device_ptr), cuda_stream_)); } + if (kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE || + kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND || + kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY) { + cuda_device_assert(cuda_device_, + cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, offset), + args.values[2], // &d_offset + sizeof(int32_t), + cuda_stream_)); + } cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_)); @@ -79,14 +96,35 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, OptixShaderBindingTable sbt_params = {}; switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_BACKGROUND * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_LIGHT * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE * sizeof(SbtRecord); + break; case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: - pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE]; + pipeline = optix_device->pipelines[PIP_SHADE]; sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord); break; case 
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: - pipeline = optix_device->pipelines[PIP_SHADE_MNEE]; + pipeline = optix_device->pipelines[PIP_SHADE]; sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_MNEE * sizeof(SbtRecord); break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_VOLUME * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SHADOW * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: pipeline = optix_device->pipelines[PIP_INTERSECT]; sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord); @@ -104,6 +142,20 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord); break; + case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_DISPLACE * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_BACKGROUND * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY: + pipeline = optix_device->pipelines[PIP_SHADE]; + sbt_params.raygenRecord = sbt_data_ptr + + PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY * sizeof(SbtRecord); + break; + default: LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel) << " is attempted to be enqueued."; @@ -112,7 +164,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord); sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS; + 
sbt_params.missRecordCount = NUM_MISS_PROGRAM_GROUPS; sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord); sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS; @@ -120,6 +172,12 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS; sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); +# ifdef WITH_OSL + if (use_osl) { + sbt_params.callablesRecordCount += static_cast<unsigned int>(optix_device->osl_groups.size()); + } +# endif + /* Launch the ray generation program. */ optix_device_assert(optix_device, optixLaunch(pipeline, diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 3779fdc697a..99f9e536977 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -37,6 +37,14 @@ set(SRC_KERNEL_DEVICE_OPTIX device/optix/kernel_shader_raytrace.cu ) +if(WITH_CYCLES_OSL AND (OSL_LIBRARY_VERSION_MINOR GREATER_EQUAL 13 OR OSL_LIBRARY_VERSION_MAJOR GREATER 1)) + set(SRC_KERNEL_DEVICE_OPTIX + ${SRC_KERNEL_DEVICE_OPTIX} + osl/services_optix.cu + device/optix/kernel_osl.cu + ) +endif() + set(SRC_KERNEL_DEVICE_ONEAPI device/oneapi/kernel.cpp ) @@ -181,6 +189,16 @@ set(SRC_KERNEL_SVM_HEADERS svm/vertex_color.h ) +if(WITH_CYCLES_OSL) + set(SRC_KERNEL_OSL_HEADERS + osl/osl.h + osl/closures_setup.h + osl/closures_template.h + osl/services_gpu.h + osl/types.h + ) +endif() + set(SRC_KERNEL_GEOM_HEADERS geom/geom.h geom/attribute.h @@ -306,6 +324,7 @@ set(SRC_KERNEL_HEADERS ${SRC_KERNEL_GEOM_HEADERS} ${SRC_KERNEL_INTEGRATOR_HEADERS} ${SRC_KERNEL_LIGHT_HEADERS} + ${SRC_KERNEL_OSL_HEADERS} ${SRC_KERNEL_SAMPLE_HEADERS} ${SRC_KERNEL_SVM_HEADERS} ${SRC_KERNEL_TYPES_HEADERS} @@ -328,6 +347,7 @@ set(SRC_UTIL_HEADERS ../util/math_int2.h ../util/math_int3.h ../util/math_int4.h + ../util/math_int8.h ../util/math_matrix.h ../util/projection.h 
../util/rect.h @@ -350,6 +370,8 @@ set(SRC_UTIL_HEADERS ../util/types_int3_impl.h ../util/types_int4.h ../util/types_int4_impl.h + ../util/types_int8.h + ../util/types_int8_impl.h ../util/types_spectrum.h ../util/types_uchar2.h ../util/types_uchar2_impl.h @@ -705,6 +727,16 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) kernel_optix_shader_raytrace "device/optix/kernel_shader_raytrace.cu" "--keep-device-functions") + if(WITH_CYCLES_OSL AND (OSL_LIBRARY_VERSION_MINOR GREATER_EQUAL 13 OR OSL_LIBRARY_VERSION_MAJOR GREATER 1)) + CYCLES_OPTIX_KERNEL_ADD( + kernel_optix_osl + "device/optix/kernel_osl.cu" + "--relocatable-device-code=true") + CYCLES_OPTIX_KERNEL_ADD( + kernel_optix_osl_services + "osl/services_optix.cu" + "--relocatable-device-code=true") + endif() add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx}) cycles_set_solution_folder(cycles_kernel_optix) @@ -992,6 +1024,7 @@ source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS}) source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS}) source_group("kernel" FILES ${SRC_KERNEL_TYPES_HEADERS}) source_group("light" FILES ${SRC_KERNEL_LIGHT_HEADERS}) +source_group("osl" FILES ${SRC_KERNEL_OSL_HEADERS}) source_group("sample" FILES ${SRC_KERNEL_SAMPLE_HEADERS}) source_group("svm" FILES ${SRC_KERNEL_SVM_HEADERS}) source_group("util" FILES ${SRC_KERNEL_UTIL_HEADERS}) @@ -1028,6 +1061,7 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLE delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_LIGHT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/light) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_OSL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/osl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"${SRC_KERNEL_SAMPLE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/sample) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_TYPES_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 71af68aa80e..2f5c5d7bd0c 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -297,8 +297,10 @@ ccl_device_inline void bsdf_roughness_eta(const KernelGlobals kg, ccl_private float2 *roughness, ccl_private float *eta) { +#ifdef __SVM__ bool refractive = false; float alpha = 1.0f; +#endif switch (sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: *roughness = one_float2(); diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp index 01087c96dd6..558431961ab 100644 --- a/intern/cycles/kernel/device/cpu/kernel.cpp +++ b/intern/cycles/kernel/device/cpu/kernel.cpp @@ -7,6 +7,7 @@ * one with SSE2 intrinsics. 
*/ #if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE__ # define __KERNEL_SSE2__ #endif @@ -29,11 +30,15 @@ # define __KERNEL_SSE41__ # endif # ifdef __AVX__ -# define __KERNEL_SSE__ +# ifndef __KERNEL_SSE__ +# define __KERNEL_SSE__ +# endif # define __KERNEL_AVX__ # endif # ifdef __AVX2__ -# define __KERNEL_SSE__ +# ifndef __KERNEL_SSE__ +# define __KERNEL_SSE__ +# endif # define __KERNEL_AVX2__ # endif #endif diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h index 51e1381d552..3a950779c11 100644 --- a/intern/cycles/kernel/device/cuda/compat.h +++ b/intern/cycles/kernel/device/cuda/compat.h @@ -30,6 +30,7 @@ typedef unsigned long long uint64_t; /* Qualifiers */ #define ccl_device __device__ __inline__ +#define ccl_device_extern extern "C" __device__ #if __CUDA_ARCH__ < 500 # define ccl_device_inline __device__ __forceinline__ # define ccl_device_forceinline __device__ __forceinline__ @@ -109,14 +110,14 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D typedef unsigned short half; -__device__ half __float2half(const float f) +ccl_device_forceinline half __float2half(const float f) { half val; asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); return val; } -__device__ float __half2float(const half h) +ccl_device_forceinline float __half2float(const half h) { float val; asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h)); diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h index 648988c31b6..8755395c82c 100644 --- a/intern/cycles/kernel/device/hip/compat.h +++ b/intern/cycles/kernel/device/hip/compat.h @@ -28,6 +28,7 @@ typedef unsigned long long uint64_t; /* Qualifiers */ #define ccl_device __device__ __inline__ +#define ccl_device_extern extern "C" __device__ #define ccl_device_inline __device__ __inline__ #define ccl_device_forceinline __device__ __forceinline__ #define ccl_device_noinline __device__ __noinline__ 
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index f689e93e5a2..2dd6cc98b59 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -38,6 +38,7 @@ using namespace metal::raytracing; # define ccl_device_noinline ccl_device __attribute__((noinline)) #endif +#define ccl_device_extern extern "C" #define ccl_device_noinline_cpu ccl_device #define ccl_device_inline_method ccl_device #define ccl_global device diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h index dfaec65130c..b83512180d7 100644 --- a/intern/cycles/kernel/device/oneapi/compat.h +++ b/intern/cycles/kernel/device/oneapi/compat.h @@ -28,6 +28,7 @@ /* Qualifier wrappers for different names on different devices */ #define ccl_device +#define ccl_device_extern extern "C" #define ccl_global #define ccl_always_inline __attribute__((always_inline)) #define ccl_device_inline inline diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h index 1a11a533b7e..e13101f57b8 100644 --- a/intern/cycles/kernel/device/optix/compat.h +++ b/intern/cycles/kernel/device/optix/compat.h @@ -33,14 +33,16 @@ typedef unsigned long long uint64_t; #endif #define ccl_device \ - __device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything + static __device__ \ + __forceinline__ // Function calls are bad for OptiX performance, so inline everything +#define ccl_device_extern extern "C" __device__ #define ccl_device_inline ccl_device #define ccl_device_forceinline ccl_device -#define ccl_device_inline_method ccl_device -#define ccl_device_noinline __device__ __noinline__ +#define ccl_device_inline_method __device__ __forceinline__ +#define ccl_device_noinline static __device__ __noinline__ #define ccl_device_noinline_cpu ccl_device #define ccl_global -#define ccl_inline_constant __constant__ +#define 
ccl_inline_constant static __constant__ #define ccl_device_constant __constant__ __device__ #define ccl_constant const #define ccl_gpu_shared __shared__ @@ -57,23 +59,6 @@ typedef unsigned long long uint64_t; #define kernel_assert(cond) -/* GPU thread, block, grid size and index */ - -#define ccl_gpu_thread_idx_x (threadIdx.x) -#define ccl_gpu_block_dim_x (blockDim.x) -#define ccl_gpu_block_idx_x (blockIdx.x) -#define ccl_gpu_grid_dim_x (gridDim.x) -#define ccl_gpu_warp_size (warpSize) -#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) - -#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) -#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) - -/* GPU warp synchronization. */ - -#define ccl_gpu_syncthreads() __syncthreads() -#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) - /* GPU texture objects */ typedef unsigned long long CUtexObject; @@ -101,14 +86,14 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D typedef unsigned short half; -__device__ half __float2half(const float f) +ccl_device_forceinline half __float2half(const float f) { half val; asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); return val; } -__device__ float __half2float(const half h) +ccl_device_forceinline float __half2float(const half h) { float val; asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h)); diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h index 7af2e421378..126df74bc8c 100644 --- a/intern/cycles/kernel/device/optix/globals.h +++ b/intern/cycles/kernel/device/optix/globals.h @@ -25,6 +25,7 @@ struct KernelParamsOptiX { /* Kernel arguments */ const int *path_index_array; float *render_buffer; + int offset; /* Global scene data and textures */ KernelData data; @@ -36,7 +37,11 @@ struct KernelParamsOptiX { }; #ifdef __NVCC__ -extern "C" static __constant__ 
KernelParamsOptiX kernel_params; +extern "C" +# ifndef __CUDACC_RDC__ + static +# endif + __constant__ KernelParamsOptiX kernel_params; #endif /* Abstraction macros */ diff --git a/intern/cycles/kernel/device/optix/kernel_osl.cu b/intern/cycles/kernel/device/optix/kernel_osl.cu new file mode 100644 index 00000000000..0f3f477935b --- /dev/null +++ b/intern/cycles/kernel/device/optix/kernel_osl.cu @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#define WITH_OSL + +/* Copy of the regular OptiX kernels with additional OSL support. */ + +#include "kernel/device/optix/kernel_shader_raytrace.cu" + +#include "kernel/bake/bake.h" +#include "kernel/integrator/shade_background.h" +#include "kernel/integrator/shade_light.h" +#include "kernel/integrator/shade_shadow.h" +#include "kernel/integrator/shade_volume.h" + +extern "C" __global__ void __raygen__kernel_optix_integrator_shade_background() +{ + const int global_index = optixGetLaunchIndex().x; + const int path_index = (kernel_params.path_index_array) ? + kernel_params.path_index_array[global_index] : + global_index; + integrator_shade_background(nullptr, path_index, kernel_params.render_buffer); +} + +extern "C" __global__ void __raygen__kernel_optix_integrator_shade_light() +{ + const int global_index = optixGetLaunchIndex().x; + const int path_index = (kernel_params.path_index_array) ? + kernel_params.path_index_array[global_index] : + global_index; + integrator_shade_light(nullptr, path_index, kernel_params.render_buffer); +} + +extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface() +{ + const int global_index = optixGetLaunchIndex().x; + const int path_index = (kernel_params.path_index_array) ? 
+ kernel_params.path_index_array[global_index] : + global_index; + integrator_shade_surface(nullptr, path_index, kernel_params.render_buffer); +} + +extern "C" __global__ void __raygen__kernel_optix_integrator_shade_volume() +{ + const int global_index = optixGetLaunchIndex().x; + const int path_index = (kernel_params.path_index_array) ? + kernel_params.path_index_array[global_index] : + global_index; + integrator_shade_volume(nullptr, path_index, kernel_params.render_buffer); +} + +extern "C" __global__ void __raygen__kernel_optix_integrator_shade_shadow() +{ + const int global_index = optixGetLaunchIndex().x; + const int path_index = (kernel_params.path_index_array) ? + kernel_params.path_index_array[global_index] : + global_index; + integrator_shade_shadow(nullptr, path_index, kernel_params.render_buffer); +} + +extern "C" __global__ void __raygen__kernel_optix_shader_eval_displace() +{ + KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array; + float *const output = kernel_params.render_buffer; + const int global_index = kernel_params.offset + optixGetLaunchIndex().x; + kernel_displace_evaluate(nullptr, input, output, global_index); +} + +extern "C" __global__ void __raygen__kernel_optix_shader_eval_background() +{ + KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array; + float *const output = kernel_params.render_buffer; + const int global_index = kernel_params.offset + optixGetLaunchIndex().x; + kernel_background_evaluate(nullptr, input, output, global_index); +} + +extern "C" __global__ void __raygen__kernel_optix_shader_eval_curve_shadow_transparency() +{ + KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array; + float *const output = kernel_params.render_buffer; + const int global_index = kernel_params.offset + optixGetLaunchIndex().x; + kernel_curve_shadow_transparency_evaluate(nullptr, input, output, global_index); +} diff --git 
a/intern/cycles/kernel/integrator/displacement_shader.h b/intern/cycles/kernel/integrator/displacement_shader.h index 839dfe244ac..a6e9d674396 100644 --- a/intern/cycles/kernel/integrator/displacement_shader.h +++ b/intern/cycles/kernel/integrator/displacement_shader.h @@ -24,8 +24,8 @@ ccl_device void displacement_shader_eval(KernelGlobals kg, /* this will modify sd->P */ #ifdef __OSL__ - if (kg->osl) { - OSLShader::eval_displacement(kg, state, sd); + if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) { + osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(kg, state, sd, 0); } else #endif diff --git a/intern/cycles/kernel/integrator/init_from_bake.h b/intern/cycles/kernel/integrator/init_from_bake.h index 667ba949760..cc3fbe3fe39 100644 --- a/intern/cycles/kernel/integrator/init_from_bake.h +++ b/intern/cycles/kernel/integrator/init_from_bake.h @@ -156,6 +156,13 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg, u = v; v = 1.0f - tmp - v; + const float tmpdx = dudx; + const float tmpdy = dudy; + dudx = dvdx; + dudy = dvdy; + dvdx = -tmpdx - dvdx; + dvdy = -tmpdy - dvdy; + /* Position and normal on triangle. 
*/ const int object = kernel_data.bake.object_index; float3 P, Ng; diff --git a/intern/cycles/kernel/integrator/surface_shader.h b/intern/cycles/kernel/integrator/surface_shader.h index 6c0097b11bd..5e47a34f77e 100644 --- a/intern/cycles/kernel/integrator/surface_shader.h +++ b/intern/cycles/kernel/integrator/surface_shader.h @@ -827,13 +827,8 @@ ccl_device void surface_shader_eval(KernelGlobals kg, sd->num_closure_left = max_closures; #ifdef __OSL__ - if (kg->osl) { - if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) { - OSLShader::eval_background(kg, state, sd, path_flag); - } - else { - OSLShader::eval_surface(kg, state, sd, path_flag); - } + if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) { + osl_eval_nodes<SHADER_TYPE_SURFACE>(kg, state, sd, path_flag); } else #endif diff --git a/intern/cycles/kernel/integrator/volume_shader.h b/intern/cycles/kernel/integrator/volume_shader.h index 0ff968723a1..f9050647c6d 100644 --- a/intern/cycles/kernel/integrator/volume_shader.h +++ b/intern/cycles/kernel/integrator/volume_shader.h @@ -493,8 +493,8 @@ ccl_device_inline void volume_shader_eval(KernelGlobals kg, /* evaluate shader */ # ifdef __OSL__ - if (kg->osl) { - OSLShader::eval_volume(kg, state, sd, path_flag); + if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) { + osl_eval_nodes<SHADER_TYPE_VOLUME>(kg, state, sd, path_flag); } else # endif diff --git a/intern/cycles/kernel/osl/closures.cpp b/intern/cycles/kernel/osl/closures.cpp index d56e0551a91..6800c765345 100644 --- a/intern/cycles/kernel/osl/closures.cpp +++ b/intern/cycles/kernel/osl/closures.cpp @@ -25,13 +25,18 @@ #include "kernel/osl/osl.h" -#include "kernel/osl/closures_setup.h" - #define TO_VEC3(v) OSL::Vec3(v.x, v.y, v.z) #define TO_FLOAT3(v) make_float3(v[0], v[1], v[2]) CCL_NAMESPACE_BEGIN +static_assert(sizeof(OSLClosure) == sizeof(OSL::ClosureColor) && + sizeof(OSLClosureAdd) == sizeof(OSL::ClosureAdd) && + sizeof(OSLClosureMul) == sizeof(OSL::ClosureMul) && + 
sizeof(OSLClosureComponent) == sizeof(OSL::ClosureComponent)); +static_assert(sizeof(ShaderGlobals) == sizeof(OSL::ShaderGlobals) && + offsetof(ShaderGlobals, Ci) == offsetof(OSL::ShaderGlobals, Ci)); + /* Registration */ #define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \ @@ -60,53 +65,18 @@ void OSLRenderServices::register_closures(OSL::ShadingSystem *ss) #include "closures_template.h" } -/* Globals */ +/* Surface & Background */ -static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg, - ShaderData *sd, - const void *state, - uint32_t path_flag, - OSLThreadData *tdata) +template<> +void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg, + const void *state, + ShaderData *sd, + uint32_t path_flag) { - OSL::ShaderGlobals *globals = &tdata->globals; - - const differential3 dP = differential_from_compact(sd->Ng, sd->dP); - const differential3 dI = differential_from_compact(sd->I, sd->dI); - - /* copy from shader data to shader globals */ - globals->P = TO_VEC3(sd->P); - globals->dPdx = TO_VEC3(dP.dx); - globals->dPdy = TO_VEC3(dP.dy); - globals->I = TO_VEC3(sd->I); - globals->dIdx = TO_VEC3(dI.dx); - globals->dIdy = TO_VEC3(dI.dy); - globals->N = TO_VEC3(sd->N); - globals->Ng = TO_VEC3(sd->Ng); - globals->u = sd->u; - globals->dudx = sd->du.dx; - globals->dudy = sd->du.dy; - globals->v = sd->v; - globals->dvdx = sd->dv.dx; - globals->dvdy = sd->dv.dy; - globals->dPdu = TO_VEC3(sd->dPdu); - globals->dPdv = TO_VEC3(sd->dPdv); - globals->surfacearea = 1.0f; - globals->time = sd->time; - - /* booleans */ - globals->raytype = path_flag; - globals->flipHandedness = 0; - globals->backfacing = (sd->flag & SD_BACKFACING); - - /* shader data to be used in services callbacks */ - globals->renderstate = sd; - - /* hacky, we leave it to services to fetch actual object matrix */ - globals->shader2common = sd; - globals->object2common = sd; - - /* must be set to NULL before execute */ - globals->Ci = NULL; + /* setup shader globals from shader data */ + 
OSLThreadData *tdata = kg->osl_tdata; + shaderdata_to_shaderglobals( + kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals)); /* clear trace data */ tdata->tracedata.init = false; @@ -121,53 +91,6 @@ static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg, sd->osl_path_state = (const IntegratorStateCPU *)state; sd->osl_shadow_path_state = nullptr; } -} - -static void flatten_closure_tree(const KernelGlobalsCPU *kg, - ShaderData *sd, - uint32_t path_flag, - const OSL::ClosureColor *closure, - float3 weight = make_float3(1.0f, 1.0f, 1.0f)) -{ - /* OSL gives us a closure tree, we flatten it into arrays per - * closure type, for evaluation, sampling, etc later on. */ - - switch (closure->id) { - case OSL::ClosureColor::MUL: { - OSL::ClosureMul *mul = (OSL::ClosureMul *)closure; - flatten_closure_tree(kg, sd, path_flag, mul->closure, TO_FLOAT3(mul->weight) * weight); - break; - } - case OSL::ClosureColor::ADD: { - OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure; - flatten_closure_tree(kg, sd, path_flag, add->closureA, weight); - flatten_closure_tree(kg, sd, path_flag, add->closureB, weight); - break; - } -#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \ - case OSL_CLOSURE_##Upper##_ID: { \ - const OSL::ClosureComponent *comp = reinterpret_cast<const OSL::ClosureComponent *>(closure); \ - weight *= TO_FLOAT3(comp->w); \ - osl_closure_##lower##_setup( \ - kg, sd, path_flag, weight, reinterpret_cast<const Upper##Closure *>(comp + 1)); \ - break; \ - } -#include "closures_template.h" - default: - break; - } -} - -/* Surface */ - -void OSLShader::eval_surface(const KernelGlobalsCPU *kg, - const void *state, - ShaderData *sd, - uint32_t path_flag) -{ - /* setup shader globals from shader data */ - OSLThreadData *tdata = kg->osl_tdata; - shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata); /* execute shader for this point */ OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss; @@ -175,101 +98,99 @@ void OSLShader::eval_surface(const 
KernelGlobalsCPU *kg, OSL::ShadingContext *octx = tdata->context; int shader = sd->shader & SHADER_MASK; - /* automatic bump shader */ - if (kg->osl->bump_state[shader]) { - /* save state */ - const float3 P = sd->P; - const float dP = sd->dP; - const OSL::Vec3 dPdx = globals->dPdx; - const OSL::Vec3 dPdy = globals->dPdy; - - /* set state as if undisplaced */ - if (sd->flag & SD_HAS_DISPLACEMENT) { - float data[9]; - bool found = kg->osl->services->get_attribute(sd, - true, - OSLRenderServices::u_empty, - TypeDesc::TypeVector, - OSLRenderServices::u_geom_undisplaced, - data); - (void)found; - assert(found); - - differential3 tmp_dP; - memcpy(&sd->P, data, sizeof(float) * 3); - memcpy(&tmp_dP.dx, data + 3, sizeof(float) * 3); - memcpy(&tmp_dP.dy, data + 6, sizeof(float) * 3); - - object_position_transform(kg, sd, &sd->P); - object_dir_transform(kg, sd, &tmp_dP.dx); - object_dir_transform(kg, sd, &tmp_dP.dy); - - sd->dP = differential_make_compact(tmp_dP); - - globals->P = TO_VEC3(sd->P); - globals->dPdx = TO_VEC3(tmp_dP.dx); - globals->dPdy = TO_VEC3(tmp_dP.dy); + if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) { + /* background */ + if (kg->osl->background_state) { + ss->execute(octx, *(kg->osl->background_state), *globals); } - - /* execute bump shader */ - ss->execute(octx, *(kg->osl->bump_state[shader]), *globals); - - /* reset state */ - sd->P = P; - sd->dP = dP; - - globals->P = TO_VEC3(P); - globals->dPdx = TO_VEC3(dPdx); - globals->dPdy = TO_VEC3(dPdy); } + else { + /* automatic bump shader */ + if (kg->osl->bump_state[shader]) { + /* save state */ + const float3 P = sd->P; + const float dP = sd->dP; + const OSL::Vec3 dPdx = globals->dPdx; + const OSL::Vec3 dPdy = globals->dPdy; + + /* set state as if undisplaced */ + if (sd->flag & SD_HAS_DISPLACEMENT) { + float data[9]; + bool found = kg->osl->services->get_attribute(sd, + true, + OSLRenderServices::u_empty, + TypeDesc::TypeVector, + OSLRenderServices::u_geom_undisplaced, + data); + (void)found; + 
assert(found); + + differential3 tmp_dP; + memcpy(&sd->P, data, sizeof(float) * 3); + memcpy(&tmp_dP.dx, data + 3, sizeof(float) * 3); + memcpy(&tmp_dP.dy, data + 6, sizeof(float) * 3); + + object_position_transform(kg, sd, &sd->P); + object_dir_transform(kg, sd, &tmp_dP.dx); + object_dir_transform(kg, sd, &tmp_dP.dy); + + sd->dP = differential_make_compact(tmp_dP); + + globals->P = TO_VEC3(sd->P); + globals->dPdx = TO_VEC3(tmp_dP.dx); + globals->dPdy = TO_VEC3(tmp_dP.dy); + } + + /* execute bump shader */ + ss->execute(octx, *(kg->osl->bump_state[shader]), *globals); + + /* reset state */ + sd->P = P; + sd->dP = dP; + + globals->P = TO_VEC3(P); + globals->dPdx = TO_VEC3(dPdx); + globals->dPdy = TO_VEC3(dPdy); + } - /* surface shader */ - if (kg->osl->surface_state[shader]) { - ss->execute(octx, *(kg->osl->surface_state[shader]), *globals); + /* surface shader */ + if (kg->osl->surface_state[shader]) { + ss->execute(octx, *(kg->osl->surface_state[shader]), *globals); + } } /* flatten closure tree */ if (globals->Ci) { - flatten_closure_tree(kg, sd, path_flag, globals->Ci); + flatten_closure_tree(kg, sd, path_flag, reinterpret_cast<OSLClosure *>(globals->Ci)); } } -/* Background */ +/* Volume */ -void OSLShader::eval_background(const KernelGlobalsCPU *kg, - const void *state, - ShaderData *sd, - uint32_t path_flag) +template<> +void osl_eval_nodes<SHADER_TYPE_VOLUME>(const KernelGlobalsCPU *kg, + const void *state, + ShaderData *sd, + uint32_t path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; - shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata); + shaderdata_to_shaderglobals( + kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals)); - /* execute shader for this point */ - OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss; - OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context; + /* clear trace data */ + tdata->tracedata.init = false; - if 
(kg->osl->background_state) { - ss->execute(octx, *(kg->osl->background_state), *globals); + /* Used by render-services. */ + sd->osl_globals = kg; + if (path_flag & PATH_RAY_SHADOW) { + sd->osl_path_state = nullptr; + sd->osl_shadow_path_state = (const IntegratorShadowStateCPU *)state; } - - /* return background color immediately */ - if (globals->Ci) { - flatten_closure_tree(kg, sd, path_flag, globals->Ci); + else { + sd->osl_path_state = (const IntegratorStateCPU *)state; + sd->osl_shadow_path_state = nullptr; } -} - -/* Volume */ - -void OSLShader::eval_volume(const KernelGlobalsCPU *kg, - const void *state, - ShaderData *sd, - uint32_t path_flag) -{ - /* setup shader globals from shader data */ - OSLThreadData *tdata = kg->osl_tdata; - shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata); /* execute shader */ OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss; @@ -283,17 +204,30 @@ void OSLShader::eval_volume(const KernelGlobalsCPU *kg, /* flatten closure tree */ if (globals->Ci) { - flatten_closure_tree(kg, sd, path_flag, globals->Ci); + flatten_closure_tree(kg, sd, path_flag, reinterpret_cast<OSLClosure *>(globals->Ci)); } } /* Displacement */ -void OSLShader::eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd) +template<> +void osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(const KernelGlobalsCPU *kg, + const void *state, + ShaderData *sd, + uint32_t path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; - shaderdata_to_shaderglobals(kg, sd, state, 0, tdata); + shaderdata_to_shaderglobals( + kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals)); + + /* clear trace data */ + tdata->tracedata.init = false; + + /* Used by render-services. 
*/ + sd->osl_globals = kg; + sd->osl_path_state = (const IntegratorStateCPU *)state; + sd->osl_shadow_path_state = nullptr; /* execute shader */ OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss; diff --git a/intern/cycles/kernel/osl/closures_setup.h b/intern/cycles/kernel/osl/closures_setup.h index 96c551b9951..ceaf56ccba6 100644 --- a/intern/cycles/kernel/osl/closures_setup.h +++ b/intern/cycles/kernel/osl/closures_setup.h @@ -40,12 +40,7 @@ CCL_NAMESPACE_BEGIN const char *label; #define OSL_CLOSURE_STRUCT_END(Upper, lower) \ } \ - ; \ - ccl_device void osl_closure_##lower##_setup(KernelGlobals kg, \ - ccl_private ShaderData *sd, \ - uint32_t path_flag, \ - float3 weight, \ - ccl_private Upper##Closure *closure); + ; #define OSL_CLOSURE_STRUCT_MEMBER(Upper, TYPE, type, name, key) type name; #define OSL_CLOSURE_STRUCT_ARRAY_MEMBER(Upper, TYPE, type, name, key, size) type name[size]; @@ -210,11 +205,9 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg, bsdf->ior = closure->ior; bsdf->T = closure->T; - static OSL::ustring u_ggx("ggx"); - static OSL::ustring u_default("default"); - /* GGX */ - if (closure->distribution == u_ggx || closure->distribution == u_default) { + if (closure->distribution == make_string("ggx", 11253504724482777663ull) || + closure->distribution == make_string("default", 4430693559278735917ull)) { if (!closure->refract) { if (closure->alpha_x == closure->alpha_y) { /* Isotropic */ @@ -1000,18 +993,14 @@ ccl_device void osl_closure_bssrdf_setup(KernelGlobals kg, float3 weight, ccl_private const BSSRDFClosure *closure) { - static ustring u_burley("burley"); - static ustring u_random_walk_fixed_radius("random_walk_fixed_radius"); - static ustring u_random_walk("random_walk"); - ClosureType type; - if (closure->method == u_burley) { + if (closure->method == make_string("burley", 186330084368958868ull)) { type = CLOSURE_BSSRDF_BURLEY_ID; } - else if (closure->method == u_random_walk_fixed_radius) { + else if (closure->method == 
make_string("random_walk_fixed_radius", 5695810351010063150ull)) { type = CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID; } - else if (closure->method == u_random_walk) { + else if (closure->method == make_string("random_walk", 11360609267673527222ull)) { type = CLOSURE_BSSRDF_RANDOM_WALK_ID; } else { diff --git a/intern/cycles/kernel/osl/closures_template.h b/intern/cycles/kernel/osl/closures_template.h index c808b275966..b9e9b52dcf8 100644 --- a/intern/cycles/kernel/osl/closures_template.h +++ b/intern/cycles/kernel/osl/closures_template.h @@ -40,7 +40,7 @@ OSL_CLOSURE_STRUCT_BEGIN(Transparent, transparent) OSL_CLOSURE_STRUCT_END(Transparent, transparent) OSL_CLOSURE_STRUCT_BEGIN(Microfacet, microfacet) - OSL_CLOSURE_STRUCT_MEMBER(Microfacet, STRING, ustring, distribution, NULL) + OSL_CLOSURE_STRUCT_MEMBER(Microfacet, STRING, DeviceString, distribution, NULL) OSL_CLOSURE_STRUCT_MEMBER(Microfacet, VECTOR, packed_float3, N, NULL) OSL_CLOSURE_STRUCT_MEMBER(Microfacet, VECTOR, packed_float3, T, NULL) OSL_CLOSURE_STRUCT_MEMBER(Microfacet, FLOAT, float, alpha_x, NULL) @@ -210,7 +210,7 @@ OSL_CLOSURE_STRUCT_BEGIN(PhongRamp, phong_ramp) OSL_CLOSURE_STRUCT_END(PhongRamp, phong_ramp) OSL_CLOSURE_STRUCT_BEGIN(BSSRDF, bssrdf) - OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, STRING, ustring, method, NULL) + OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, STRING, DeviceString, method, NULL) OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, N, NULL) OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, radius, NULL) OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, albedo, NULL) diff --git a/intern/cycles/kernel/osl/osl.h b/intern/cycles/kernel/osl/osl.h index bef23f3eea1..cc5c81ad027 100644 --- a/intern/cycles/kernel/osl/osl.h +++ b/intern/cycles/kernel/osl/osl.h @@ -1,38 +1,171 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2011-2022 Blender Foundation */ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Adapted from Open Shading Language + * Copyright (c) 2009-2010 Sony Pictures 
Imageworks Inc., et al. + * All Rights Reserved. + * + * Modifications Copyright 2011-2022 Blender Foundation. */ #pragma once /* OSL Shader Engine * - * Holds all variables to execute and use OSL shaders from the kernel. These - * are initialized externally by OSLShaderManager before rendering starts. - * - * Before/after a thread starts rendering, thread_init/thread_free must be - * called, which will store any per thread OSL state in thread local storage. - * This means no thread state must be passed along in the kernel itself. + * Holds all variables to execute and use OSL shaders from the kernel. */ #include "kernel/osl/types.h" +#include "kernel/osl/closures_setup.h" + CCL_NAMESPACE_BEGIN -class OSLShader { - public: - /* eval */ - static void eval_surface(const KernelGlobalsCPU *kg, - const void *state, - ShaderData *sd, - uint32_t path_flag); - static void eval_background(const KernelGlobalsCPU *kg, - const void *state, - ShaderData *sd, - uint32_t path_flag); - static void eval_volume(const KernelGlobalsCPU *kg, - const void *state, - ShaderData *sd, - uint32_t path_flag); - static void eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd); -}; +ccl_device_inline void shaderdata_to_shaderglobals(KernelGlobals kg, + ccl_private ShaderData *sd, + uint32_t path_flag, + ccl_private ShaderGlobals *globals) +{ + const differential3 dP = differential_from_compact(sd->Ng, sd->dP); + const differential3 dI = differential_from_compact(sd->I, sd->dI); + + /* copy from shader data to shader globals */ + globals->P = sd->P; + globals->dPdx = dP.dx; + globals->dPdy = dP.dy; + globals->I = sd->I; + globals->dIdx = dI.dx; + globals->dIdy = dI.dy; + globals->N = sd->N; + globals->Ng = sd->Ng; + globals->u = sd->u; + globals->dudx = sd->du.dx; + globals->dudy = sd->du.dy; + globals->v = sd->v; + globals->dvdx = sd->dv.dx; + globals->dvdy = sd->dv.dy; + globals->dPdu = sd->dPdu; + globals->dPdv = sd->dPdv; + globals->time = sd->time; + 
globals->dtime = 1.0f; + globals->surfacearea = 1.0f; + globals->raytype = path_flag; + globals->flipHandedness = 0; + globals->backfacing = (sd->flag & SD_BACKFACING); + + /* shader data to be used in services callbacks */ + globals->renderstate = sd; + + /* hacky, we leave it to services to fetch actual object matrix */ + globals->shader2common = sd; + globals->object2common = sd; + + /* must be set to NULL before execute */ + globals->Ci = nullptr; +} + +ccl_device void flatten_closure_tree(KernelGlobals kg, + ccl_private ShaderData *sd, + uint32_t path_flag, + ccl_private const OSLClosure *closure) +{ + int stack_size = 0; + float3 weight = one_float3(); + float3 weight_stack[16]; + ccl_private const OSLClosure *closure_stack[16]; + + while (closure) { + switch (closure->id) { + case OSL_CLOSURE_MUL_ID: { + ccl_private const OSLClosureMul *mul = static_cast<ccl_private const OSLClosureMul *>( + closure); + weight *= mul->weight; + closure = mul->closure; + continue; + } + case OSL_CLOSURE_ADD_ID: { + if (stack_size >= 16) { + kernel_assert(!"Exhausted OSL closure stack"); + break; + } + ccl_private const OSLClosureAdd *add = static_cast<ccl_private const OSLClosureAdd *>( + closure); + closure = add->closureA; + weight_stack[stack_size] = weight; + closure_stack[stack_size++] = add->closureB; + continue; + } +#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \ + case OSL_CLOSURE_##Upper##_ID: { \ + ccl_private const OSLClosureComponent *comp = \ + static_cast<ccl_private const OSLClosureComponent *>(closure); \ + osl_closure_##lower##_setup(kg, \ + sd, \ + path_flag, \ + weight * comp->weight, \ + reinterpret_cast<ccl_private const Upper##Closure *>(comp + 1)); \ + break; \ + } +#include "closures_template.h" + default: + break; + } + + if (stack_size > 0) { + weight = weight_stack[--stack_size]; + closure = closure_stack[stack_size]; + } + else { + closure = nullptr; + } + } +} + +#ifndef __KERNEL_GPU__ + +template<ShaderType type> +void osl_eval_nodes(const 
KernelGlobalsCPU *kg, + const void *state, + ShaderData *sd, + uint32_t path_flag); + +#else + +template<ShaderType type, typename ConstIntegratorGenericState> +ccl_device_inline void osl_eval_nodes(KernelGlobals kg, + ConstIntegratorGenericState state, + ccl_private ShaderData *sd, + uint32_t path_flag) +{ + ShaderGlobals globals; + shaderdata_to_shaderglobals(kg, sd, path_flag, &globals); + + const int shader = sd->shader & SHADER_MASK; + +# ifdef __KERNEL_OPTIX__ + uint8_t group_data[2048]; + uint8_t closure_pool[1024]; + sd->osl_closure_pool = closure_pool; + + unsigned int optix_dc_index = 2 /* NUM_CALLABLE_PROGRAM_GROUPS */ + + (shader + type * kernel_data.max_shaders) * 2; + optixDirectCall<void>(optix_dc_index + 0, + /* shaderglobals_ptr = */ &globals, + /* groupdata_ptr = */ (void *)group_data, + /* userdata_base_ptr = */ (void *)nullptr, + /* output_base_ptr = */ (void *)nullptr, + /* shadeindex = */ 0); + optixDirectCall<void>(optix_dc_index + 1, + /* shaderglobals_ptr = */ &globals, + /* groupdata_ptr = */ (void *)group_data, + /* userdata_base_ptr = */ (void *)nullptr, + /* output_base_ptr = */ (void *)nullptr, + /* shadeindex = */ 0); +# endif + + if (globals.Ci) { + flatten_closure_tree(kg, sd, path_flag, globals.Ci); + } +} + +#endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/osl/services.cpp b/intern/cycles/kernel/osl/services.cpp index b744422ee78..3fd098de4bb 100644 --- a/intern/cycles/kernel/osl/services.cpp +++ b/intern/cycles/kernel/osl/services.cpp @@ -119,8 +119,8 @@ ustring OSLRenderServices::u_u("u"); ustring OSLRenderServices::u_v("v"); ustring OSLRenderServices::u_empty; -OSLRenderServices::OSLRenderServices(OSL::TextureSystem *texture_system) - : OSL::RendererServices(texture_system) +OSLRenderServices::OSLRenderServices(OSL::TextureSystem *texture_system, int device_type) + : OSL::RendererServices(texture_system), device_type_(device_type) { } @@ -131,6 +131,17 @@ OSLRenderServices::~OSLRenderServices() } } +int 
OSLRenderServices::supports(string_view feature) const +{ +#ifdef WITH_OPTIX + if (feature == "OptiX") { + return device_type_ == DEVICE_OPTIX; + } +#endif + + return false; +} + bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, @@ -1139,29 +1150,40 @@ TextureSystem::TextureHandle *OSLRenderServices::get_texture_handle(ustring file { OSLTextureHandleMap::iterator it = textures.find(filename); - /* For non-OIIO textures, just return a pointer to our own OSLTextureHandle. */ - if (it != textures.end()) { - if (it->second->type != OSLTextureHandle::OIIO) { - return (TextureSystem::TextureHandle *)it->second.get(); + if (device_type_ == DEVICE_CPU) { + /* For non-OIIO textures, just return a pointer to our own OSLTextureHandle. */ + if (it != textures.end()) { + if (it->second->type != OSLTextureHandle::OIIO) { + return (TextureSystem::TextureHandle *)it->second.get(); + } } - } - /* Get handle from OpenImageIO. */ - OSL::TextureSystem *ts = m_texturesys; - TextureSystem::TextureHandle *handle = ts->get_texture_handle(filename); - if (handle == NULL) { - return NULL; - } + /* Get handle from OpenImageIO. */ + OSL::TextureSystem *ts = m_texturesys; + TextureSystem::TextureHandle *handle = ts->get_texture_handle(filename); + if (handle == NULL) { + return NULL; + } + + /* Insert new OSLTextureHandle if needed. */ + if (it == textures.end()) { + textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::OIIO)); + it = textures.find(filename); + } - /* Insert new OSLTextureHandle if needed. */ - if (it == textures.end()) { - textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::OIIO)); - it = textures.find(filename); + /* Assign OIIO texture handle and return. 
*/ + it->second->oiio_handle = handle; + return (TextureSystem::TextureHandle *)it->second.get(); } + else { + if (it != textures.end() && it->second->type == OSLTextureHandle::SVM && + it->second->svm_slots[0].w == -1) { + return reinterpret_cast<TextureSystem::TextureHandle *>( + static_cast<uintptr_t>(it->second->svm_slots[0].y + 1)); + } - /* Assign OIIO texture handle and return. */ - it->second->oiio_handle = handle; - return (TextureSystem::TextureHandle *)it->second.get(); + return NULL; + } } bool OSLRenderServices::good(TextureSystem::TextureHandle *texture_handle) diff --git a/intern/cycles/kernel/osl/services.h b/intern/cycles/kernel/osl/services.h index 334b6682e34..9d875ae8e94 100644 --- a/intern/cycles/kernel/osl/services.h +++ b/intern/cycles/kernel/osl/services.h @@ -22,11 +22,8 @@ class PtexCache; CCL_NAMESPACE_BEGIN -class Object; class Scene; -class Shader; struct ShaderData; -struct float3; struct KernelGlobalsCPU; /* OSL Texture Handle @@ -73,11 +70,13 @@ typedef OIIO::unordered_map_concurrent<ustring, OSLTextureHandleRef, ustringHash class OSLRenderServices : public OSL::RendererServices { public: - OSLRenderServices(OSL::TextureSystem *texture_system); + OSLRenderServices(OSL::TextureSystem *texture_system, int device_type); ~OSLRenderServices(); static void register_closures(OSL::ShadingSystem *ss); + int supports(string_view feature) const override; + bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, @@ -324,6 +323,9 @@ class OSLRenderServices : public OSL::RendererServices { * and is required because texture handles are cached as part of the shared * shading system. 
*/ OSLTextureHandleMap textures; + + private: + int device_type_; }; CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/osl/services_gpu.h b/intern/cycles/kernel/osl/services_gpu.h new file mode 100644 index 00000000000..e6e19b8c484 --- /dev/null +++ b/intern/cycles/kernel/osl/services_gpu.h @@ -0,0 +1,2149 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Adapted from Open Shading Language + * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al. + * All Rights Reserved. + * + * Modifications Copyright 2011-2022 Blender Foundation. */ + +#include "kernel/tables.h" +#include "kernel/util/differential.h" + +#include "kernel/osl/osl.h" + +namespace DeviceStrings { + +/* "" */ +ccl_device_constant DeviceString _emptystring_ = {0ull}; +/* "NDC" */ +ccl_device_constant DeviceString u_ndc = {5148305047403260775ull}; +/* "screen" */ +ccl_device_constant DeviceString u_screen = {14159088609039777114ull}; +/* "camera" */ +ccl_device_constant DeviceString u_camera = {2159505832145726196ull}; +/* "raster" */ +ccl_device_constant DeviceString u_raster = {7759263238610201778ull}; +/* "world" */ +ccl_device_constant DeviceString u_world = {16436542438370751598ull}; +/* "common" */ +ccl_device_constant DeviceString u_common = {14645198576927606093ull}; +/* "hsv" */ +ccl_device_constant DeviceString u_hsv = {2177035556331879497ull}; +/* "hsl" */ +ccl_device_constant DeviceString u_hsl = {7749766809258288148ull}; +/* "XYZ" */ +ccl_device_constant DeviceString u_xyz = {4957977063494975483ull}; +/* "xyY" */ +ccl_device_constant DeviceString u_xyy = {5138822319725660255ull}; +/* "sRGB" */ +ccl_device_constant DeviceString u_srgb = {15368599878474175032ull}; +/* "object:location" */ +ccl_device_constant DeviceString u_object_location = {7846190347358762897ull}; +/* "object:color" */ +ccl_device_constant DeviceString u_object_color = {12695623857059169556ull}; +/* "object:alpha" */ +ccl_device_constant DeviceString u_object_alpha = {11165053919428293151ull}; +/* 
"object:index" */ +ccl_device_constant DeviceString u_object_index = {6588325838217472556ull}; +/* "geom:dupli_generated" */ +ccl_device_constant DeviceString u_geom_dupli_generated = {6715607178003388908ull}; +/* "geom:dupli_uv" */ +ccl_device_constant DeviceString u_geom_dupli_uv = {1294253317490155849ull}; +/* "material:index" */ +ccl_device_constant DeviceString u_material_index = {741770758159634623ull}; +/* "object:random" */ +ccl_device_constant DeviceString u_object_random = {15789063994977955884ull}; +/* "particle:index" */ +ccl_device_constant DeviceString u_particle_index = {9489711748229903784ull}; +/* "particle:random" */ +ccl_device_constant DeviceString u_particle_random = {17993722202766855761ull}; +/* "particle:age" */ +ccl_device_constant DeviceString u_particle_age = {7380730644710951109ull}; +/* "particle:lifetime" */ +ccl_device_constant DeviceString u_particle_lifetime = {16576828923156200061ull}; +/* "particle:location" */ +ccl_device_constant DeviceString u_particle_location = {10309536211423573010ull}; +/* "particle:rotation" */ +ccl_device_constant DeviceString u_particle_rotation = {17858543768041168459ull}; +/* "particle:size" */ +ccl_device_constant DeviceString u_particle_size = {16461524249715420389ull}; +/* "particle:velocity" */ +ccl_device_constant DeviceString u_particle_velocity = {13199101248768308863ull}; +/* "particle:angular_velocity" */ +ccl_device_constant DeviceString u_particle_angular_velocity = {16327930120486517910ull}; +/* "geom:numpolyvertices" */ +ccl_device_constant DeviceString u_geom_numpolyvertices = {382043551489988826ull}; +/* "geom:trianglevertices" */ +ccl_device_constant DeviceString u_geom_trianglevertices = {17839267571524187074ull}; +/* "geom:polyvertices" */ +ccl_device_constant DeviceString u_geom_polyvertices = {1345577201967881769ull}; +/* "geom:name" */ +ccl_device_constant DeviceString u_geom_name = {13606338128269760050ull}; +/* "geom:undisplaced" */ +ccl_device_constant DeviceString 
u_geom_undisplaced = {12431586303019276305ull}; +/* "geom:is_smooth" */ +ccl_device_constant DeviceString u_is_smooth = {857544214094480123ull}; +/* "geom:is_curve" */ +ccl_device_constant DeviceString u_is_curve = {129742495633653138ull}; +/* "geom:curve_thickness" */ +ccl_device_constant DeviceString u_curve_thickness = {10605802038397633852ull}; +/* "geom:curve_length" */ +ccl_device_constant DeviceString u_curve_length = {11423459517663715453ull}; +/* "geom:curve_tangent_normal" */ +ccl_device_constant DeviceString u_curve_tangent_normal = {12301397394034985633ull}; +/* "geom:curve_random" */ +ccl_device_constant DeviceString u_curve_random = {15293085049960492358ull}; +/* "geom:is_point" */ +ccl_device_constant DeviceString u_is_point = {2511357849436175953ull}; +/* "geom:point_radius" */ +ccl_device_constant DeviceString u_point_radius = {9956381140398668479ull}; +/* "geom:point_position" */ +ccl_device_constant DeviceString u_point_position = {15684484280742966916ull}; +/* "geom:point_random" */ +ccl_device_constant DeviceString u_point_random = {5632627207092325544ull}; +/* "geom:normal_map_normal" */ +ccl_device_constant DeviceString u_normal_map_normal = {10718948685686827073ull}; +/* "path:ray_length" */ +ccl_device_constant DeviceString u_path_ray_length = {16391985802412544524ull}; +/* "path:ray_depth" */ +ccl_device_constant DeviceString u_path_ray_depth = {16643933224879500399ull}; +/* "path:diffuse_depth" */ +ccl_device_constant DeviceString u_path_diffuse_depth = {13191651286699118408ull}; +/* "path:glossy_depth" */ +ccl_device_constant DeviceString u_path_glossy_depth = {15717768399057252940ull}; +/* "path:transparent_depth" */ +ccl_device_constant DeviceString u_path_transparent_depth = {7821650266475578543ull}; +/* "path:transmission_depth" */ +ccl_device_constant DeviceString u_path_transmission_depth = {15113408892323917624ull}; + +} // namespace DeviceStrings + +/* Closure */ + +ccl_device_extern ccl_private OSLClosure 
*osl_mul_closure_color(ccl_private ShaderGlobals *sg, + ccl_private OSLClosure *a, + ccl_private const float3 *weight) +{ + if (*weight == zero_float3() || !a) { + return nullptr; + } + else if (*weight == one_float3()) { + return a; + } + + ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate); + + ccl_private uint8_t *closure_pool = sd->osl_closure_pool; + /* Align pointer to closure struct requirement */ + closure_pool = reinterpret_cast<uint8_t *>( + (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureMul) - 1) & + (-alignof(OSLClosureMul))); + sd->osl_closure_pool = closure_pool + sizeof(OSLClosureMul); + + ccl_private OSLClosureMul *const closure = reinterpret_cast<ccl_private OSLClosureMul *>( + closure_pool); + closure->id = OSL_CLOSURE_MUL_ID; + closure->weight = *weight; + closure->closure = a; + + return closure; +} + +ccl_device_extern ccl_private OSLClosure *osl_mul_closure_float(ccl_private ShaderGlobals *sg, + ccl_private OSLClosure *a, + float weight) +{ + if (weight == 0.0f || !a) { + return nullptr; + } + else if (weight == 1.0f) { + return a; + } + + ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate); + + uint8_t *closure_pool = sd->osl_closure_pool; + /* Align pointer to closure struct requirement */ + closure_pool = reinterpret_cast<uint8_t *>( + (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureMul) - 1) & + (-alignof(OSLClosureMul))); + sd->osl_closure_pool = closure_pool + sizeof(OSLClosureMul); + + ccl_private OSLClosureMul *const closure = reinterpret_cast<ccl_private OSLClosureMul *>( + closure_pool); + closure->id = OSL_CLOSURE_MUL_ID; + closure->weight = make_float3(weight, weight, weight); + closure->closure = a; + + return closure; +} + +ccl_device_extern ccl_private OSLClosure *osl_add_closure_closure(ccl_private ShaderGlobals *sg, + ccl_private OSLClosure *a, + ccl_private OSLClosure *b) +{ + if (!a) { + return b; + } + if (!b) { + return a; 
+ } + + ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate); + + ccl_private uint8_t *closure_pool = sd->osl_closure_pool; + /* Align pointer to closure struct requirement */ + closure_pool = reinterpret_cast<uint8_t *>( + (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureAdd) - 1) & + (-alignof(OSLClosureAdd))); + sd->osl_closure_pool = closure_pool + sizeof(OSLClosureAdd); + + ccl_private OSLClosureAdd *const closure = reinterpret_cast<ccl_private OSLClosureAdd *>( + closure_pool); + closure->id = OSL_CLOSURE_ADD_ID; + closure->closureA = a; + closure->closureB = b; + + return closure; +} + +ccl_device_extern ccl_private OSLClosure *osl_allocate_closure_component( + ccl_private ShaderGlobals *sg, int id, int size) +{ + ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate); + + ccl_private uint8_t *closure_pool = sd->osl_closure_pool; + /* Align pointer to closure struct requirement */ + closure_pool = reinterpret_cast<uint8_t *>( + (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureComponent) - 1) & + (-alignof(OSLClosureComponent))); + sd->osl_closure_pool = closure_pool + sizeof(OSLClosureComponent) + size; + + ccl_private OSLClosureComponent *const closure = + reinterpret_cast<ccl_private OSLClosureComponent *>(closure_pool); + closure->id = static_cast<OSLClosureType>(id); + closure->weight = one_float3(); + + return closure; +} + +ccl_device_extern ccl_private OSLClosure *osl_allocate_weighted_closure_component( + ccl_private ShaderGlobals *sg, int id, int size, ccl_private const float3 *weight) +{ + ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate); + + ccl_private uint8_t *closure_pool = sd->osl_closure_pool; + /* Align pointer to closure struct requirement */ + closure_pool = reinterpret_cast<uint8_t *>( + (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureComponent) - 1) & + (-alignof(OSLClosureComponent))); + 
sd->osl_closure_pool = closure_pool + sizeof(OSLClosureComponent) + size; + + ccl_private OSLClosureComponent *const closure = + reinterpret_cast<ccl_private OSLClosureComponent *>(closure_pool); + closure->id = static_cast<OSLClosureType>(id); + closure->weight = *weight; + + return closure; +} + +/* Utilities */ + +#include "kernel/svm/math_util.h" +#include "kernel/util/color.h" + +ccl_device_extern void osl_error(ccl_private ShaderGlobals *sg, const char *format, void *args) +{ +} + +ccl_device_extern void osl_printf(ccl_private ShaderGlobals *sg, const char *format, void *args) +{ +} + +ccl_device_extern void osl_warning(ccl_private ShaderGlobals *sg, const char *format, void *args) +{ +} + +ccl_device_extern uint osl_range_check(int indexvalue, + int length, + DeviceString symname, + ccl_private ShaderGlobals *sg, + DeviceString sourcefile, + int sourceline, + DeviceString groupname, + int layer, + DeviceString layername, + DeviceString shadername) +{ + const int result = indexvalue < 0 ? 0 : indexvalue >= length ? 
length - 1 : indexvalue; +#if 0 + if (result != indexvalue) { + printf("Index [%d] out of range\n", indexvalue); + } +#endif + return result; +} + +ccl_device_extern uint osl_range_check_err(int indexvalue, + int length, + DeviceString symname, + ccl_private ShaderGlobals *sg, + DeviceString sourcefile, + int sourceline, + DeviceString groupname, + int layer, + DeviceString layername, + DeviceString shadername) +{ + return osl_range_check(indexvalue, + length, + symname, + sg, + sourcefile, + sourceline, + groupname, + layer, + layername, + shadername); +} + +/* Color Utilities */ + +ccl_device_extern void osl_blackbody_vf(ccl_private ShaderGlobals *sg, + ccl_private float3 *result, + float temperature) +{ + float3 color_rgb = rec709_to_rgb(nullptr, svm_math_blackbody_color_rec709(temperature)); + color_rgb = max(color_rgb, zero_float3()); + *result = color_rgb; +} + +#if 0 +ccl_device_extern void osl_wavelength_color_vf(ccl_private ShaderGlobals *sg, + ccl_private float3 *result, + float wavelength) +{ +} +#endif + +ccl_device_extern void osl_luminance_fv(ccl_private ShaderGlobals *sg, + ccl_private float *result, + ccl_private float3 *color) +{ + *result = linear_rgb_to_gray(nullptr, *color); +} + +ccl_device_extern void osl_luminance_dfdv(ccl_private ShaderGlobals *sg, + ccl_private float *result, + ccl_private float3 *color) +{ + for (int i = 0; i < 3; ++i) { + osl_luminance_fv(sg, result + i, color + i); + } +} + +ccl_device_extern void osl_prepend_color_from(ccl_private ShaderGlobals *sg, + ccl_private float3 *res, + DeviceString from) +{ + if (from == DeviceStrings::u_hsv) { + *res = hsv_to_rgb(*res); + } + else if (from == DeviceStrings::u_hsl) { + *res = hsl_to_rgb(*res); + } + else if (from == DeviceStrings::u_xyz) { + *res = xyz_to_rgb(nullptr, *res); + } + else if (from == DeviceStrings::u_xyy) { + *res = xyz_to_rgb(nullptr, xyY_to_xyz(res->x, res->y, res->z)); + } +} + +ccl_device_extern bool osl_transformc(ccl_private ShaderGlobals *sg, + ccl_private 
float3 *c_in, + int c_in_derivs, + ccl_private float3 *c_out, + int c_out_derivs, + DeviceString from, + DeviceString to) +{ + if (!c_out_derivs) { + c_in_derivs = false; + } + else if (!c_in_derivs) { + c_out[1] = zero_float3(); + c_out[2] = zero_float3(); + } + + float3 rgb; + + for (int i = 0; i < (c_in_derivs ? 3 : 1); ++i) { + if (from == DeviceStrings::u_hsv) { + rgb = hsv_to_rgb(c_in[i]); + } + else if (from == DeviceStrings::u_hsl) { + rgb = hsl_to_rgb(c_in[i]); + } + else if (from == DeviceStrings::u_xyz) { + rgb = xyz_to_rgb(nullptr, c_in[i]); + } + else if (from == DeviceStrings::u_xyy) { + rgb = xyz_to_rgb(nullptr, xyY_to_xyz(c_in[i].x, c_in[i].y, c_in[i].z)); + } + else if (from == DeviceStrings::u_srgb) { + rgb = color_srgb_to_linear_v3(c_in[i]); + } + else { + rgb = c_in[i]; + } + + if (to == DeviceStrings::u_hsv) { + c_out[i] = rgb_to_hsv(rgb); + } + else if (to == DeviceStrings::u_hsl) { + c_out[i] = rgb_to_hsl(rgb); + } +#if 0 + else if (to == DeviceStrings::u_xyz) { + c_out[i] = rgb_to_xyz(nullptr, rgb); + } + else if (to == DeviceStrings::u_xyy) { + c_out[i] = xyz_to_xyY(rgb_to_xyz(nullptr, rgb)); + } +#endif + else if (to == DeviceStrings::u_srgb) { + c_out[i] = color_linear_to_srgb_v3(rgb); + } + else { + c_out[i] = rgb; + } + } + + /* Unrecognized color space names fall through to a plain copy above, so the conversion never fails. + * Returning explicitly avoids flowing off the end of a function declared to return bool. */ + return true; +} + +/* Matrix Utilities */ + +#include "util/transform.h" + +ccl_device_forceinline void copy_matrix(ccl_private float *res, const Transform &tfm) +{ + res[0] = tfm.x.x; + res[1] = tfm.y.x; + res[2] = tfm.z.x; + res[3] = 0.0f; + res[4] = tfm.x.y; + res[5] = tfm.y.y; + res[6] = tfm.z.y; + res[7] = 0.0f; + res[8] = tfm.x.z; + res[9] = tfm.y.z; + res[10] = tfm.z.z; + res[11] = 0.0f; + res[12] = tfm.x.w; + res[13] = tfm.y.w; + res[14] = tfm.z.w; + res[15] = 1.0f; +} +ccl_device_forceinline void copy_matrix(ccl_private float *res, const ProjectionTransform &tfm) +{ + res[0] = tfm.x.x; + res[1] = tfm.y.x; + res[2] = tfm.z.x; + res[3] = tfm.w.x; + res[4] = tfm.x.y; + res[5] = tfm.y.y; + res[6] = tfm.z.y; + res[7] = tfm.w.y; 
+ res[8] = tfm.x.z; + res[9] = tfm.y.z; + res[10] = tfm.z.z; + res[11] = tfm.w.z; + res[12] = tfm.x.w; + res[13] = tfm.y.w; + res[14] = tfm.z.w; + res[15] = tfm.w.w; +} +ccl_device_forceinline void copy_identity_matrix(ccl_private float *res) +{ + res[0] = 1.0f; + res[1] = 0.0f; + res[2] = 0.0f; + res[3] = 0.0f; + res[4] = 0.0f; + res[5] = 1.0f; + res[6] = 0.0f; + res[7] = 0.0f; + res[8] = 0.0f; + res[9] = 0.0f; + res[10] = 1.0f; + res[11] = 0.0f; + res[12] = 0.0f; + res[13] = 0.0f; + res[14] = 0.0f; + res[15] = 1.0f; +} +ccl_device_forceinline Transform convert_transform(ccl_private const float *m) +{ + return make_transform( + m[0], m[4], m[8], m[12], m[1], m[5], m[9], m[13], m[2], m[6], m[10], m[14]); +} + +ccl_device_extern void osl_mul_mmm(ccl_private float *res, + ccl_private const float *a, + ccl_private const float *b) +{ + const Transform tfm_a = convert_transform(a); + const Transform tfm_b = convert_transform(b); + copy_matrix(res, tfm_a * tfm_b); +} + +ccl_device_extern void osl_mul_mmf(ccl_private float *res, ccl_private const float *a, float b) +{ + for (int i = 0; i < 16; ++i) { + res[i] = a[i] * b; + } +} + +ccl_device_extern void osl_div_mmm(ccl_private float *res, + ccl_private const float *a, + ccl_private const float *b) +{ + const Transform tfm_a = convert_transform(a); + const Transform tfm_b = convert_transform(b); + copy_matrix(res, tfm_a * transform_inverse(tfm_b)); +} + +ccl_device_extern void osl_div_mmf(ccl_private float *res, ccl_private const float *a, float b) +{ + for (int i = 0; i < 16; ++i) { + res[i] = a[i] / b; + } +} + +ccl_device_extern void osl_div_mfm(ccl_private float *res, float a, ccl_private const float *b) +{ + const Transform tfm_b = convert_transform(b); + copy_matrix(res, transform_inverse(tfm_b)); + for (int i = 0; i < 16; ++i) { + res[i] *= a; + } +} + +ccl_device_extern void osl_div_m_ff(ccl_private float *res, float a, float b) +{ + float f = (b == 0) ? 
0.0f : (a / b); + res[0] = f; + res[1] = 0.0f; + res[2] = 0.0f; + res[3] = 0.0f; + res[4] = 0.0f; + res[5] = f; + res[6] = 0.0f; + res[7] = 0.0f; + res[8] = 0.0f; + res[9] = 0.0f; + res[10] = f; + res[11] = 0.0f; + res[12] = 0.0f; + res[13] = 0.0f; + res[14] = 0.0f; + res[15] = f; +} + +ccl_device_extern void osl_transform_vmv(ccl_private float3 *res, + ccl_private const float *m, + ccl_private const float3 *v) +{ + const Transform tfm_m = convert_transform(m); + *res = transform_point(&tfm_m, *v); +} + +/* Transform a point with derivatives: `v` and `res` each hold three float3 (value, dx, dy). */ +ccl_device_extern void osl_transform_dvmdv(ccl_private float3 *res, + ccl_private const float *m, + ccl_private const float3 *v) +{ + /* `m` is a single 16-float matrix (osl_transform_triple passes a float[16]), so it must not be + * indexed per component. The value is transformed as a point; its derivatives are transformed as + * directions because the constant translation does not contribute to them. */ + const Transform tfm_m = convert_transform(m); + res[0] = transform_point(&tfm_m, v[0]); + res[1] = transform_direction(&tfm_m, v[1]); + res[2] = transform_direction(&tfm_m, v[2]); +} + +ccl_device_extern void osl_transformv_vmv(ccl_private float3 *res, + ccl_private const float *m, + ccl_private const float3 *v) +{ + const Transform tfm_m = convert_transform(m); + *res = transform_direction(&tfm_m, *v); +} + +/* Transform a vector with derivatives: `v` and `res` each hold three float3 (value, dx, dy). */ +ccl_device_extern void osl_transformv_dvmdv(ccl_private float3 *res, + ccl_private const float *m, + ccl_private const float3 *v) +{ + /* One matrix is shared by the value and both derivatives; reading `m + i * 16` would run past the + * 16 floats the caller provides. */ + const Transform tfm_m = convert_transform(m); + for (int i = 0; i < 3; ++i) { + res[i] = transform_direction(&tfm_m, v[i]); + } +} + +ccl_device_extern void osl_transformn_vmv(ccl_private float3 *res, + ccl_private const float *m, + ccl_private const float3 *v) +{ + const Transform tfm_m = convert_transform(m); + *res = transform_direction(&tfm_m, *v); +} + +/* Transform a normal with derivatives: `v` and `res` each hold three float3 (value, dx, dy). */ +ccl_device_extern void osl_transformn_dvmdv(ccl_private float3 *res, + ccl_private const float *m, + ccl_private const float3 *v) +{ + /* One matrix is shared by the value and both derivatives; reading `m + i * 16` would run past the + * 16 floats the caller provides. */ + const Transform tfm_m = convert_transform(m); + for (int i = 0; i < 3; ++i) { + res[i] = transform_direction(&tfm_m, v[i]); + } +} + +ccl_device_extern bool osl_get_matrix(ccl_private ShaderGlobals *sg, + ccl_private float *result, + DeviceString from) +{ + if (from == DeviceStrings::u_ndc) { + copy_matrix(result, 
kernel_data.cam.ndctoworld); + return true; + } + if (from == DeviceStrings::u_raster) { + copy_matrix(result, kernel_data.cam.rastertoworld); + return true; + } + if (from == DeviceStrings::u_screen) { + copy_matrix(result, kernel_data.cam.screentoworld); + return true; + } + if (from == DeviceStrings::u_camera) { + copy_matrix(result, kernel_data.cam.cameratoworld); + return true; + } + if (from == DeviceStrings::u_world) { + copy_identity_matrix(result); + return true; + } + + return false; +} + +ccl_device_extern bool osl_get_inverse_matrix(ccl_private ShaderGlobals *sg, + ccl_private float *res, + DeviceString to) +{ + if (to == DeviceStrings::u_ndc) { + copy_matrix(res, kernel_data.cam.worldtondc); + return true; + } + if (to == DeviceStrings::u_raster) { + copy_matrix(res, kernel_data.cam.worldtoraster); + return true; + } + if (to == DeviceStrings::u_screen) { + copy_matrix(res, kernel_data.cam.worldtoscreen); + return true; + } + if (to == DeviceStrings::u_camera) { + copy_matrix(res, kernel_data.cam.worldtocamera); + return true; + } + if (to == DeviceStrings::u_world) { + copy_identity_matrix(res); + return true; + } + + return false; +} + +ccl_device_extern bool osl_get_from_to_matrix(ccl_private ShaderGlobals *sg, + ccl_private float *res, + DeviceString from, + DeviceString to) +{ + float m_from[16], m_to[16]; + if (osl_get_matrix(sg, m_from, from) && osl_get_inverse_matrix(sg, m_to, to)) { + osl_mul_mmm(res, m_from, m_to); + return true; + } + + return false; +} + +ccl_device_extern void osl_prepend_matrix_from(ccl_private ShaderGlobals *sg, + ccl_private float *res, + DeviceString from) +{ + float m[16]; + if (osl_get_matrix(sg, m, from)) { + osl_mul_mmm(res, m, res); + } +} + +ccl_device_extern bool osl_transform_triple(ccl_private ShaderGlobals *sg, + ccl_private float3 *p_in, + int p_in_derivs, + ccl_private float3 *p_out, + int p_out_derivs, + DeviceString from, + DeviceString to, + int vectype) +{ + if (!p_out_derivs) { + p_in_derivs = false; + 
} + else if (!p_in_derivs) { + p_out[1] = zero_float3(); + p_out[2] = zero_float3(); + } + + bool res; + float m[16]; + + if (from == DeviceStrings::u_common) { + res = osl_get_inverse_matrix(sg, m, to); + } + else if (to == DeviceStrings::u_common) { + res = osl_get_matrix(sg, m, from); + } + else { + res = osl_get_from_to_matrix(sg, m, from, to); + } + + if (res) { + if (vectype == 2 /* TypeDesc::POINT */) { + if (p_in_derivs) + osl_transform_dvmdv(p_out, m, p_in); + else + osl_transform_vmv(p_out, m, p_in); + } + else if (vectype == 3 /* TypeDesc::VECTOR */) { + if (p_in_derivs) + osl_transformv_dvmdv(p_out, m, p_in); + else + osl_transformv_vmv(p_out, m, p_in); + } + else if (vectype == 4 /* TypeDesc::NORMAL */) { + if (p_in_derivs) + osl_transformn_dvmdv(p_out, m, p_in); + else + osl_transformn_vmv(p_out, m, p_in); + } + else { + res = false; + } + } + else { + p_out[0] = p_in[0]; + if (p_in_derivs) { + p_out[1] = p_in[1]; + p_out[2] = p_in[2]; + } + } + + return res; +} + +ccl_device_extern bool osl_transform_triple_nonlinear(ccl_private ShaderGlobals *sg, + ccl_private float3 *p_in, + int p_in_derivs, + ccl_private float3 *p_out, + int p_out_derivs, + DeviceString from, + DeviceString to, + int vectype) +{ + return osl_transform_triple(sg, p_in, p_in_derivs, p_out, p_out_derivs, from, to, vectype); +} + +ccl_device_extern void osl_transpose_mm(ccl_private float *res, ccl_private const float *m) +{ + copy_matrix(res, *reinterpret_cast<ccl_private const ProjectionTransform *>(m)); +} + +#if 0 +ccl_device_extern float osl_determinant_fm(ccl_private const float *m) +{ +} +#endif + +/* Attributes */ + +#include "kernel/geom/geom.h" + +typedef long long TypeDesc; + +ccl_device_inline bool set_attribute_float(ccl_private float fval[3], + TypeDesc type, + bool derivatives, + ccl_private void *val) +{ + const unsigned char type_basetype = type & 0xF; + const unsigned char type_aggregate = (type >> 8) & 0xF; + const int type_arraylen = type >> 32; + + if 
(type_basetype == 11 /* TypeDesc::FLOAT */) { + if ((type_aggregate == 2 /* TypeDesc::VEC2 */) || + (type_aggregate == 1 && type_arraylen == 2)) { + for (int i = 0; i < (derivatives ? 3 : 1); ++i) { + static_cast<ccl_private float *>(val)[i * 2 + 0] = fval[i]; + static_cast<ccl_private float *>(val)[i * 2 + 1] = fval[i]; + } + return true; + } + if ((type_aggregate == 3 /* TypeDesc::VEC3 */) || + (type_aggregate == 1 && type_arraylen == 3)) { + for (int i = 0; i < (derivatives ? 3 : 1); ++i) { + static_cast<ccl_private float *>(val)[i * 3 + 0] = fval[i]; + static_cast<ccl_private float *>(val)[i * 3 + 1] = fval[i]; + static_cast<ccl_private float *>(val)[i * 3 + 2] = fval[i]; + } + return true; + } + if ((type_aggregate == 4 /* TypeDesc::VEC4 */) || + (type_aggregate == 1 && type_arraylen == 4)) { + for (int i = 0; i < (derivatives ? 3 : 1); ++i) { + static_cast<ccl_private float *>(val)[i * 4 + 0] = fval[i]; + static_cast<ccl_private float *>(val)[i * 4 + 1] = fval[i]; + static_cast<ccl_private float *>(val)[i * 4 + 2] = fval[i]; + static_cast<ccl_private float *>(val)[i * 4 + 3] = 1.0f; + } + return true; + } + if ((type_aggregate == 1 /* TypeDesc::SCALAR */)) { + for (int i = 0; i < (derivatives ? 
3 : 1); ++i) { + static_cast<ccl_private float *>(val)[i] = fval[i]; + } + return true; + } + } + + return false; +} +ccl_device_inline bool set_attribute_float(float f, + TypeDesc type, + bool derivatives, + ccl_private void *val) +{ + float fv[3]; + + fv[0] = f; + fv[1] = 0.0f; + fv[2] = 0.0f; + + return set_attribute_float(fv, type, derivatives, val); +} +ccl_device_inline bool set_attribute_float2(ccl_private float2 fval[3], + TypeDesc type, + bool derivatives, + ccl_private void *val) +{ + const unsigned char type_basetype = type & 0xF; + const unsigned char type_aggregate = (type >> 8) & 0xF; + const int type_arraylen = type >> 32; + + if (type_basetype == 11 /* TypeDesc::FLOAT */) { + if ((type_aggregate == 2 /* TypeDesc::VEC2 */) || + (type_aggregate == 1 && type_arraylen == 2)) { + for (int i = 0; i < (derivatives ? 3 : 1); ++i) { + static_cast<ccl_private float *>(val)[i * 2 + 0] = fval[i].x; + static_cast<ccl_private float *>(val)[i * 2 + 1] = fval[i].y; + } + return true; + } + if ((type_aggregate == 3 /* TypeDesc::VEC3 */) || + (type_aggregate == 1 && type_arraylen == 3)) { + for (int i = 0; i < (derivatives ? 3 : 1); ++i) { + static_cast<ccl_private float *>(val)[i * 3 + 0] = fval[i].x; + static_cast<ccl_private float *>(val)[i * 3 + 1] = fval[i].y; + static_cast<ccl_private float *>(val)[i * 3 + 2] = 0.0f; + } + return true; + } + if ((type_aggregate == 4 /* TypeDesc::VEC4 */) || + (type_aggregate == 1 && type_arraylen == 4)) { + for (int i = 0; i < (derivatives ? 3 : 1); ++i) { + static_cast<ccl_private float *>(val)[i * 4 + 0] = fval[i].x; + static_cast<ccl_private float *>(val)[i * 4 + 1] = fval[i].y; + static_cast<ccl_private float *>(val)[i * 4 + 2] = 0.0f; + static_cast<ccl_private float *>(val)[i * 4 + 3] = 1.0f; + } + return true; + } + if ((type_aggregate == 1 /* TypeDesc::SCALAR */)) { + for (int i = 0; i < (derivatives ? 
3 : 1); ++i) { + static_cast<ccl_private float *>(val)[i] = fval[i].x; + } + return true; + } + } + + return false; +} +ccl_device_inline bool set_attribute_float3(ccl_private float3 fval[3], + TypeDesc type, + bool derivatives, + ccl_private void *val) +{ + const unsigned char type_basetype = type & 0xF; + const unsigned char type_aggregate = (type >> 8) & 0xF; + const int type_arraylen = type >> 32; + + if (type_basetype == 11 /* TypeDesc::FLOAT */) { + if ((type_aggregate == 3 /* TypeDesc::VEC3 */) || + (type_aggregate == 1 && type_arraylen == 3)) { + for (int i = 0; i < (derivatives ? 3 : 1); ++i) { + static_cast<ccl_private float *>(val)[i * 3 + 0] = fval[i].x; + static_cast<ccl_private float *>(val)[i * 3 + 1] = fval[i].y; + static_cast<ccl_private float *>(val)[i * 3 + 2] = fval[i].z; + } + return true; + } + if ((type_aggregate == 4 /* TypeDesc::VEC4 */) || + (type_aggregate == 1 && type_arraylen == 4)) { + for (int i = 0; i < (derivatives ? 3 : 1); ++i) { + static_cast<ccl_private float *>(val)[i * 4 + 0] = fval[i].x; + static_cast<ccl_private float *>(val)[i * 4 + 1] = fval[i].y; + static_cast<ccl_private float *>(val)[i * 4 + 2] = fval[i].z; + static_cast<ccl_private float *>(val)[i * 4 + 3] = 1.0f; + } + return true; + } + if ((type_aggregate == 1 /* TypeDesc::SCALAR */)) { + for (int i = 0; i < (derivatives ? 
3 : 1); ++i) { + static_cast<ccl_private float *>(val)[i] = average(fval[i]); + } + return true; + } + } + + return false; +} +ccl_device_inline bool set_attribute_float3(float3 f, + TypeDesc type, + bool derivatives, + ccl_private void *val) +{ + float3 fv[3]; + + fv[0] = f; + fv[1] = make_float3(0.0f, 0.0f, 0.0f); + fv[2] = make_float3(0.0f, 0.0f, 0.0f); + + return set_attribute_float3(fv, type, derivatives, val); +} +ccl_device_inline bool set_attribute_float4(ccl_private float4 fval[3], + TypeDesc type, + bool derivatives, + ccl_private void *val) +{ + const unsigned char type_basetype = type & 0xF; + const unsigned char type_aggregate = (type >> 8) & 0xF; + const int type_arraylen = type >> 32; + + if (type_basetype == 11 /* TypeDesc::FLOAT */) { + if ((type_aggregate == 3 /* TypeDesc::VEC3 */) || + (type_aggregate == 1 && type_arraylen == 3)) { + for (int i = 0; i < (derivatives ? 3 : 1); ++i) { + static_cast<ccl_private float *>(val)[i * 3 + 0] = fval[i].x; + static_cast<ccl_private float *>(val)[i * 3 + 1] = fval[i].y; + static_cast<ccl_private float *>(val)[i * 3 + 2] = fval[i].z; + } + return true; + } + if ((type_aggregate == 4 /* TypeDesc::VEC4 */) || + (type_aggregate == 1 && type_arraylen == 4)) { + for (int i = 0; i < (derivatives ? 3 : 1); ++i) { + static_cast<ccl_private float *>(val)[i * 4 + 0] = fval[i].x; + static_cast<ccl_private float *>(val)[i * 4 + 1] = fval[i].y; + static_cast<ccl_private float *>(val)[i * 4 + 2] = fval[i].z; + static_cast<ccl_private float *>(val)[i * 4 + 3] = fval[i].w; + } + return true; + } + if ((type_aggregate == 1 /* TypeDesc::SCALAR */)) { + for (int i = 0; i < (derivatives ? 
3 : 1); ++i) { + static_cast<ccl_private float *>(val)[i] = average(float4_to_float3(fval[i])); + } + return true; + } + } + + return false; +} +/* Write `tfm` to `val` as a 4x4 float matrix when `type` describes one; returns false otherwise. */ +ccl_device_inline bool set_attribute_matrix(ccl_private const Transform &tfm, + TypeDesc type, + ccl_private void *val) +{ + const unsigned char type_basetype = type & 0xF; + /* Keep the whole aggregate byte: masking with 0xF would turn MATRIX44 (16) into 0 and make the + * comparison below impossible to satisfy. */ + const unsigned char type_aggregate = (type >> 8) & 0xFF; + + if (type_basetype == 11 /* TypeDesc::FLOAT */ && type_aggregate == 16 /* TypeDesc::MATRIX44 */) { + copy_matrix(static_cast<ccl_private float *>(val), tfm); + return true; + } + + return false; +} + +ccl_device_inline bool get_background_attribute(KernelGlobals kg, + ccl_private ShaderData *sd, + DeviceString name, + TypeDesc type, + bool derivatives, + ccl_private void *val) +{ + if (name == DeviceStrings::u_path_ray_length) { + /* Ray Length */ + float f = sd->ray_length; + return set_attribute_float(f, type, derivatives, val); + } + + return false; +} + +/* Look up the attribute described by `desc` and store it in `val` converted to `type`. + * Returns false when the attribute type or the requested conversion is unsupported. */ +ccl_device_inline bool get_object_attribute(KernelGlobals kg, + ccl_private ShaderData *sd, + const AttributeDescriptor &desc, + TypeDesc type, + bool derivatives, + ccl_private void *val) +{ + if (desc.type == NODE_ATTR_FLOAT) { + float fval[3]; +#ifdef __VOLUME__ + if (primitive_is_volume_attribute(sd, desc)) { + fval[0] = primitive_volume_attribute_float(kg, sd, desc); + /* The volume lookup only yields a value; zero the derivative slots so they are never read + * uninitialized when `derivatives` is set. */ + fval[1] = 0.0f; + fval[2] = 0.0f; + } + else +#endif + fval[0] = primitive_surface_attribute_float( + kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr); + return set_attribute_float(fval, type, derivatives, val); + } + else if (desc.type == NODE_ATTR_FLOAT2) { + float2 fval[3]; +#ifdef __VOLUME__ + if (primitive_is_volume_attribute(sd, desc)) + return false; + else +#endif + fval[0] = primitive_surface_attribute_float2( + kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? 
&fval[2] : nullptr); + return set_attribute_float2(fval, type, derivatives, val); + } + else if (desc.type == NODE_ATTR_FLOAT3) { + float3 fval[3]; +#ifdef __VOLUME__ + if (primitive_is_volume_attribute(sd, desc)) { + fval[0] = primitive_volume_attribute_float3(kg, sd, desc); + /* No derivatives from the volume lookup; zero them instead of leaving them uninitialized. */ + fval[1] = zero_float3(); + fval[2] = zero_float3(); + } + else +#endif + fval[0] = primitive_surface_attribute_float3( + kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr); + return set_attribute_float3(fval, type, derivatives, val); + } + else if (desc.type == NODE_ATTR_FLOAT4 || desc.type == NODE_ATTR_RGBA) { + float4 fval[3]; +#ifdef __VOLUME__ + if (primitive_is_volume_attribute(sd, desc)) { + fval[0] = primitive_volume_attribute_float4(kg, sd, desc); + /* No derivatives from the volume lookup; zero them instead of leaving them uninitialized. */ + fval[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + fval[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + else +#endif + fval[0] = primitive_surface_attribute_float4( + kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr); + return set_attribute_float4(fval, type, derivatives, val); + } + else if (desc.type == NODE_ATTR_MATRIX) { + Transform tfm = primitive_attribute_matrix(kg, desc); + return set_attribute_matrix(tfm, type, val); + } + + return false; +} + +ccl_device_inline bool get_object_standard_attribute(KernelGlobals kg, + ccl_private ShaderData *sd, + DeviceString name, + TypeDesc type, + bool derivatives, + ccl_private void *val) +{ + /* Object attributes */ + if (name == DeviceStrings::u_object_location) { + float3 f = object_location(kg, sd); + return set_attribute_float3(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_object_color) { + float3 f = object_color(kg, sd->object); + return set_attribute_float3(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_object_alpha) { + float f = object_alpha(kg, sd->object); + return set_attribute_float(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_object_index) { + float f = object_pass_id(kg, sd->object); + return set_attribute_float(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_geom_dupli_generated) { + float3 f 
= object_dupli_generated(kg, sd->object); + return set_attribute_float3(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_geom_dupli_uv) { + float3 f = object_dupli_uv(kg, sd->object); + return set_attribute_float3(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_material_index) { + float f = shader_pass_id(kg, sd); + return set_attribute_float(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_object_random) { + float f = object_random_number(kg, sd->object); + return set_attribute_float(f, type, derivatives, val); + } + + /* Particle attributes */ + else if (name == DeviceStrings::u_particle_index) { + int particle_id = object_particle_id(kg, sd->object); + float f = particle_index(kg, particle_id); + return set_attribute_float(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_particle_random) { + int particle_id = object_particle_id(kg, sd->object); + float f = hash_uint2_to_float(particle_index(kg, particle_id), 0); + return set_attribute_float(f, type, derivatives, val); + } + + else if (name == DeviceStrings::u_particle_age) { + int particle_id = object_particle_id(kg, sd->object); + float f = particle_age(kg, particle_id); + return set_attribute_float(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_particle_lifetime) { + int particle_id = object_particle_id(kg, sd->object); + float f = particle_lifetime(kg, particle_id); + return set_attribute_float(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_particle_location) { + int particle_id = object_particle_id(kg, sd->object); + float3 f = particle_location(kg, particle_id); + return set_attribute_float3(f, type, derivatives, val); + } +#if 0 /* unsupported */ + else if (name == DeviceStrings::u_particle_rotation) { + int particle_id = object_particle_id(kg, sd->object); + float4 f = particle_rotation(kg, particle_id); + return set_attribute_float4(f, type, derivatives, val); + } +#endif + else if (name 
== DeviceStrings::u_particle_size) { + int particle_id = object_particle_id(kg, sd->object); + float f = particle_size(kg, particle_id); + return set_attribute_float(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_particle_velocity) { + int particle_id = object_particle_id(kg, sd->object); + float3 f = particle_velocity(kg, particle_id); + return set_attribute_float3(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_particle_angular_velocity) { + int particle_id = object_particle_id(kg, sd->object); + float3 f = particle_angular_velocity(kg, particle_id); + return set_attribute_float3(f, type, derivatives, val); + } + + /* Geometry attributes */ +#if 0 /* TODO */ + else if (name == DeviceStrings::u_geom_numpolyvertices) { + return false; + } + else if (name == DeviceStrings::u_geom_trianglevertices || + name == DeviceStrings::u_geom_polyvertices) { + return false; + } + else if (name == DeviceStrings::u_geom_name) { + return false; + } +#endif + else if (name == DeviceStrings::u_is_smooth) { + float f = ((sd->shader & SHADER_SMOOTH_NORMAL) != 0); + return set_attribute_float(f, type, derivatives, val); + } + +#ifdef __HAIR__ + /* Hair attributes */ + else if (name == DeviceStrings::u_is_curve) { + float f = (sd->type & PRIMITIVE_CURVE) != 0; + return set_attribute_float(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_curve_thickness) { + float f = curve_thickness(kg, sd); + return set_attribute_float(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_curve_tangent_normal) { + float3 f = curve_tangent_normal(kg, sd); + return set_attribute_float3(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_curve_random) { + float f = curve_random(kg, sd); + return set_attribute_float(f, type, derivatives, val); + } +#endif + +#ifdef __POINTCLOUD__ + /* Point attributes */ + else if (name == DeviceStrings::u_is_point) { + float f = (sd->type & PRIMITIVE_POINT) != 0; + return 
set_attribute_float(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_point_radius) { + float f = point_radius(kg, sd); + return set_attribute_float(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_point_position) { + float3 f = point_position(kg, sd); + return set_attribute_float3(f, type, derivatives, val); + } + else if (name == DeviceStrings::u_point_random) { + float f = point_random(kg, sd); + return set_attribute_float(f, type, derivatives, val); + } +#endif + + else if (name == DeviceStrings::u_normal_map_normal) { + if (sd->type & PRIMITIVE_TRIANGLE) { + float3 f = triangle_smooth_normal_unnormalized(kg, sd, sd->Ng, sd->prim, sd->u, sd->v); + return set_attribute_float3(f, type, derivatives, val); + } + else { + return false; + } + } + + return get_background_attribute(kg, sd, name, type, derivatives, val); +} + +ccl_device_extern bool osl_get_attribute(ccl_private ShaderGlobals *sg, + int derivatives, + DeviceString object_name, + DeviceString name, + int array_lookup, + int index, + TypeDesc type, + ccl_private void *res) +{ + KernelGlobals kg = nullptr; + ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate); + int object; + + if (object_name != DeviceStrings::_emptystring_) { + /* TODO: Get object index from name */ + return false; + } + else { + object = sd->object; + } + + const uint64_t id = name.hash(); + + const AttributeDescriptor desc = find_attribute(kg, object, sd->prim, sd->type, id); + if (desc.offset != ATTR_STD_NOT_FOUND) { + return get_object_attribute(kg, sd, desc, type, derivatives, res); + } + else { + return get_object_standard_attribute(kg, sd, name, type, derivatives, res); + } +} + +#if 0 +ccl_device_extern bool osl_bind_interpolated_param(ccl_private ShaderGlobals *sg, + DeviceString name, + long long type, + int userdata_has_derivs, + ccl_private void *userdata_data, + int symbol_has_derivs, + ccl_private void *symbol_data, + int symbol_data_size, + 
ccl_private void *userdata_initialized, + int userdata_index) +{ + return false; +} +#endif + +/* Noise */ + +#include "kernel/svm/noise.h" +#include "util/hash.h" + +ccl_device_extern uint osl_hash_ii(int x) +{ + return hash_uint(x); +} + +ccl_device_extern uint osl_hash_if(float x) +{ + return hash_uint(__float_as_uint(x)); +} + +ccl_device_extern uint osl_hash_iff(float x, float y) +{ + return hash_uint2(__float_as_uint(x), __float_as_uint(y)); +} + +ccl_device_extern uint osl_hash_iv(ccl_private const float3 *v) +{ + return hash_uint3(__float_as_uint(v->x), __float_as_uint(v->y), __float_as_uint(v->z)); +} + +ccl_device_extern uint osl_hash_ivf(ccl_private const float3 *v, float w) +{ + return hash_uint4( + __float_as_uint(v->x), __float_as_uint(v->y), __float_as_uint(v->z), __float_as_uint(w)); +} + +ccl_device_extern OSLNoiseOptions *osl_get_noise_options(ccl_private ShaderGlobals *sg) +{ + return nullptr; +} + +ccl_device_extern void osl_noiseparams_set_anisotropic(ccl_private OSLNoiseOptions *opt, + int anisotropic) +{ +} + +ccl_device_extern void osl_noiseparams_set_do_filter(ccl_private OSLNoiseOptions *opt, + int do_filter) +{ +} + +ccl_device_extern void osl_noiseparams_set_direction(ccl_private OSLNoiseOptions *opt, + float3 *direction) +{ +} + +ccl_device_extern void osl_noiseparams_set_bandwidth(ccl_private OSLNoiseOptions *opt, + float bandwidth) +{ +} + +ccl_device_extern void osl_noiseparams_set_impulses(ccl_private OSLNoiseOptions *opt, + float impulses) +{ +} + +#define OSL_NOISE_IMPL(name, op) \ + ccl_device_extern float name##_ff(float x) \ + { \ + return op##_1d(x); \ + } \ + ccl_device_extern float name##_fff(float x, float y) \ + { \ + return op##_2d(make_float2(x, y)); \ + } \ + ccl_device_extern float name##_fv(ccl_private const float3 *v) \ + { \ + return op##_3d(*v); \ + } \ + ccl_device_extern float name##_fvf(ccl_private const float3 *v, float w) \ + { \ + return op##_4d(make_float4(v->x, v->y, v->z, w)); \ + } \ + ccl_device_extern 
void name##_vf(ccl_private float3 *res, float x) \ + { \ + /* TODO: This is not correct. Really need to change the hash function inside the noise \ + * function to spit out a vector instead of a scalar. */ \ + const float n = name##_ff(x); \ + res->x = n; \ + res->y = n; \ + res->z = n; \ + } \ + ccl_device_extern void name##_vff(ccl_private float3 *res, float x, float y) \ + { \ + const float n = name##_fff(x, y); \ + res->x = n; \ + res->y = n; \ + res->z = n; \ + } \ + ccl_device_extern void name##_vv(ccl_private float3 *res, const float3 *v) \ + { \ + const float n = name##_fv(v); \ + res->x = n; \ + res->y = n; \ + res->z = n; \ + } \ + ccl_device_extern void name##_vvf(ccl_private float3 *res, const float3 *v, float w) \ + { \ + const float n = name##_fvf(v, w); \ + res->x = n; \ + res->y = n; \ + res->z = n; \ + } + +ccl_device_forceinline float hashnoise_1d(float p) +{ + const uint x = __float_as_uint(p); + return hash_uint(x) / static_cast<float>(~0u); +} +ccl_device_forceinline float hashnoise_2d(float2 p) +{ + const uint x = __float_as_uint(p.x); + const uint y = __float_as_uint(p.y); + return hash_uint2(x, y) / static_cast<float>(~0u); +} +ccl_device_forceinline float hashnoise_3d(float3 p) +{ + const uint x = __float_as_uint(p.x); + const uint y = __float_as_uint(p.y); + const uint z = __float_as_uint(p.z); + return hash_uint3(x, y, z) / static_cast<float>(~0u); +} +ccl_device_forceinline float hashnoise_4d(float4 p) +{ + const uint x = __float_as_uint(p.x); + const uint y = __float_as_uint(p.y); + const uint z = __float_as_uint(p.z); + const uint w = __float_as_uint(p.w); + return hash_uint4(x, y, z, w) / static_cast<float>(~0u); +} + +/* TODO: Implement all noise functions */ +OSL_NOISE_IMPL(osl_hashnoise, hashnoise) +OSL_NOISE_IMPL(osl_noise, noise) +OSL_NOISE_IMPL(osl_snoise, snoise) + +/* Texturing */ + +ccl_device_extern ccl_private OSLTextureOptions *osl_get_texture_options( + ccl_private ShaderGlobals *sg) +{ + return nullptr; +} + 
+ccl_device_extern void osl_texture_set_firstchannel(ccl_private OSLTextureOptions *opt, + int firstchannel) +{ +} + +ccl_device_extern void osl_texture_set_swrap_code(ccl_private OSLTextureOptions *opt, int mode) +{ +} + +ccl_device_extern void osl_texture_set_twrap_code(ccl_private OSLTextureOptions *opt, int mode) +{ +} + +ccl_device_extern void osl_texture_set_rwrap_code(ccl_private OSLTextureOptions *opt, int mode) +{ +} + +ccl_device_extern void osl_texture_set_stwrap_code(ccl_private OSLTextureOptions *opt, int mode) +{ +} + +ccl_device_extern void osl_texture_set_sblur(ccl_private OSLTextureOptions *opt, float blur) +{ +} + +ccl_device_extern void osl_texture_set_tblur(ccl_private OSLTextureOptions *opt, float blur) +{ +} + +ccl_device_extern void osl_texture_set_rblur(ccl_private OSLTextureOptions *opt, float blur) +{ +} + +ccl_device_extern void osl_texture_set_stblur(ccl_private OSLTextureOptions *opt, float blur) +{ +} + +ccl_device_extern void osl_texture_set_swidth(ccl_private OSLTextureOptions *opt, float width) +{ +} + +ccl_device_extern void osl_texture_set_twidth(ccl_private OSLTextureOptions *opt, float width) +{ +} + +ccl_device_extern void osl_texture_set_rwidth(ccl_private OSLTextureOptions *opt, float width) +{ +} + +ccl_device_extern void osl_texture_set_stwidth(ccl_private OSLTextureOptions *opt, float width) +{ +} + +ccl_device_extern void osl_texture_set_fill(ccl_private OSLTextureOptions *opt, float fill) +{ +} + +ccl_device_extern void osl_texture_set_time(ccl_private OSLTextureOptions *opt, float time) +{ +} + +ccl_device_extern void osl_texture_set_interp_code(ccl_private OSLTextureOptions *opt, int mode) +{ +} + +ccl_device_extern void osl_texture_set_subimage(ccl_private OSLTextureOptions *opt, int subimage) +{ +} + +ccl_device_extern void osl_texture_set_missingcolor_arena(ccl_private OSLTextureOptions *opt, + ccl_private float3 *color) +{ +} + +ccl_device_extern void osl_texture_set_missingcolor_alpha(ccl_private OSLTextureOptions 
*opt, + int nchannels, + float alpha) +{ +} + +ccl_device_extern bool osl_texture(ccl_private ShaderGlobals *sg, + DeviceString filename, + ccl_private void *texture_handle, + OSLTextureOptions *opt, + float s, + float t, + float dsdx, + float dtdx, + float dsdy, + float dtdy, + int nchannels, + ccl_private float *result, + ccl_private float *dresultdx, + ccl_private float *dresultdy, + ccl_private float *alpha, + ccl_private float *dalphadx, + ccl_private float *dalphady, + ccl_private void *errormessage) +{ + if (!texture_handle) { + return false; + } + + /* Only SVM textures are supported. */ + int id = static_cast<int>(reinterpret_cast<size_t>(texture_handle) - 1); + + const float4 rgba = kernel_tex_image_interp(nullptr, id, s, 1.0f - t); + + result[0] = rgba.x; + if (nchannels > 1) + result[1] = rgba.y; + if (nchannels > 2) + result[2] = rgba.z; + if (nchannels > 3) + result[3] = rgba.w; + + return true; +} + +ccl_device_extern bool osl_texture3d(ccl_private ShaderGlobals *sg, + DeviceString filename, + ccl_private void *texture_handle, + OSLTextureOptions *opt, + ccl_private const float3 *P, + ccl_private const float3 *dPdx, + ccl_private const float3 *dPdy, + ccl_private const float3 *dPdz, + int nchannels, + ccl_private float *result, + ccl_private float *dresultds, + ccl_private float *dresultdt, + ccl_private float *alpha, + ccl_private float *dalphadx, + ccl_private float *dalphady, + ccl_private void *errormessage) +{ + if (!texture_handle) { + return false; + } + + /* Only SVM textures are supported. 
*/ + int id = static_cast<int>(reinterpret_cast<size_t>(texture_handle) - 1); + + const float4 rgba = kernel_tex_image_interp_3d(nullptr, id, *P, INTERPOLATION_NONE); + + result[0] = rgba.x; + if (nchannels > 1) + result[1] = rgba.y; + if (nchannels > 2) + result[2] = rgba.z; + if (nchannels > 3) + result[3] = rgba.w; + + return true; +} + +ccl_device_extern bool osl_environment(ccl_private ShaderGlobals *sg, + DeviceString filename, + ccl_private void *texture_handle, + OSLTextureOptions *opt, + ccl_private const float3 *R, + ccl_private const float3 *dRdx, + ccl_private const float3 *dRdy, + int nchannels, + ccl_private float *result, + ccl_private float *dresultds, + ccl_private float *dresultdt, + ccl_private float *alpha, + ccl_private float *dalphax, + ccl_private float *dalphay, + ccl_private void *errormessage) +{ + result[0] = 1.0f; + if (nchannels > 1) + result[1] = 0.0f; + if (nchannels > 2) + result[2] = 1.0f; + if (nchannels > 3) + result[3] = 1.0f; + + return false; +} + +ccl_device_extern bool osl_get_textureinfo(ccl_private ShaderGlobals *sg, + DeviceString filename, + ccl_private void *texture_handle, + DeviceString dataname, + int basetype, + int arraylen, + int aggegrate, + ccl_private void *data, + ccl_private void *errormessage) +{ + return false; +} + +ccl_device_extern bool osl_get_textureinfo_st(ccl_private ShaderGlobals *sg, + DeviceString filename, + ccl_private void *texture_handle, + float s, + float t, + DeviceString dataname, + int basetype, + int arraylen, + int aggegrate, + ccl_private void *data, + ccl_private void *errormessage) +{ + return osl_get_textureinfo( + sg, filename, texture_handle, dataname, basetype, arraylen, aggegrate, data, errormessage); +} + +/* Standard library */ + +#define OSL_OP_IMPL_II(name, op) \ + ccl_device_extern int name##_ii(int a) \ + { \ + return op(a); \ + } +#define OSL_OP_IMPL_IF(name, op) \ + ccl_device_extern int name##_if(float a) \ + { \ + return op(a); \ + } +#define OSL_OP_IMPL_FF(name, op) \ 
+ ccl_device_extern float name##_ff(float a) \ + { \ + return op(a); \ + } +#define OSL_OP_IMPL_DFDF(name, op) \ + ccl_device_extern void name##_dfdf(ccl_private float *res, ccl_private const float *a) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[i]); \ + } \ + } +#define OSL_OP_IMPL_DFDV(name, op) \ + ccl_device_extern void name##_dfdv(ccl_private float *res, ccl_private const float3 *a) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[i]); \ + } \ + } +#define OSL_OP_IMPL_FV(name, op) \ + ccl_device_extern float name##_fv(ccl_private const float3 *a) \ + { \ + return op(*a); \ + } +#define OSL_OP_IMPL_VV(name, op) \ + ccl_device_extern void name##_vv(ccl_private float3 *res, ccl_private const float3 *a) \ + { \ + *res = op(*a); \ + } +#define OSL_OP_IMPL_VV_(name, op) \ + ccl_device_extern void name##_vv(ccl_private float3 *res, ccl_private const float3 *a) \ + { \ + res->x = op(a->x); \ + res->y = op(a->y); \ + res->z = op(a->z); \ + } +#define OSL_OP_IMPL_DVDV(name, op) \ + ccl_device_extern void name##_dvdv(ccl_private float3 *res, ccl_private const float3 *a) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[i]); \ + } \ + } +#define OSL_OP_IMPL_DVDV_(name, op) \ + ccl_device_extern void name##_dvdv(ccl_private float3 *res, ccl_private const float3 *a) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i].x = op(a[i].x); \ + res[i].y = op(a[i].y); \ + res[i].z = op(a[i].z); \ + } \ + } + +#define OSL_OP_IMPL_III(name, op) \ + ccl_device_extern int name##_iii(int a, int b) \ + { \ + return op(a, b); \ + } +#define OSL_OP_IMPL_FFF(name, op) \ + ccl_device_extern float name##_fff(float a, float b) \ + { \ + return op(a, b); \ + } +#define OSL_OP_IMPL_FVV(name, op) \ + ccl_device_extern float name##_fvv(ccl_private const float3 *a, ccl_private const float3 *b) \ + { \ + return op(*a, *b); \ + } +#define OSL_OP_IMPL_DFFDF(name, op) \ + ccl_device_extern void name##_dffdf( \ + ccl_private float *res, float a, ccl_private const float *b) \ 
+ { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a, b[i]); \ + } \ + } +#define OSL_OP_IMPL_DFDFF(name, op) \ + ccl_device_extern void name##_dfdff( \ + ccl_private float *res, ccl_private const float *a, float b) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[i], b); \ + } \ + } +#define OSL_OP_IMPL_DFDFDF(name, op) \ + ccl_device_extern void name##_dfdfdf( \ + ccl_private float *res, ccl_private const float *a, ccl_private const float *b) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[i], b[i]); \ + } \ + } +#define OSL_OP_IMPL_DFVDV(name, op) \ + ccl_device_extern void name##_dfvdv( \ + ccl_private float *res, ccl_private const float3 *a, ccl_private const float3 *b) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[0], b[i]); \ + } \ + } +#define OSL_OP_IMPL_DFDVV(name, op) \ + ccl_device_extern void name##_dfdvv( \ + ccl_private float *res, ccl_private const float3 *a, ccl_private const float3 *b) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[i], b[0]); \ + } \ + } +#define OSL_OP_IMPL_DFDVDV(name, op) \ + ccl_device_extern void name##_dfdvdv( \ + ccl_private float *res, ccl_private const float3 *a, ccl_private const float3 *b) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[i], b[i]); \ + } \ + } +#define OSL_OP_IMPL_VVF_(name, op) \ + ccl_device_extern void name##_vvf( \ + ccl_private float3 *res, ccl_private const float3 *a, float b) \ + { \ + res->x = op(a->x, b); \ + res->y = op(a->y, b); \ + res->z = op(a->z, b); \ + } +#define OSL_OP_IMPL_VVV(name, op) \ + ccl_device_extern void name##_vvv( \ + ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \ + { \ + *res = op(*a, *b); \ + } +#define OSL_OP_IMPL_VVV_(name, op) \ + ccl_device_extern void name##_vvv( \ + ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \ + { \ + res->x = op(a->x, b->x); \ + res->y = op(a->y, b->y); \ + res->z = op(a->z, b->z); \ + } +#define 
OSL_OP_IMPL_DVVDF_(name, op) \ + ccl_device_extern void name##_dvvdf( \ + ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float *b) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i].x = op(a[0].x, b[i]); \ + res[i].y = op(a[0].y, b[i]); \ + res[i].z = op(a[0].z, b[i]); \ + } \ + } +#define OSL_OP_IMPL_DVDVF_(name, op) \ + ccl_device_extern void name##_dvdvf( \ + ccl_private float3 *res, ccl_private const float3 *a, float b) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i].x = op(a[i].x, b); \ + res[i].y = op(a[i].y, b); \ + res[i].z = op(a[i].z, b); \ + } \ + } +#define OSL_OP_IMPL_DVVDV(name, op) \ + ccl_device_extern void name##_dvvdv( \ + ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[0], b[i]); \ + } \ + } +#define OSL_OP_IMPL_DVVDV_(name, op) \ + ccl_device_extern void name##_dvvdv( \ + ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i].x = op(a[0].x, b[i].x); \ + res[i].y = op(a[0].y, b[i].y); \ + res[i].z = op(a[0].z, b[i].z); \ + } \ + } +#define OSL_OP_IMPL_DVDVV(name, op) \ + ccl_device_extern void name##_dvdvv( \ + ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[i], b[0]); \ + } \ + } +#define OSL_OP_IMPL_DVDVV_(name, op) \ + ccl_device_extern void name##_dvdvv( \ + ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i].x = op(a[i].x, b[0].x); \ + res[i].y = op(a[i].y, b[0].y); \ + res[i].z = op(a[i].z, b[0].z); \ + } \ + } +#define OSL_OP_IMPL_DVDVDF_(name, op) \ + ccl_device_extern void name##_dvdvdf( \ + ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float *b) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i].x = op(a[i].x, b[i]); \ + res[i].y = op(a[i].y, 
b[i]); \ + res[i].z = op(a[i].z, b[i]); \ + } \ + } +#define OSL_OP_IMPL_DVDVDV(name, op) \ + ccl_device_extern void name##_dvdvdv( \ + ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[i], b[i]); \ + } \ + } +#define OSL_OP_IMPL_DVDVDV_(name, op) \ + ccl_device_extern void name##_dvdvdv( \ + ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i].x = op(a[i].x, b[i].x); \ + res[i].y = op(a[i].y, b[i].y); \ + res[i].z = op(a[i].z, b[i].z); \ + } \ + } + +#define OSL_OP_IMPL_FFFF(name, op) \ + ccl_device_extern float name##_ffff(float a, float b, float c) \ + { \ + return op(a, b, c); \ + } +#define OSL_OP_IMPL_DFFFDF(name, op) \ + ccl_device_extern void name##_dfffdf( \ + ccl_private float *res, float a, float b, ccl_private const float *c) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a, b, c[i]); \ + } \ + } +#define OSL_OP_IMPL_DFFDFF(name, op) \ + ccl_device_extern void name##_dffdff( \ + ccl_private float *res, float a, ccl_private const float *b, float c) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a, b[i], c); \ + } \ + } +#define OSL_OP_IMPL_DFFDFDF(name, op) \ + ccl_device_extern void name##_dffdfdf( \ + ccl_private float *res, float a, ccl_private const float *b, ccl_private const float *c) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a, b[i], c[i]); \ + } \ + } + +#define OSL_OP_IMPL_DFDFFF(name, op) \ + ccl_device_extern void name##_dfdfff( \ + ccl_private float *res, ccl_private const float *a, float b, float c) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[i], b, c); \ + } \ + } +#define OSL_OP_IMPL_DFDFFDF(name, op) \ + ccl_device_extern void name##_dfdffdf( \ + ccl_private float *res, ccl_private const float *a, float b, ccl_private const float *c) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[i], b, c[i]); \ + } \ + } +#define 
OSL_OP_IMPL_DFDFDFF(name, op) \ + ccl_device_extern void name##_dfdfdff( \ + ccl_private float *res, ccl_private const float *a, ccl_private const float *b, float c) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[i], b[i], c); \ + } \ + } +#define OSL_OP_IMPL_DFDFDFDF(name, op) \ + ccl_device_extern void name##_dfdfdfdf(ccl_private float *res, \ + ccl_private const float *a, \ + ccl_private const float *b, \ + ccl_private const float *c) \ + { \ + for (int i = 0; i < 3; ++i) { \ + res[i] = op(a[i], b[i], c[i]); \ + } \ + } + +#define OSL_OP_IMPL_XX(name, op) \ + OSL_OP_IMPL_FF(name, op) \ + OSL_OP_IMPL_DFDF(name, op) \ + OSL_OP_IMPL_VV_(name, op) \ + OSL_OP_IMPL_DVDV_(name, op) + +#define OSL_OP_IMPL_XXX(name, op) \ + OSL_OP_IMPL_FFF(name, op) \ + OSL_OP_IMPL_DFFDF(name, op) \ + OSL_OP_IMPL_DFDFF(name, op) \ + OSL_OP_IMPL_DFDFDF(name, op) \ + OSL_OP_IMPL_VVV_(name, op) \ + OSL_OP_IMPL_DVVDV_(name, op) \ + OSL_OP_IMPL_DVDVV_(name, op) \ + OSL_OP_IMPL_DVDVDV_(name, op) + +OSL_OP_IMPL_XX(osl_acos, acosf) +OSL_OP_IMPL_XX(osl_asin, asinf) +OSL_OP_IMPL_XX(osl_atan, atanf) +OSL_OP_IMPL_XXX(osl_atan2, atan2f) +OSL_OP_IMPL_XX(osl_cos, cosf) +OSL_OP_IMPL_XX(osl_sin, sinf) +OSL_OP_IMPL_XX(osl_tan, tanf) +OSL_OP_IMPL_XX(osl_cosh, coshf) +OSL_OP_IMPL_XX(osl_sinh, sinhf) +OSL_OP_IMPL_XX(osl_tanh, tanhf) + +ccl_device_forceinline int safe_divide(int a, int b) +{ + return (b != 0) ? a / b : 0; +} +ccl_device_forceinline int safe_modulo(int a, int b) +{ + return (b != 0) ? 
a % b : 0; +} + +OSL_OP_IMPL_III(osl_safe_div, safe_divide) +OSL_OP_IMPL_FFF(osl_safe_div, safe_divide) +OSL_OP_IMPL_III(osl_safe_mod, safe_modulo) + +ccl_device_extern void osl_sincos_fff(float a, ccl_private float *b, ccl_private float *c) +{ + sincos(a, b, c); +} +ccl_device_extern void osl_sincos_dfdff(ccl_private const float *a, + ccl_private float *b, + ccl_private float *c) +{ + for (int i = 2; i >= 0; --i) /* Value component last: the scalar *c must end up as cos(a[0]), not cos of a derivative. */ + sincos(a[i], b + i, c); +} +ccl_device_extern void osl_sincos_dffdf(ccl_private const float *a, + ccl_private float *b, + ccl_private float *c) +{ + for (int i = 2; i >= 0; --i) /* Value component last: the scalar *b must end up as sin(a[0]), not sin of a derivative. */ + sincos(a[i], b, c + i); +} +ccl_device_extern void osl_sincos_dfdfdf(ccl_private const float *a, + ccl_private float *b, + ccl_private float *c) +{ + for (int i = 0; i < 3; ++i) + sincos(a[i], b + i, c + i); +} +ccl_device_extern void osl_sincos_vvv(ccl_private const float3 *a, + ccl_private float3 *b, + ccl_private float3 *c) +{ + sincos(a->x, &b->x, &c->x); + sincos(a->y, &b->y, &c->y); + sincos(a->z, &b->z, &c->z); +} +ccl_device_extern void osl_sincos_dvdvv(ccl_private const float3 *a, + ccl_private float3 *b, + ccl_private float3 *c) +{ + for (int i = 2; i >= 0; --i) { /* Value component last: the plain vector *c must end up as cos(a[0]). */ + sincos(a[i].x, &b[i].x, &c->x); + sincos(a[i].y, &b[i].y, &c->y); + sincos(a[i].z, &b[i].z, &c->z); + } +} +ccl_device_extern void osl_sincos_dvvdv(ccl_private const float3 *a, + ccl_private float3 *b, + ccl_private float3 *c) +{ + for (int i = 2; i >= 0; --i) { /* Value component last: the plain vector *b must end up as sin(a[0]). */ + sincos(a[i].x, &b->x, &c[i].x); + sincos(a[i].y, &b->y, &c[i].y); + sincos(a[i].z, &b->z, &c[i].z); + } +} +ccl_device_extern void osl_sincos_dvdvdv(ccl_private const float3 *a, + ccl_private float3 *b, + ccl_private float3 *c) +{ + for (int i = 0; i < 3; ++i) { + sincos(a[i].x, &b[i].x, &c[i].x); + sincos(a[i].y, &b[i].y, &c[i].y); + sincos(a[i].z, &b[i].z, &c[i].z); + } +} + +OSL_OP_IMPL_XX(osl_log, logf) +OSL_OP_IMPL_XX(osl_log2, log2f) +OSL_OP_IMPL_XX(osl_log10, log10f) +OSL_OP_IMPL_XX(osl_exp, expf) +OSL_OP_IMPL_XX(osl_exp2, exp2f) 
+OSL_OP_IMPL_XX(osl_expm1, expm1f) +OSL_OP_IMPL_XX(osl_erf, erff) +OSL_OP_IMPL_XX(osl_erfc, erfcf) + +OSL_OP_IMPL_XXX(osl_pow, safe_powf) +OSL_OP_IMPL_VVF_(osl_pow, safe_powf) +OSL_OP_IMPL_DVVDF_(osl_pow, safe_powf) +OSL_OP_IMPL_DVDVF_(osl_pow, safe_powf) +OSL_OP_IMPL_DVDVDF_(osl_pow, safe_powf) + +OSL_OP_IMPL_XX(osl_sqrt, sqrtf) +OSL_OP_IMPL_XX(osl_inversesqrt, 1.0f / sqrtf) +OSL_OP_IMPL_XX(osl_cbrt, cbrtf) + +OSL_OP_IMPL_FF(osl_logb, logbf) +OSL_OP_IMPL_VV_(osl_logb, logbf) + +OSL_OP_IMPL_FF(osl_floor, floorf) +OSL_OP_IMPL_VV_(osl_floor, floorf) +OSL_OP_IMPL_FF(osl_ceil, ceilf) +OSL_OP_IMPL_VV_(osl_ceil, ceilf) +OSL_OP_IMPL_FF(osl_round, roundf) +OSL_OP_IMPL_VV_(osl_round, roundf) +OSL_OP_IMPL_FF(osl_trunc, truncf) +OSL_OP_IMPL_VV_(osl_trunc, truncf) + +ccl_device_forceinline float step_impl(float edge, float x) +{ + return x < edge ? 0.0f : 1.0f; +} + +OSL_OP_IMPL_FF(osl_sign, compatible_signf) +OSL_OP_IMPL_VV_(osl_sign, compatible_signf) +OSL_OP_IMPL_FFF(osl_step, step_impl) +OSL_OP_IMPL_VVV_(osl_step, step_impl) + +OSL_OP_IMPL_IF(osl_isnan, isnan) +OSL_OP_IMPL_IF(osl_isinf, isinf) +OSL_OP_IMPL_IF(osl_isfinite, isfinite) + +OSL_OP_IMPL_II(osl_abs, abs) +OSL_OP_IMPL_XX(osl_abs, fabsf) +OSL_OP_IMPL_II(osl_fabs, abs) +OSL_OP_IMPL_XX(osl_fabs, fabsf) +OSL_OP_IMPL_XXX(osl_fmod, safe_modulo) + +OSL_OP_IMPL_FFFF(osl_smoothstep, smoothstep) +OSL_OP_IMPL_DFFFDF(osl_smoothstep, smoothstep) +OSL_OP_IMPL_DFFDFF(osl_smoothstep, smoothstep) +OSL_OP_IMPL_DFFDFDF(osl_smoothstep, smoothstep) +OSL_OP_IMPL_DFDFFF(osl_smoothstep, smoothstep) +OSL_OP_IMPL_DFDFFDF(osl_smoothstep, smoothstep) +OSL_OP_IMPL_DFDFDFF(osl_smoothstep, smoothstep) +OSL_OP_IMPL_DFDFDFDF(osl_smoothstep, smoothstep) + +OSL_OP_IMPL_FVV(osl_dot, dot) +OSL_OP_IMPL_DFDVV(osl_dot, dot) +OSL_OP_IMPL_DFVDV(osl_dot, dot) +OSL_OP_IMPL_DFDVDV(osl_dot, dot) +OSL_OP_IMPL_VVV(osl_cross, cross) +OSL_OP_IMPL_DVDVV(osl_cross, cross) +OSL_OP_IMPL_DVVDV(osl_cross, cross) +OSL_OP_IMPL_DVDVDV(osl_cross, cross) 
+OSL_OP_IMPL_FV(osl_length, len) +OSL_OP_IMPL_DFDV(osl_length, len) +OSL_OP_IMPL_FVV(osl_distance, distance) +OSL_OP_IMPL_DFDVV(osl_distance, distance) +OSL_OP_IMPL_DFVDV(osl_distance, distance) +OSL_OP_IMPL_DFDVDV(osl_distance, distance) +OSL_OP_IMPL_VV(osl_normalize, safe_normalize) +OSL_OP_IMPL_DVDV(osl_normalize, safe_normalize) + +ccl_device_extern void osl_calculatenormal(ccl_private float3 *res, + ccl_private ShaderGlobals *sg, + ccl_private const float3 *p) +{ + if (sg->flipHandedness) + *res = cross(p[2], p[1]); + else + *res = cross(p[1], p[2]); +} + +ccl_device_extern float osl_area(ccl_private const float3 *p) +{ + return len(cross(p[2], p[1])); +} + +ccl_device_extern float osl_filterwidth_fdf(ccl_private const float *x) +{ + return sqrtf(x[1] * x[1] + x[2] * x[2]); +} + +ccl_device_extern void osl_filterwidth_vdv(ccl_private float *res, ccl_private const float *x) +{ + for (int i = 0; i < 3; ++i) /* x is a dual float3 viewed as floats: value in [0..2], d/dx in [3..5], d/dy in [6..8]. */ + res[i] = sqrtf(x[3 + i] * x[3 + i] + x[6 + i] * x[6 + i]); +} + +ccl_device_extern bool osl_raytype_bit(ccl_private ShaderGlobals *sg, int bit) +{ + return (sg->raytype & bit) != 0; +} diff --git a/intern/cycles/kernel/osl/services_optix.cu b/intern/cycles/kernel/osl/services_optix.cu new file mode 100644 index 00000000000..2a43a89a956 --- /dev/null +++ b/intern/cycles/kernel/osl/services_optix.cu @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#define WITH_OSL + +// clang-format off +#include "kernel/device/optix/compat.h" +#include "kernel/device/optix/globals.h" + +#include "kernel/device/gpu/image.h" /* Texture lookup uses normal CUDA intrinsics. 
*/ + +#include "kernel/osl/services_gpu.h" +// clang-format on + +extern "C" __device__ void __direct_callable__dummy_services() +{ +} diff --git a/intern/cycles/kernel/osl/types.h b/intern/cycles/kernel/osl/types.h index 46e06114360..717306a3d07 100644 --- a/intern/cycles/kernel/osl/types.h +++ b/intern/cycles/kernel/osl/types.h @@ -5,9 +5,53 @@ CCL_NAMESPACE_BEGIN +struct DeviceString { +#if defined(__KERNEL_GPU__) + /* Strings are represented by their hashes in CUDA and OptiX. */ + size_t str_; + + ccl_device_inline_method uint64_t hash() const + { + return str_; + } +#elif defined(OPENIMAGEIO_USTRING_H) + ustring str_; + + ccl_device_inline_method uint64_t hash() const + { + return str_.hash(); + } +#else + const char *str_; +#endif + + ccl_device_inline_method bool operator==(DeviceString b) const + { + return str_ == b.str_; + } + ccl_device_inline_method bool operator!=(DeviceString b) const + { + return str_ != b.str_; + } +}; + +ccl_device_inline DeviceString make_string(const char *str, size_t hash) +{ +#if defined(__KERNEL_GPU__) + (void)str; + return {hash}; +#elif defined(OPENIMAGEIO_USTRING_H) + (void)hash; + return {ustring(str)}; +#else + (void)hash; + return {str}; +#endif +} + /* Closure */ -enum ClosureTypeOSL { +enum OSLClosureType { OSL_CLOSURE_MUL_ID = -1, OSL_CLOSURE_ADD_ID = -2, @@ -17,4 +61,60 @@ enum ClosureTypeOSL { #include "closures_template.h" }; +struct OSLClosure { + OSLClosureType id; +}; + +struct ccl_align(8) OSLClosureMul : public OSLClosure +{ + packed_float3 weight; + ccl_private const OSLClosure *closure; +}; + +struct ccl_align(8) OSLClosureAdd : public OSLClosure +{ + ccl_private const OSLClosure *closureA; + ccl_private const OSLClosure *closureB; +}; + +struct ccl_align(8) OSLClosureComponent : public OSLClosure +{ + packed_float3 weight; +}; + +/* Globals */ + +struct ShaderGlobals { + packed_float3 P, dPdx, dPdy; + packed_float3 dPdz; + packed_float3 I, dIdx, dIdy; + packed_float3 N; + packed_float3 Ng; + float u, dudx, 
dudy; + float v, dvdx, dvdy; + packed_float3 dPdu, dPdv; + float time; + float dtime; + packed_float3 dPdtime; + packed_float3 Ps, dPsdx, dPsdy; + ccl_private void *renderstate; + ccl_private void *tracedata; + ccl_private void *objdata; + void *context; + void *renderer; + ccl_private void *object2common; + ccl_private void *shader2common; + ccl_private OSLClosure *Ci; + float surfacearea; + int raytype; + int flipHandedness; + int backfacing; +}; + +struct OSLNoiseOptions { +}; + +struct OSLTextureOptions { +}; + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/noise.h b/intern/cycles/kernel/svm/noise.h index 31e77d87413..209195a03f1 100644 --- a/intern/cycles/kernel/svm/noise.h +++ b/intern/cycles/kernel/svm/noise.h @@ -39,11 +39,11 @@ ccl_device_noinline_cpu float perlin_1d(float x) } /* 2D, 3D, and 4D noise can be accelerated using SSE, so we first check if - * SSE is supported, that is, if __KERNEL_SSE2__ is defined. If it is not + * SSE is supported, that is, if __KERNEL_SSE__ is defined. If it is not * supported, we do a standard implementation, but if it is supported, we * do an implementation using SSE intrinsics. */ -#if !defined(__KERNEL_SSE2__) +#if !defined(__KERNEL_SSE__) /* ** Standard Implementation ** */ @@ -250,18 +250,18 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) /* SSE Bilinear Interpolation: * - * The function takes two ssef inputs: + * The function takes two float4 inputs: * - p : Contains the values at the points (v0, v1, v2, v3). * - f : Contains the values (x, y, _, _). The third and fourth values are unused. * * The interpolation is done in two steps: * 1. Interpolate (v0, v1) and (v2, v3) along the x axis to get g (g0, g1). * (v2, v3) is generated by moving v2 and v3 to the first and second - * places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and + * places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and * fourth values are unused. * 2. 
Interpolate g0 and g1 along the y axis to get the final value. - * g1 is generated by populating an ssef with the second value of g. - * Only the first value is important in the final ssef. + * g1 is generated by populating a float4 with the second value of g. + * Only the first value is important in the final float4. * * v1 v3 g1 * @ + + + + @ @ y @@ -272,27 +272,27 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) * v0 v2 g0 * */ -ccl_device_inline ssef bi_mix(ssef p, ssef f) +ccl_device_inline float4 bi_mix(float4 p, float4 f) { - ssef g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f)); + float4 g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f)); return mix(g, shuffle<1>(g), shuffle<1>(f)); } -ccl_device_inline ssef fade(const ssef &t) +ccl_device_inline float4 fade(const float4 t) { - ssef a = madd(t, 6.0f, -15.0f); - ssef b = madd(t, a, 10.0f); + float4 a = madd(t, make_float4(6.0f), make_float4(-15.0f)); + float4 b = madd(t, a, make_float4(10.0f)); return (t * t) * (t * b); } /* Negate val if the nth bit of h is 1. 
*/ # define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n)))) -ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y) +ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y) { - ssei h = hash & 7; - ssef u = select(h < 4, x, y); - ssef v = 2.0f * select(h < 4, y, x); + int4 h = hash & 7; + float4 u = select(h < 4, x, y); + float4 v = 2.0f * select(h < 4, y, x); return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1); } @@ -310,28 +310,28 @@ ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y) */ ccl_device_noinline_cpu float perlin_2d(float x, float y) { - ssei XY; - ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY); - ssef uv = fade(fxy); + int4 XY; + float4 fxy = floorfrac(make_float4(x, y, 0.0f, 0.0f), &XY); + float4 uv = fade(fxy); - ssei XY1 = XY + 1; - ssei X = shuffle<0, 0, 0, 0>(XY, XY1); - ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1)); + int4 XY1 = XY + make_int4(1); + int4 X = shuffle<0, 0, 0, 0>(XY, XY1); + int4 Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1)); - ssei h = hash_ssei2(X, Y); + int4 h = hash_int4_2(X, Y); - ssef fxy1 = fxy - 1.0f; - ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1); - ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1)); + float4 fxy1 = fxy - make_float4(1.0f); + float4 fx = shuffle<0, 0, 0, 0>(fxy, fxy1); + float4 fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1)); - ssef g = grad(h, fx, fy); + float4 g = grad(h, fx, fy); return extract<0>(bi_mix(g, uv)); } /* SSE Trilinear Interpolation: * - * The function takes three ssef inputs: + * The function takes three float4 inputs: * - p : Contains the values at the points (v0, v1, v2, v3). * - q : Contains the values at the points (v4, v5, v6, v7). * - f : Contains the values (x, y, z, _). The fourth value is unused. @@ -340,11 +340,11 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y) * 1. 
Interpolate p and q along the x axis to get s (s0, s1, s2, s3). * 2. Interpolate (s0, s1) and (s2, s3) along the y axis to get g (g0, g1). * (s2, s3) is generated by moving v2 and v3 to the first and second - * places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and + * places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and * fourth values are unused. * 3. Interpolate g0 and g1 along the z axis to get the final value. - * g1 is generated by populating an ssef with the second value of g. - * Only the first value is important in the final ssef. + * g1 is generated by populating an float4 with the second value of g. + * Only the first value is important in the final float4. * * v3 v7 * @ + + + + + + @ s3 @ @@ -362,10 +362,10 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y) * @ + + + + + + @ @ * v0 v4 s0 */ -ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f) +ccl_device_inline float4 tri_mix(float4 p, float4 q, float4 f) { - ssef s = mix(p, q, shuffle<0>(f)); - ssef g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f)); + float4 s = mix(p, q, shuffle<0>(f)); + float4 g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f)); return mix(g, shuffle<1>(g), shuffle<2>(f)); } @@ -374,24 +374,24 @@ ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f) * supported, we do an SSE implementation, but if it is supported, * we do an implementation using AVX intrinsics. 
*/ -# if !defined(__KERNEL_AVX__) +# if !defined(__KERNEL_AVX2__) -ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z) +ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y, const float4 z) { - ssei h = hash & 15; - ssef u = select(h < 8, x, y); - ssef vt = select((h == 12) | (h == 14), x, z); - ssef v = select(h < 4, y, vt); + int4 h = hash & 15; + float4 u = select(h < 8, x, y); + float4 vt = select((h == 12) | (h == 14), x, z); + float4 v = select(h < 4, y, vt); return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1); } -ccl_device_inline ssef -grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &w) +ccl_device_inline float4 +grad(const int4 hash, const float4 x, const float4 y, const float4 z, const float4 w) { - ssei h = hash & 31; - ssef u = select(h < 24, x, y); - ssef v = select(h < 16, y, z); - ssef s = select(h < 8, z, w); + int4 h = hash & 31; + float4 u = select(h < 24, x, y); + float4 v = select(h < 16, y, z); + float4 s = select(h < 8, z, w); return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2); } @@ -401,7 +401,7 @@ grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef & * between two trilinear interpolations. 
* */ -ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f) +ccl_device_inline float4 quad_mix(float4 p, float4 q, float4 r, float4 s, float4 f) { return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f)); } @@ -427,23 +427,23 @@ ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f) */ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z) { - ssei XYZ; - ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ); - ssef uvw = fade(fxyz); + int4 XYZ; + float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ); + float4 uvw = fade(fxyz); - ssei XYZ1 = XYZ + 1; - ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1); - ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1)); + int4 XYZ1 = XYZ + make_int4(1); + int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1); + int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1)); - ssei h1 = hash_ssei3(shuffle<0>(XYZ), Y, Z); - ssei h2 = hash_ssei3(shuffle<0>(XYZ1), Y, Z); + int4 h1 = hash_int4_3(shuffle<0>(XYZ), Y, Z); + int4 h2 = hash_int4_3(shuffle<0>(XYZ1), Y, Z); - ssef fxyz1 = fxyz - 1.0f; - ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1); - ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1)); + float4 fxyz1 = fxyz - make_float4(1.0f); + float4 fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1); + float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1)); - ssef g1 = grad(h1, shuffle<0>(fxyz), fy, fz); - ssef g2 = grad(h2, shuffle<0>(fxyz1), fy, fz); + float4 g1 = grad(h1, shuffle<0>(fxyz), fy, fz); + float4 g2 = grad(h2, shuffle<0>(fxyz1), fy, fz); return extract<0>(tri_mix(g1, g2, uvw)); } @@ -481,29 +481,29 @@ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z) */ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) { - ssei XYZW; - ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW); - ssef uvws = fade(fxyzw); + int4 XYZW; + float4 fxyzw = floorfrac(make_float4(x, y, z, w), &XYZW); + float4 uvws = fade(fxyzw); - ssei XYZW1 = XYZW + 1; - ssei Y = shuffle<1, 
1, 1, 1>(XYZW, XYZW1); - ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1)); + int4 XYZW1 = XYZW + make_int4(1); + int4 Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1); + int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1)); - ssei h1 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW)); - ssei h2 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW)); + int4 h1 = hash_int4_4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW)); + int4 h2 = hash_int4_4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW)); - ssei h3 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1)); - ssei h4 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1)); + int4 h3 = hash_int4_4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1)); + int4 h4 = hash_int4_4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1)); - ssef fxyzw1 = fxyzw - 1.0f; - ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1); - ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1)); + float4 fxyzw1 = fxyzw - make_float4(1.0f); + float4 fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1); + float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1)); - ssef g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw)); - ssef g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw)); + float4 g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw)); + float4 g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw)); - ssef g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1)); - ssef g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1)); + float4 g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1)); + float4 g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1)); return extract<0>(quad_mix(g1, g2, g3, g4, uvws)); } @@ -512,22 +512,22 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) /* AVX Implementation */ -ccl_device_inline avxf grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z) +ccl_device_inline vfloat8 grad(const vint8 hash, const vfloat8 x, const vfloat8 
y, const vfloat8 z) { - avxi h = hash & 15; - avxf u = select(h < 8, x, y); - avxf vt = select((h == 12) | (h == 14), x, z); - avxf v = select(h < 4, y, vt); + vint8 h = hash & 15; + vfloat8 u = select(h < 8, x, y); + vfloat8 vt = select((h == 12) | (h == 14), x, z); + vfloat8 v = select(h < 4, y, vt); return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1); } -ccl_device_inline avxf -grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &w) +ccl_device_inline vfloat8 +grad(const vint8 hash, const vfloat8 x, const vfloat8 y, const vfloat8 z, const vfloat8 w) { - avxi h = hash & 31; - avxf u = select(h < 24, x, y); - avxf v = select(h < 16, y, z); - avxf s = select(h < 8, z, w); + vint8 h = hash & 31; + vfloat8 u = select(h < 24, x, y); + vfloat8 v = select(h < 16, y, z); + vfloat8 s = select(h < 8, z, w); return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2); } @@ -537,13 +537,13 @@ grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf & * 1. Interpolate p and q along the w axis to get s. * 2. Trilinearly interpolate (s0, s1, s2, s3) and (s4, s5, s6, s7) to get the final * value. (s0, s1, s2, s3) and (s4, s5, s6, s7) are generated by extracting the - * low and high ssef from s. + * low and high float4 from s. 
* */ -ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f) +ccl_device_inline float4 quad_mix(vfloat8 p, vfloat8 q, float4 f) { - ssef fv = shuffle<3>(f); - avxf s = mix(p, q, avxf(fv, fv)); + float4 fv = shuffle<3>(f); + vfloat8 s = mix(p, q, make_vfloat8(fv, fv)); return tri_mix(low(s), high(s), f); } @@ -565,25 +565,25 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f) */ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z) { - ssei XYZ; - ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ); - ssef uvw = fade(fxyz); + int4 XYZ; + float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ); + float4 uvw = fade(fxyz); - ssei XYZ1 = XYZ + 1; - ssei X = shuffle<0>(XYZ); - ssei X1 = shuffle<0>(XYZ1); - ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1); - ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1)); + int4 XYZ1 = XYZ + make_int4(1); + int4 X = shuffle<0>(XYZ); + int4 X1 = shuffle<0>(XYZ1); + int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1); + int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1)); - avxi h = hash_avxi3(avxi(X, X1), avxi(Y, Y), avxi(Z, Z)); + vint8 h = hash_int8_3(make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z)); - ssef fxyz1 = fxyz - 1.0f; - ssef fx = shuffle<0>(fxyz); - ssef fx1 = shuffle<0>(fxyz1); - ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1); - ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1)); + float4 fxyz1 = fxyz - make_float4(1.0f); + float4 fx = shuffle<0>(fxyz); + float4 fx1 = shuffle<0>(fxyz1); + float4 fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1); + float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1)); - avxf g = grad(h, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz)); + vfloat8 g = grad(h, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz)); return extract<0>(tri_mix(low(g), high(g), uvw)); } @@ -617,31 +617,37 @@ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z) */ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) { - ssei XYZW; - ssef 
fxyzw = floorfrac(ssef(x, y, z, w), &XYZW); - ssef uvws = fade(fxyzw); - - ssei XYZW1 = XYZW + 1; - ssei X = shuffle<0>(XYZW); - ssei X1 = shuffle<0>(XYZW1); - ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1); - ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1)); - ssei W = shuffle<3>(XYZW); - ssei W1 = shuffle<3>(XYZW1); - - avxi h1 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W, W)); - avxi h2 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W1, W1)); - - ssef fxyzw1 = fxyzw - 1.0f; - ssef fx = shuffle<0>(fxyzw); - ssef fx1 = shuffle<0>(fxyzw1); - ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1); - ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1)); - ssef fw = shuffle<3>(fxyzw); - ssef fw1 = shuffle<3>(fxyzw1); - - avxf g1 = grad(h1, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw, fw)); - avxf g2 = grad(h2, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw1, fw1)); + int4 XYZW; + float4 fxyzw = floorfrac(make_float4(x, y, z, w), &XYZW); + float4 uvws = fade(fxyzw); + + int4 XYZW1 = XYZW + make_int4(1); + int4 X = shuffle<0>(XYZW); + int4 X1 = shuffle<0>(XYZW1); + int4 Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1); + int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1)); + int4 W = shuffle<3>(XYZW); + int4 W1 = shuffle<3>(XYZW1); + + vint8 h1 = hash_int8_4(make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z), make_vint8(W, W)); + vint8 h2 = hash_int8_4( + make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z), make_vint8(W1, W1)); + + float4 fxyzw1 = fxyzw - make_float4(1.0f); + float4 fx = shuffle<0>(fxyzw); + float4 fx1 = shuffle<0>(fxyzw1); + float4 fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1); + float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1)); + float4 fw = shuffle<3>(fxyzw); + float4 fw1 = shuffle<3>(fxyzw1); + + vfloat8 g1 = grad( + h1, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz), make_vfloat8(fw, fw)); + vfloat8 g2 = grad(h2, + make_vfloat8(fx, fx1), + make_vfloat8(fy, fy), + 
make_vfloat8(fz, fz), + make_vfloat8(fw1, fw1)); return extract<0>(quad_mix(g1, g2, uvws)); } diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index 24c5a6a4540..a6f8914a9b8 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -75,10 +75,14 @@ CCL_NAMESPACE_BEGIN #define __VOLUME__ /* Device specific features */ -#ifndef __KERNEL_GPU__ -# ifdef WITH_OSL -# define __OSL__ +#ifdef WITH_OSL +# define __OSL__ +# ifdef __KERNEL_OPTIX__ +/* Kernels with OSL support are built separately in OptiX and don't need SVM. */ +# undef __SVM__ # endif +#endif +#ifndef __KERNEL_GPU__ # ifdef WITH_PATH_GUIDING # define __PATH_GUIDING__ # endif @@ -917,9 +921,13 @@ typedef struct ccl_align(16) ShaderData float ray_dP; #ifdef __OSL__ +# ifdef __KERNEL_GPU__ + ccl_private uint8_t *osl_closure_pool; +# else const struct KernelGlobalsCPU *osl_globals; const struct IntegratorStateCPU *osl_path_state; const struct IntegratorShadowStateCPU *osl_shadow_path_state; +# endif #endif /* LCG state for closures that require additional random numbers. */ @@ -1529,6 +1537,9 @@ enum KernelFeatureFlag : uint32_t { /* Path guiding. */ KERNEL_FEATURE_PATH_GUIDING = (1U << 26U), + + /* OSL. */ + KERNEL_FEATURE_OSL = (1U << 27U), }; /* Shader node feature mask, to specialize shader evaluation for kernels. 
*/ diff --git a/intern/cycles/scene/osl.cpp b/intern/cycles/scene/osl.cpp index 93839facdbe..4dc5fb4edf7 100644 --- a/intern/cycles/scene/osl.cpp +++ b/intern/cycles/scene/osl.cpp @@ -38,16 +38,17 @@ OSL::TextureSystem *OSLShaderManager::ts_shared = NULL; int OSLShaderManager::ts_shared_users = 0; thread_mutex OSLShaderManager::ts_shared_mutex; -OSL::ShadingSystem *OSLShaderManager::ss_shared = NULL; -OSLRenderServices *OSLShaderManager::services_shared = NULL; +OSL::ErrorHandler OSLShaderManager::errhandler; +map<int, OSL::ShadingSystem *> OSLShaderManager::ss_shared; int OSLShaderManager::ss_shared_users = 0; thread_mutex OSLShaderManager::ss_shared_mutex; thread_mutex OSLShaderManager::ss_mutex; + int OSLCompiler::texture_shared_unique_id = 0; /* Shader Manager */ -OSLShaderManager::OSLShaderManager() +OSLShaderManager::OSLShaderManager(Device *device) : device_(device) { texture_system_init(); shading_system_init(); @@ -107,11 +108,12 @@ void OSLShaderManager::device_update_specific(Device *device, device_free(device, dscene, scene); - /* set texture system */ - scene->image_manager->set_osl_texture_system((void *)ts); + /* set texture system (only on CPU devices, since GPU devices cannot use OIIO) */ + if (device->info.type == DEVICE_CPU) { + scene->image_manager->set_osl_texture_system((void *)ts_shared); + } /* create shaders */ - OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory(); Shader *background_shader = scene->background->get_shader(scene); foreach (Shader *shader, scene->shaders) { @@ -125,22 +127,34 @@ void OSLShaderManager::device_update_specific(Device *device, * compile shaders alternating */ thread_scoped_lock lock(ss_mutex); - OSLCompiler compiler(this, services, ss, scene); - compiler.background = (shader == background_shader); - compiler.compile(og, shader); + device->foreach_device( + [this, scene, shader, background = (shader == background_shader)](Device *sub_device) { + OSLGlobals *og = (OSLGlobals 
*)sub_device->get_cpu_osl_memory(); + OSL::ShadingSystem *ss = ss_shared[sub_device->info.type]; + + OSLCompiler compiler(this, ss, scene); + compiler.background = background; + compiler.compile(og, shader); + }); if (shader->get_use_mis() && shader->has_surface_emission) scene->light_manager->tag_update(scene, LightManager::SHADER_COMPILED); } /* setup shader engine */ - og->ss = ss; - og->ts = ts; - og->services = services; - int background_id = scene->shader_manager->get_shader_id(background_shader); - og->background_state = og->surface_state[background_id & SHADER_MASK]; - og->use = true; + + device->foreach_device([background_id](Device *sub_device) { + OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory(); + OSL::ShadingSystem *ss = ss_shared[sub_device->info.type]; + + og->ss = ss; + og->ts = ts_shared; + og->services = static_cast<OSLRenderServices *>(ss->renderer()); + + og->background_state = og->surface_state[background_id & SHADER_MASK]; + og->use = true; + }); foreach (Shader *shader, scene->shaders) shader->clear_modified(); @@ -148,8 +162,12 @@ void OSLShaderManager::device_update_specific(Device *device, update_flags = UPDATE_NONE; /* add special builtin texture types */ - services->textures.insert(ustring("@ao"), new OSLTextureHandle(OSLTextureHandle::AO)); - services->textures.insert(ustring("@bevel"), new OSLTextureHandle(OSLTextureHandle::BEVEL)); + for (const auto &[device_type, ss] : ss_shared) { + OSLRenderServices *services = static_cast<OSLRenderServices *>(ss->renderer()); + + services->textures.insert(ustring("@ao"), new OSLTextureHandle(OSLTextureHandle::AO)); + services->textures.insert(ustring("@bevel"), new OSLTextureHandle(OSLTextureHandle::BEVEL)); + } device_update_common(device, dscene, scene, progress); @@ -166,26 +184,35 @@ void OSLShaderManager::device_update_specific(Device *device, * is being freed after the Session is freed. 
*/ thread_scoped_lock lock(ss_shared_mutex); - ss->optimize_all_groups(); + for (const auto &[device_type, ss] : ss_shared) { + ss->optimize_all_groups(); + } + } + + /* load kernels */ + if (!device->load_osl_kernels()) { + progress.set_error(device->error_message()); } } void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *scene) { - OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory(); - device_free_common(device, dscene, scene); /* clear shader engine */ - og->use = false; - og->ss = NULL; - og->ts = NULL; - - og->surface_state.clear(); - og->volume_state.clear(); - og->displacement_state.clear(); - og->bump_state.clear(); - og->background_state.reset(); + device->foreach_device([](Device *sub_device) { + OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory(); + + og->use = false; + og->ss = NULL; + og->ts = NULL; + + og->surface_state.clear(); + og->volume_state.clear(); + og->displacement_state.clear(); + og->bump_state.clear(); + og->background_state.reset(); + }); } void OSLShaderManager::texture_system_init() @@ -193,7 +220,7 @@ void OSLShaderManager::texture_system_init() /* create texture system, shared between different renders to reduce memory usage */ thread_scoped_lock lock(ts_shared_mutex); - if (ts_shared_users == 0) { + if (ts_shared_users++ == 0) { ts_shared = TextureSystem::create(true); ts_shared->attribute("automip", 1); @@ -203,24 +230,18 @@ void OSLShaderManager::texture_system_init() /* effectively unlimited for now, until we support proper mipmap lookups */ ts_shared->attribute("max_memory_MB", 16384); } - - ts = ts_shared; - ts_shared_users++; } void OSLShaderManager::texture_system_free() { /* shared texture system decrease users and destroy if no longer used */ thread_scoped_lock lock(ts_shared_mutex); - ts_shared_users--; - if (ts_shared_users == 0) { + if (--ts_shared_users == 0) { ts_shared->invalidate_all(true); OSL::TextureSystem::destroy(ts_shared); ts_shared = NULL; } - - ts = NULL; } 
void OSLShaderManager::shading_system_init() @@ -228,101 +249,105 @@ void OSLShaderManager::shading_system_init() /* create shading system, shared between different renders to reduce memory usage */ thread_scoped_lock lock(ss_shared_mutex); - if (ss_shared_users == 0) { - /* Must use aligned new due to concurrent hash map. */ - services_shared = util_aligned_new<OSLRenderServices>(ts_shared); + device_->foreach_device([](Device *sub_device) { + const DeviceType device_type = sub_device->info.type; - string shader_path = path_get("shader"); + if (ss_shared_users++ == 0 || ss_shared.find(device_type) == ss_shared.end()) { + /* Must use aligned new due to concurrent hash map. */ + OSLRenderServices *services = util_aligned_new<OSLRenderServices>(ts_shared, device_type); + + string shader_path = path_get("shader"); # ifdef _WIN32 - /* Annoying thing, Cycles stores paths in UTF-8 codepage, so it can - * operate with file paths with any character. This requires to use wide - * char functions, but OSL uses old fashioned ANSI functions which means: - * - * - We have to convert our paths to ANSI before passing to OSL - * - OSL can't be used when there's a multi-byte character in the path - * to the shaders folder. - */ - shader_path = string_to_ansi(shader_path); + /* Annoying thing, Cycles stores paths in UTF-8 codepage, so it can + * operate with file paths with any character. This requires to use wide + * char functions, but OSL uses old fashioned ANSI functions which means: + * + * - We have to convert our paths to ANSI before passing to OSL + * - OSL can't be used when there's a multi-byte character in the path + * to the shaders folder. 
+ */ + shader_path = string_to_ansi(shader_path); # endif - ss_shared = new OSL::ShadingSystem(services_shared, ts_shared, &errhandler); - ss_shared->attribute("lockgeom", 1); - ss_shared->attribute("commonspace", "world"); - ss_shared->attribute("searchpath:shader", shader_path); - ss_shared->attribute("greedyjit", 1); - - VLOG_INFO << "Using shader search path: " << shader_path; - - /* our own ray types */ - static const char *raytypes[] = { - "camera", /* PATH_RAY_CAMERA */ - "reflection", /* PATH_RAY_REFLECT */ - "refraction", /* PATH_RAY_TRANSMIT */ - "diffuse", /* PATH_RAY_DIFFUSE */ - "glossy", /* PATH_RAY_GLOSSY */ - "singular", /* PATH_RAY_SINGULAR */ - "transparent", /* PATH_RAY_TRANSPARENT */ - "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */ - - "shadow", /* PATH_RAY_SHADOW_OPAQUE */ - "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */ - - "__unused__", /* PATH_RAY_NODE_UNALIGNED */ - "__unused__", /* PATH_RAY_MIS_SKIP */ - - "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */ - - /* Remaining irrelevant bits up to 32. 
*/ - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - "__unused__", - }; - - const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]); - ss_shared->attribute("raytypes", TypeDesc(TypeDesc::STRING, nraytypes), raytypes); - - OSLRenderServices::register_closures(ss_shared); - - loaded_shaders.clear(); - } + OSL::ShadingSystem *ss = new OSL::ShadingSystem(services, ts_shared, &errhandler); + ss->attribute("lockgeom", 1); + ss->attribute("commonspace", "world"); + ss->attribute("searchpath:shader", shader_path); + ss->attribute("greedyjit", 1); + + VLOG_INFO << "Using shader search path: " << shader_path; + + /* our own ray types */ + static const char *raytypes[] = { + "camera", /* PATH_RAY_CAMERA */ + "reflection", /* PATH_RAY_REFLECT */ + "refraction", /* PATH_RAY_TRANSMIT */ + "diffuse", /* PATH_RAY_DIFFUSE */ + "glossy", /* PATH_RAY_GLOSSY */ + "singular", /* PATH_RAY_SINGULAR */ + "transparent", /* PATH_RAY_TRANSPARENT */ + "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */ + + "shadow", /* PATH_RAY_SHADOW_OPAQUE */ + "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */ + + "__unused__", /* PATH_RAY_NODE_UNALIGNED */ + "__unused__", /* PATH_RAY_MIS_SKIP */ + + "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */ + + /* Remaining irrelevant bits up to 32. 
*/ + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", + }; + + const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]); + ss->attribute("raytypes", TypeDesc(TypeDesc::STRING, nraytypes), raytypes); + + OSLRenderServices::register_closures(ss); + + ss_shared[device_type] = ss; + } + }); - ss = ss_shared; - services = services_shared; - ss_shared_users++; + loaded_shaders.clear(); } void OSLShaderManager::shading_system_free() { /* shared shading system decrease users and destroy if no longer used */ thread_scoped_lock lock(ss_shared_mutex); - ss_shared_users--; - if (ss_shared_users == 0) { - delete ss_shared; - ss_shared = NULL; + device_->foreach_device([](Device * /*sub_device*/) { + if (--ss_shared_users == 0) { + for (const auto &[device_type, ss] : ss_shared) { + OSLRenderServices *services = static_cast<OSLRenderServices *>(ss->renderer()); - util_aligned_delete(services_shared); - services_shared = NULL; - } + delete ss; + + util_aligned_delete(services); + } - ss = NULL; - services = NULL; + ss_shared.clear(); + } + }); } bool OSLShaderManager::osl_compile(const string &inputfile, const string &outputfile) @@ -447,7 +472,9 @@ const char *OSLShaderManager::shader_load_filepath(string filepath) const char *OSLShaderManager::shader_load_bytecode(const string &hash, const string &bytecode) { - ss->LoadMemoryCompiledShader(hash.c_str(), bytecode.c_str()); + for (const auto &[device_type, ss] : ss_shared) { + ss->LoadMemoryCompiledShader(hash.c_str(), bytecode.c_str()); + } OSLShaderInfo info; @@ -599,11 +626,11 @@ OSLNode *OSLShaderManager::osl_node(ShaderGraph *graph, /* Graph Compiler */ -OSLCompiler::OSLCompiler(OSLShaderManager *manager, - OSLRenderServices *services, - OSL::ShadingSystem *ss, - Scene 
*scene) - : scene(scene), manager(manager), services(services), ss(ss) +OSLCompiler::OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *ss, Scene *scene) + : scene(scene), + manager(manager), + services(static_cast<OSLRenderServices *>(ss->renderer())), + ss(ss) { current_type = SHADER_TYPE_SURFACE; current_shader = NULL; @@ -614,6 +641,8 @@ string OSLCompiler::id(ShaderNode *node) { /* assign layer unique name based on pointer address + bump mode */ stringstream stream; + stream.imbue(std::locale("C")); /* Ensure that no grouping characters (e.g. commas with en_US + locale) are added to the pointer string */ stream << "node_" << node->type->name << "_" << node; return stream.str(); @@ -1105,7 +1134,12 @@ OSL::ShaderGroupRef OSLCompiler::compile_type(Shader *shader, ShaderGraph *graph { current_type = type; - OSL::ShaderGroupRef group = ss->ShaderGroupBegin(shader->name.c_str()); + /* Use name hash to identify shader group to avoid issues with non-alphanumeric characters */ + stringstream name; + name.imbue(std::locale("C")); + name << "shader_" << shader->name.hash(); + + OSL::ShaderGroupRef group = ss->ShaderGroupBegin(name.str()); ShaderNode *output = graph->output(); ShaderNodeSet dependencies; diff --git a/intern/cycles/scene/osl.h b/intern/cycles/scene/osl.h index 76c6bd96ce1..c0e82a9dc8d 100644 --- a/intern/cycles/scene/osl.h +++ b/intern/cycles/scene/osl.h @@ -54,7 +54,7 @@ struct OSLShaderInfo { class OSLShaderManager : public ShaderManager { public: - OSLShaderManager(); + OSLShaderManager(Device *device); ~OSLShaderManager(); static void free_memory(); @@ -92,25 +92,22 @@ class OSLShaderManager : public ShaderManager { const std::string &bytecode_hash = "", const std::string &bytecode = ""); - protected: + private: void texture_system_init(); void texture_system_free(); void shading_system_init(); void shading_system_free(); - OSL::ShadingSystem *ss; - OSL::TextureSystem *ts; - OSLRenderServices *services; - OSL::ErrorHandler errhandler; + Device 
*device_; map<string, OSLShaderInfo> loaded_shaders; static OSL::TextureSystem *ts_shared; static thread_mutex ts_shared_mutex; static int ts_shared_users; - static OSL::ShadingSystem *ss_shared; - static OSLRenderServices *services_shared; + static OSL::ErrorHandler errhandler; + static map<int, OSL::ShadingSystem *> ss_shared; static thread_mutex ss_shared_mutex; static thread_mutex ss_mutex; static int ss_shared_users; @@ -123,10 +120,7 @@ class OSLShaderManager : public ShaderManager { class OSLCompiler { public: #ifdef WITH_OSL - OSLCompiler(OSLShaderManager *manager, - OSLRenderServices *services, - OSL::ShadingSystem *shadingsys, - Scene *scene); + OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *shadingsys, Scene *scene); #endif void compile(OSLGlobals *og, Shader *shader); diff --git a/intern/cycles/scene/scene.cpp b/intern/cycles/scene/scene.cpp index 3a05bede7a3..d5be86e1db9 100644 --- a/intern/cycles/scene/scene.cpp +++ b/intern/cycles/scene/scene.cpp @@ -99,11 +99,8 @@ Scene::Scene(const SceneParams ¶ms_, Device *device) { memset((void *)&dscene.data, 0, sizeof(dscene.data)); - /* OSL only works on the CPU */ - if (device->info.has_osl) - shader_manager = ShaderManager::create(params.shadingsystem); - else - shader_manager = ShaderManager::create(SHADINGSYSTEM_SVM); + shader_manager = ShaderManager::create( + device->info.has_osl ? params.shadingsystem : SHADINGSYSTEM_SVM, device); light_manager = new LightManager(); geometry_manager = new GeometryManager(); diff --git a/intern/cycles/scene/shader.cpp b/intern/cycles/scene/shader.cpp index 56670c6e4e3..f176c19ec95 100644 --- a/intern/cycles/scene/shader.cpp +++ b/intern/cycles/scene/shader.cpp @@ -395,15 +395,16 @@ ShaderManager::~ShaderManager() { } -ShaderManager *ShaderManager::create(int shadingsystem) +ShaderManager *ShaderManager::create(int shadingsystem, Device *device) { ShaderManager *manager; (void)shadingsystem; /* Ignored when built without OSL. 
*/ + (void)device; #ifdef WITH_OSL if (shadingsystem == SHADINGSYSTEM_OSL) { - manager = new OSLShaderManager(); + manager = new OSLShaderManager(device); } else #endif @@ -722,6 +723,10 @@ uint ShaderManager::get_kernel_features(Scene *scene) } } + if (use_osl()) { + kernel_features |= KERNEL_FEATURE_OSL; + } + return kernel_features; } diff --git a/intern/cycles/scene/shader.h b/intern/cycles/scene/shader.h index 2670776aca4..69b22d2ad19 100644 --- a/intern/cycles/scene/shader.h +++ b/intern/cycles/scene/shader.h @@ -170,7 +170,7 @@ class ShaderManager { UPDATE_NONE = 0u, }; - static ShaderManager *create(int shadingsystem); + static ShaderManager *create(int shadingsystem, Device *device); virtual ~ShaderManager(); virtual void reset(Scene *scene) = 0; diff --git a/intern/cycles/scene/shader_nodes.h b/intern/cycles/scene/shader_nodes.h index cc3a71a0697..a3a931bb0b3 100644 --- a/intern/cycles/scene/shader_nodes.h +++ b/intern/cycles/scene/shader_nodes.h @@ -1542,6 +1542,10 @@ class OSLNode final : public ShaderNode { { return true; } + virtual int get_feature() + { + return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_RAYTRACE; + } virtual bool equals(const ShaderNode & /*other*/) { diff --git a/intern/cycles/session/merge.h b/intern/cycles/session/merge.h index 702ca5c3eb8..d8a4f04a27b 100644 --- a/intern/cycles/session/merge.h +++ b/intern/cycles/session/merge.h @@ -9,7 +9,7 @@ CCL_NAMESPACE_BEGIN -/* Merge OpenEXR multilayer renders. */ +/* Merge OpenEXR multi-layer renders. */ class ImageMerger { public: diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt index c3ae81ed1db..34e5a4770ea 100644 --- a/intern/cycles/test/CMakeLists.txt +++ b/intern/cycles/test/CMakeLists.txt @@ -45,17 +45,24 @@ set(SRC # Disable AVX tests on macOS. Rosetta has problems running them, and other # platforms should be enough to verify AVX operations are implemented correctly. 
if(NOT APPLE) + if(CXX_HAS_SSE) + list(APPEND SRC + util_float8_sse2_test.cpp + ) + set_source_files_properties(util_float8_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + endif() + if(CXX_HAS_AVX) list(APPEND SRC - util_avxf_avx_test.cpp + util_float8_avx_test.cpp ) - set_source_files_properties(util_avxf_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(util_float8_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) list(APPEND SRC - util_avxf_avx2_test.cpp + util_float8_avx2_test.cpp ) - set_source_files_properties(util_avxf_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(util_float8_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() endif() diff --git a/intern/cycles/test/util_avxf_test.h b/intern/cycles/test/util_avxf_test.h deleted file mode 100644 index 34d966cc1a4..00000000000 --- a/intern/cycles/test/util_avxf_test.h +++ /dev/null @@ -1,211 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2011-2022 Blender Foundation */ - -#include "testing/testing.h" -#include "util/system.h" -#include "util/types.h" - -CCL_NAMESPACE_BEGIN - -static bool validate_cpu_capabilities() -{ - -#ifdef __KERNEL_AVX2__ - return system_cpu_support_avx2(); -#else -# ifdef __KERNEL_AVX__ - return system_cpu_support_avx(); -# endif -#endif -} - -#define INIT_AVX_TEST \ - if (!validate_cpu_capabilities()) \ - return; \ -\ - const avxf avxf_a(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f); \ - const avxf avxf_b(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); \ - const avxf avxf_c(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f); - -#define compare_vector_scalar(a, b) \ - for (size_t index = 0; index < a.size; index++) \ - EXPECT_FLOAT_EQ(a[index], b); - -#define compare_vector_vector(a, b) \ - for (size_t index = 0; index < a.size; index++) \ - EXPECT_FLOAT_EQ(a[index], b[index]); - 
-#define compare_vector_vector_near(a, b, abserror) \ - for (size_t index = 0; index < a.size; index++) \ - EXPECT_NEAR(a[index], b[index], abserror); - -#define basic_test_vv(a, b, op) \ - INIT_AVX_TEST \ - avxf c = a op b; \ - for (size_t i = 0; i < a.size; i++) \ - EXPECT_FLOAT_EQ(c[i], a[i] op b[i]); - -/* vector op float tests */ -#define basic_test_vf(a, b, op) \ - INIT_AVX_TEST \ - avxf c = a op b; \ - for (size_t i = 0; i < a.size; i++) \ - EXPECT_FLOAT_EQ(c[i], a[i] op b); - -static const float float_b = 1.5f; - -TEST(TEST_CATEGORY_NAME, avxf_add_vv){basic_test_vv(avxf_a, avxf_b, +)} TEST(TEST_CATEGORY_NAME, - avxf_sub_vv){ - basic_test_vv(avxf_a, avxf_b, -)} TEST(TEST_CATEGORY_NAME, avxf_mul_vv){ - basic_test_vv(avxf_a, avxf_b, *)} TEST(TEST_CATEGORY_NAME, avxf_div_vv){ - basic_test_vv(avxf_a, avxf_b, /)} TEST(TEST_CATEGORY_NAME, avxf_add_vf){ - basic_test_vf(avxf_a, float_b, +)} TEST(TEST_CATEGORY_NAME, avxf_sub_vf){ - basic_test_vf(avxf_a, float_b, -)} TEST(TEST_CATEGORY_NAME, avxf_mul_vf){ - basic_test_vf(avxf_a, float_b, *)} TEST(TEST_CATEGORY_NAME, - avxf_div_vf){basic_test_vf(avxf_a, float_b, /)} - -TEST(TEST_CATEGORY_NAME, avxf_ctor) -{ - INIT_AVX_TEST - compare_vector_scalar(avxf(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f), - static_cast<float>(index)); - compare_vector_scalar(avxf(1.0f), 1.0f); - compare_vector_vector(avxf(1.0f, 2.0f), avxf(1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f)); - compare_vector_vector(avxf(1.0f, 2.0f, 3.0f, 4.0f), - avxf(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f)); - compare_vector_vector(avxf(make_float3(1.0f, 2.0f, 3.0f)), - avxf(0.0f, 3.0f, 2.0f, 1.0f, 0.0f, 3.0f, 2.0f, 1.0f)); -} - -TEST(TEST_CATEGORY_NAME, avxf_sqrt) -{ - INIT_AVX_TEST - compare_vector_vector(mm256_sqrt(avxf(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f)), - avxf(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f)); -} - -TEST(TEST_CATEGORY_NAME, avxf_min_max) -{ - INIT_AVX_TEST - compare_vector_vector(min(avxf_a, avxf_b), avxf_a); - 
compare_vector_vector(max(avxf_a, avxf_b), avxf_b); -} - -TEST(TEST_CATEGORY_NAME, avxf_set_sign) -{ - INIT_AVX_TEST - avxf res = set_sign_bit<1, 0, 0, 0, 0, 0, 0, 0>(avxf_a); - compare_vector_vector(res, avxf(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, -0.8f)); -} - -TEST(TEST_CATEGORY_NAME, avxf_msub) -{ - INIT_AVX_TEST - avxf res = msub(avxf_a, avxf_b, avxf_c); - avxf exp = avxf((avxf_a[7] * avxf_b[7]) - avxf_c[7], - (avxf_a[6] * avxf_b[6]) - avxf_c[6], - (avxf_a[5] * avxf_b[5]) - avxf_c[5], - (avxf_a[4] * avxf_b[4]) - avxf_c[4], - (avxf_a[3] * avxf_b[3]) - avxf_c[3], - (avxf_a[2] * avxf_b[2]) - avxf_c[2], - (avxf_a[1] * avxf_b[1]) - avxf_c[1], - (avxf_a[0] * avxf_b[0]) - avxf_c[0]); - compare_vector_vector(res, exp); -} - -TEST(TEST_CATEGORY_NAME, avxf_madd) -{ - INIT_AVX_TEST - avxf res = madd(avxf_a, avxf_b, avxf_c); - avxf exp = avxf((avxf_a[7] * avxf_b[7]) + avxf_c[7], - (avxf_a[6] * avxf_b[6]) + avxf_c[6], - (avxf_a[5] * avxf_b[5]) + avxf_c[5], - (avxf_a[4] * avxf_b[4]) + avxf_c[4], - (avxf_a[3] * avxf_b[3]) + avxf_c[3], - (avxf_a[2] * avxf_b[2]) + avxf_c[2], - (avxf_a[1] * avxf_b[1]) + avxf_c[1], - (avxf_a[0] * avxf_b[0]) + avxf_c[0]); - compare_vector_vector(res, exp); -} - -TEST(TEST_CATEGORY_NAME, avxf_nmadd) -{ - INIT_AVX_TEST - avxf res = nmadd(avxf_a, avxf_b, avxf_c); - avxf exp = avxf(avxf_c[7] - (avxf_a[7] * avxf_b[7]), - avxf_c[6] - (avxf_a[6] * avxf_b[6]), - avxf_c[5] - (avxf_a[5] * avxf_b[5]), - avxf_c[4] - (avxf_a[4] * avxf_b[4]), - avxf_c[3] - (avxf_a[3] * avxf_b[3]), - avxf_c[2] - (avxf_a[2] * avxf_b[2]), - avxf_c[1] - (avxf_a[1] * avxf_b[1]), - avxf_c[0] - (avxf_a[0] * avxf_b[0])); - compare_vector_vector(res, exp); -} - -TEST(TEST_CATEGORY_NAME, avxf_compare) -{ - INIT_AVX_TEST - avxf a(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f); - avxf b(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f); - avxb res = a <= b; - int exp[8] = { - a[0] <= b[0] ? -1 : 0, - a[1] <= b[1] ? -1 : 0, - a[2] <= b[2] ? -1 : 0, - a[3] <= b[3] ? 
-1 : 0, - a[4] <= b[4] ? -1 : 0, - a[5] <= b[5] ? -1 : 0, - a[6] <= b[6] ? -1 : 0, - a[7] <= b[7] ? -1 : 0, - }; - compare_vector_vector(res, exp); -} - -TEST(TEST_CATEGORY_NAME, avxf_permute) -{ - INIT_AVX_TEST - avxf res = permute<3, 0, 1, 7, 6, 5, 2, 4>(avxf_b); - compare_vector_vector(res, avxf(4.0f, 6.0f, 3.0f, 2.0f, 1.0f, 7.0f, 8.0f, 5.0f)); -} - -TEST(TEST_CATEGORY_NAME, avxf_blend) -{ - INIT_AVX_TEST - avxf res = blend<0, 0, 1, 0, 1, 0, 1, 0>(avxf_a, avxf_b); - compare_vector_vector(res, avxf(0.1f, 0.2f, 3.0f, 0.4f, 5.0f, 0.6f, 7.0f, 0.8f)); -} - -TEST(TEST_CATEGORY_NAME, avxf_shuffle) -{ - INIT_AVX_TEST - avxf res = shuffle<0, 1, 2, 3, 1, 3, 2, 0>(avxf_a); - compare_vector_vector(res, avxf(0.4f, 0.2f, 0.1f, 0.3f, 0.5f, 0.6f, 0.7f, 0.8f)); -} - -TEST(TEST_CATEGORY_NAME, avxf_cross) -{ - INIT_AVX_TEST - avxf res = cross(avxf_b, avxf_c); - compare_vector_vector_near(res, - avxf(0.0f, - -9.5367432e-07f, - 0.0f, - 4.7683716e-07f, - 0.0f, - -3.8146973e-06f, - 3.8146973e-06f, - 3.8146973e-06f), - 0.000002000f); -} - -TEST(TEST_CATEGORY_NAME, avxf_dot3) -{ - INIT_AVX_TEST - float den, den2; - dot3(avxf_a, avxf_b, den, den2); - EXPECT_FLOAT_EQ(den, 14.9f); - EXPECT_FLOAT_EQ(den2, 2.9f); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/test/util_avxf_avx2_test.cpp b/intern/cycles/test/util_float8_avx2_test.cpp index 992c4d9a913..4682dce5b23 100644 --- a/intern/cycles/test/util_avxf_avx2_test.cpp +++ b/intern/cycles/test/util_float8_avx2_test.cpp @@ -1,11 +1,13 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ +#define __KERNEL_SSE__ +#define __KERNEL_AVX__ #define __KERNEL_AVX2__ #define TEST_CATEGORY_NAME util_avx2 #if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \ defined(__AVX2__) -# include "util_avxf_test.h" +# include "util_float8_test.h" #endif diff --git a/intern/cycles/test/util_avxf_avx_test.cpp b/intern/cycles/test/util_float8_avx_test.cpp index abb98cdfb38..34fe750e766 100644 --- 
a/intern/cycles/test/util_avxf_avx_test.cpp +++ b/intern/cycles/test/util_float8_avx_test.cpp @@ -1,11 +1,12 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ +#define __KERNEL_SSE__ #define __KERNEL_AVX__ #define TEST_CATEGORY_NAME util_avx #if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \ defined(__AVX__) -# include "util_avxf_test.h" +# include "util_float8_test.h" #endif diff --git a/intern/cycles/test/util_float8_sse2_test.cpp b/intern/cycles/test/util_float8_sse2_test.cpp new file mode 100644 index 00000000000..ba8952a2b08 --- /dev/null +++ b/intern/cycles/test/util_float8_sse2_test.cpp @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#define __KERNEL_SSE__ +#define __KERNEL_SSE2__ + +#define TEST_CATEGORY_NAME util_sse2 + +#if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \ + defined(__SSE2__) +# include "util_float8_test.h" +#endif diff --git a/intern/cycles/test/util_float8_test.h b/intern/cycles/test/util_float8_test.h new file mode 100644 index 00000000000..54701afaf8b --- /dev/null +++ b/intern/cycles/test/util_float8_test.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#include "testing/testing.h" +#include "util/math.h" +#include "util/system.h" +#include "util/types.h" + +CCL_NAMESPACE_BEGIN + +static bool validate_cpu_capabilities() +{ + +#if defined(__KERNEL_AVX2__) + return system_cpu_support_avx2(); +#elif defined(__KERNEL_AVX__) + return system_cpu_support_avx(); +#elif defined(__KERNEL_SSE2__) + return system_cpu_support_sse2(); +#else + return false; +#endif +} + +#define INIT_FLOAT8_TEST \ + if (!validate_cpu_capabilities()) \ + return; \ +\ + const vfloat8 float8_a = make_vfloat8(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f); \ + const vfloat8 float8_b = make_vfloat8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); \ 
+ const vfloat8 float8_c = make_vfloat8(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f); + +#define compare_vector_scalar(a, b) \ + for (size_t index = 0; index < 8; index++) \ + EXPECT_FLOAT_EQ(a[index], b); + +#define compare_vector_vector(a, b) \ + for (size_t index = 0; index < 8; index++) \ + EXPECT_FLOAT_EQ(a[index], b[index]); + +#define compare_vector_vector_near(a, b, abserror) \ + for (size_t index = 0; index < 8; index++) \ + EXPECT_NEAR(a[index], b[index], abserror); + +#define basic_test_vv(a, b, op) \ + INIT_FLOAT8_TEST \ + vfloat8 c = a op b; \ + for (size_t i = 0; i < 8; i++) \ + EXPECT_FLOAT_EQ(c[i], a[i] op b[i]); + +/* vector op float tests */ +#define basic_test_vf(a, b, op) \ + INIT_FLOAT8_TEST \ + vfloat8 c = a op b; \ + for (size_t i = 0; i < 8; i++) \ + EXPECT_FLOAT_EQ(c[i], a[i] op b); + +static const float float_b = 1.5f; + +TEST(TEST_CATEGORY_NAME, + float8_add_vv){basic_test_vv(float8_a, float8_b, +)} TEST(TEST_CATEGORY_NAME, float8_sub_vv){ + basic_test_vv(float8_a, float8_b, -)} TEST(TEST_CATEGORY_NAME, float8_mul_vv){ + basic_test_vv(float8_a, float8_b, *)} TEST(TEST_CATEGORY_NAME, float8_div_vv){ + basic_test_vv(float8_a, float8_b, /)} TEST(TEST_CATEGORY_NAME, float8_add_vf){ + basic_test_vf(float8_a, float_b, +)} TEST(TEST_CATEGORY_NAME, float8_sub_vf){ + basic_test_vf(float8_a, float_b, -)} TEST(TEST_CATEGORY_NAME, float8_mul_vf){ + basic_test_vf(float8_a, float_b, *)} TEST(TEST_CATEGORY_NAME, + float8_div_vf){basic_test_vf(float8_a, float_b, /)} + +TEST(TEST_CATEGORY_NAME, float8_ctor) +{ + INIT_FLOAT8_TEST + compare_vector_scalar(make_vfloat8(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f), + static_cast<float>(index)); + compare_vector_scalar(make_vfloat8(1.0f), 1.0f); +} + +TEST(TEST_CATEGORY_NAME, float8_sqrt) +{ + INIT_FLOAT8_TEST + compare_vector_vector(sqrt(make_vfloat8(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f)), + make_vfloat8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f)); +} + +TEST(TEST_CATEGORY_NAME, 
float8_min_max) +{ + INIT_FLOAT8_TEST + compare_vector_vector(min(float8_a, float8_b), float8_a); + compare_vector_vector(max(float8_a, float8_b), float8_b); +} + +TEST(TEST_CATEGORY_NAME, float8_shuffle) +{ + INIT_FLOAT8_TEST + vfloat8 res0 = shuffle<0, 1, 2, 3, 1, 3, 2, 0>(float8_a); + compare_vector_vector(res0, make_vfloat8(0.1f, 0.2f, 0.3f, 0.4f, 0.6f, 0.8f, 0.7f, 0.5f)); + vfloat8 res1 = shuffle<3>(float8_a); + compare_vector_vector(res1, make_vfloat8(0.4f, 0.4f, 0.4f, 0.4f, 0.8f, 0.8f, 0.8f, 0.8f)); + vfloat8 res2 = shuffle<3, 2, 1, 0>(float8_a, float8_b); + compare_vector_vector(res2, make_vfloat8(0.4f, 0.3f, 2.0f, 1.0f, 0.8f, 0.7f, 6.0f, 5.0f)); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 57628f99e35..7f8f4a5ce76 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -69,6 +69,7 @@ set(SRC_HEADERS math_int2.h math_int3.h math_int4.h + math_int8.h math_matrix.h md5.h murmurhash.h @@ -85,13 +86,7 @@ set(SRC_HEADERS rect.h set.h simd.h - avxf.h - avxb.h - avxi.h semaphore.h - sseb.h - ssef.h - ssei.h stack_allocator.h static_assert.h stats.h @@ -118,6 +113,8 @@ set(SRC_HEADERS types_int3_impl.h types_int4.h types_int4_impl.h + types_int8.h + types_int8_impl.h types_spectrum.h types_uchar2.h types_uchar2_impl.h diff --git a/intern/cycles/util/avxb.h b/intern/cycles/util/avxb.h deleted file mode 100644 index fa3cb565309..00000000000 --- a/intern/cycles/util/avxb.h +++ /dev/null @@ -1,230 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2011-2013 Intel Corporation - * Modifications Copyright 2014-2022 Blender Foundation. */ - -#ifndef __UTIL_AVXB_H__ -#define __UTIL_AVXB_H__ - -CCL_NAMESPACE_BEGIN - -struct avxf; - -/*! 4-wide SSE bool type. 
*/ -struct avxb { - typedef avxb Mask; // mask type - typedef avxf Float; // float type - - enum { size = 8 }; // number of SIMD elements - union { - __m256 m256; - int32_t v[8]; - }; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline avxb() - { - } - __forceinline avxb(const avxb &other) - { - m256 = other.m256; - } - __forceinline avxb &operator=(const avxb &other) - { - m256 = other.m256; - return *this; - } - - __forceinline avxb(const __m256 input) : m256(input) - { - } - __forceinline avxb(const __m128 &a, const __m128 &b) - : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1)) - { - } - __forceinline operator const __m256 &(void) const - { - return m256; - } - __forceinline operator const __m256i(void) const - { - return _mm256_castps_si256(m256); - } - __forceinline operator const __m256d(void) const - { - return _mm256_castps_pd(m256); - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline avxb(FalseTy) : m256(_mm256_setzero_ps()) - { - } - __forceinline avxb(TrueTy) : m256(_mm256_castsi256_ps(_mm256_set1_epi32(-1))) - { - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator[](const size_t i) const - { - assert(i < 8); - return (_mm256_movemask_ps(m256) >> i) & 1; - } - __forceinline int32_t &operator[](const size_t i) - { - assert(i < 8); - return v[i]; - } -}; - -//////////////////////////////////////////////////////////////////////////////// -/// Unary Operators -//////////////////////////////////////////////////////////////////////////////// - 
-__forceinline const avxb operator!(const avxb &a) -{ - return _mm256_xor_ps(a, avxb(True)); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Binary Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const avxb operator&(const avxb &a, const avxb &b) -{ - return _mm256_and_ps(a, b); -} -__forceinline const avxb operator|(const avxb &a, const avxb &b) -{ - return _mm256_or_ps(a, b); -} -__forceinline const avxb operator^(const avxb &a, const avxb &b) -{ - return _mm256_xor_ps(a, b); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Assignment Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const avxb operator&=(avxb &a, const avxb &b) -{ - return a = a & b; -} -__forceinline const avxb operator|=(avxb &a, const avxb &b) -{ - return a = a | b; -} -__forceinline const avxb operator^=(avxb &a, const avxb &b) -{ - return a = a ^ b; -} - -//////////////////////////////////////////////////////////////////////////////// -/// Comparison Operators + Select -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const avxb operator!=(const avxb &a, const avxb &b) -{ - return _mm256_xor_ps(a, b); -} -__forceinline const avxb operator==(const avxb &a, const avxb &b) -{ -#ifdef __KERNEL_AVX2__ - return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); -#else - __m128i a_lo = _mm_castps_si128(_mm256_extractf128_ps(a, 0)); - __m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1)); - __m128i b_lo = _mm_castps_si128(_mm256_extractf128_ps(b, 0)); - __m128i b_hi = _mm_castps_si128(_mm256_extractf128_ps(b, 1)); - __m128i c_lo = _mm_cmpeq_epi32(a_lo, b_lo); - __m128i c_hi = _mm_cmpeq_epi32(a_hi, b_hi); - __m256i result = _mm256_insertf128_si256(_mm256_castsi128_si256(c_lo), c_hi, 1); - return _mm256_castsi256_ps(result); -#endif -} - 
-__forceinline const avxb select(const avxb &m, const avxb &t, const avxb &f) -{ -#if defined(__KERNEL_SSE41__) - return _mm256_blendv_ps(f, t, m); -#else - return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f)); -#endif -} - -//////////////////////////////////////////////////////////////////////////////// -/// Movement/Shifting/Shuffling Functions -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const avxb unpacklo(const avxb &a, const avxb &b) -{ - return _mm256_unpacklo_ps(a, b); -} -__forceinline const avxb unpackhi(const avxb &a, const avxb &b) -{ - return _mm256_unpackhi_ps(a, b); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Reduction Operations -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__KERNEL_SSE41__) -__forceinline uint32_t popcnt(const avxb &a) -{ - return _mm_popcnt_u32(_mm256_movemask_ps(a)); -} -#else -__forceinline uint32_t popcnt(const avxb &a) -{ - return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) + - bool(a[7]); -} -#endif - -__forceinline bool reduce_and(const avxb &a) -{ - return _mm256_movemask_ps(a) == 0xf; -} -__forceinline bool reduce_or(const avxb &a) -{ - return _mm256_movemask_ps(a) != 0x0; -} -__forceinline bool all(const avxb &b) -{ - return _mm256_movemask_ps(b) == 0xf; -} -__forceinline bool any(const avxb &b) -{ - return _mm256_movemask_ps(b) != 0x0; -} -__forceinline bool none(const avxb &b) -{ - return _mm256_movemask_ps(b) == 0x0; -} - -__forceinline uint32_t movemask(const avxb &a) -{ - return _mm256_movemask_ps(a); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Debug Functions -//////////////////////////////////////////////////////////////////////////////// - -ccl_device_inline void print_avxb(const char *label, const avxb &a) -{ - printf("%s: %d %d %d %d %d %d %d %d\n", label, 
a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]); -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/util/avxf.h b/intern/cycles/util/avxf.h deleted file mode 100644 index 03a13f30490..00000000000 --- a/intern/cycles/util/avxf.h +++ /dev/null @@ -1,379 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2016 Intel Corporation */ - -#ifndef __UTIL_AVXF_H__ -#define __UTIL_AVXF_H__ - -CCL_NAMESPACE_BEGIN - -struct avxb; - -struct avxf { - typedef avxf Float; - - enum { size = 8 }; /* Number of SIMD elements. */ - - union { - __m256 m256; - float f[8]; - int i[8]; - }; - - __forceinline avxf() - { - } - __forceinline avxf(const avxf &other) - { - m256 = other.m256; - } - __forceinline avxf &operator=(const avxf &other) - { - m256 = other.m256; - return *this; - } - - __forceinline avxf(const __m256 a) : m256(a) - { - } - __forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps(a)) - { - } - - __forceinline operator const __m256 &() const - { - return m256; - } - __forceinline operator __m256 &() - { - return m256; - } - - __forceinline avxf(float a) : m256(_mm256_set1_ps(a)) - { - } - - __forceinline avxf(float high32x4, float low32x4) - : m256(_mm256_set_ps( - high32x4, high32x4, high32x4, high32x4, low32x4, low32x4, low32x4, low32x4)) - { - } - - __forceinline avxf(float a3, float a2, float a1, float a0) - : m256(_mm256_set_ps(a3, a2, a1, a0, a3, a2, a1, a0)) - { - } - - __forceinline avxf( - float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) - : m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0)) - { - } - - __forceinline avxf(float3 a) : m256(_mm256_set_ps(a.w, a.z, a.y, a.x, a.w, a.z, a.y, a.x)) - { - } - - __forceinline avxf(int a3, int a2, int a1, int a0) - { - const __m256i foo = _mm256_set_epi32(a3, a2, a1, a0, a3, a2, a1, a0); - m256 = _mm256_castsi256_ps(foo); - } - - __forceinline avxf(int a7, int a6, int a5, int a4, int a3, int a2, int a1, int a0) - { - const __m256i foo = _mm256_set_epi32(a7, a6, a5, 
a4, a3, a2, a1, a0); - m256 = _mm256_castsi256_ps(foo); - } - - __forceinline avxf(__m128 a, __m128 b) - { - const __m256 foo = _mm256_castps128_ps256(a); - m256 = _mm256_insertf128_ps(foo, b, 1); - } - - __forceinline const float &operator[](const size_t i) const - { - assert(i < 8); - return f[i]; - } - __forceinline float &operator[](const size_t i) - { - assert(i < 8); - return f[i]; - } -}; - -__forceinline avxf cross(const avxf &a, const avxf &b) -{ - avxf r(0.0, - a[4] * b[5] - a[5] * b[4], - a[6] * b[4] - a[4] * b[6], - a[5] * b[6] - a[6] * b[5], - 0.0, - a[0] * b[1] - a[1] * b[0], - a[2] * b[0] - a[0] * b[2], - a[1] * b[2] - a[2] * b[1]); - return r; -} - -__forceinline void dot3(const avxf &a, const avxf &b, float &den, float &den2) -{ - const avxf t = _mm256_mul_ps(a.m256, b.m256); - den = ((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2]; - den2 = ((float *)&t)[4] + ((float *)&t)[5] + ((float *)&t)[6]; -} - -//////////////////////////////////////////////////////////////////////////////// -/// Unary Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const avxf cast(const __m256i &a) -{ - return _mm256_castsi256_ps(a); -} - -__forceinline const avxf mm256_sqrt(const avxf &a) -{ - return _mm256_sqrt_ps(a.m256); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Binary Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const avxf operator+(const avxf &a, const avxf &b) -{ - return _mm256_add_ps(a.m256, b.m256); -} -__forceinline const avxf operator+(const avxf &a, const float &b) -{ - return a + avxf(b); -} -__forceinline const avxf operator+(const float &a, const avxf &b) -{ - return avxf(a) + b; -} - -__forceinline const avxf operator-(const avxf &a, const avxf &b) -{ - return _mm256_sub_ps(a.m256, b.m256); -} -__forceinline const avxf operator-(const avxf &a, const float &b) -{ - return a - 
avxf(b); -} -__forceinline const avxf operator-(const float &a, const avxf &b) -{ - return avxf(a) - b; -} - -__forceinline const avxf operator*(const avxf &a, const avxf &b) -{ - return _mm256_mul_ps(a.m256, b.m256); -} -__forceinline const avxf operator*(const avxf &a, const float &b) -{ - return a * avxf(b); -} -__forceinline const avxf operator*(const float &a, const avxf &b) -{ - return avxf(a) * b; -} - -__forceinline const avxf operator/(const avxf &a, const avxf &b) -{ - return _mm256_div_ps(a.m256, b.m256); -} -__forceinline const avxf operator/(const avxf &a, const float &b) -{ - return a / avxf(b); -} -__forceinline const avxf operator/(const float &a, const avxf &b) -{ - return avxf(a) / b; -} - -__forceinline const avxf operator|(const avxf &a, const avxf &b) -{ - return _mm256_or_ps(a.m256, b.m256); -} - -__forceinline const avxf operator^(const avxf &a, const avxf &b) -{ - return _mm256_xor_ps(a.m256, b.m256); -} - -__forceinline const avxf operator&(const avxf &a, const avxf &b) -{ - return _mm256_and_ps(a.m256, b.m256); -} - -__forceinline const avxf max(const avxf &a, const avxf &b) -{ - return _mm256_max_ps(a.m256, b.m256); -} -__forceinline const avxf min(const avxf &a, const avxf &b) -{ - return _mm256_min_ps(a.m256, b.m256); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Movement/Shifting/Shuffling Functions -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const avxf shuffle(const avxf &a, const __m256i &shuf) -{ - return _mm256_permutevar_ps(a, shuf); -} - -template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> -__forceinline const avxf shuffle(const avxf &a) -{ - return _mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0)); -} - -template<size_t i0, size_t i1, size_t i2, size_t i3> -__forceinline const avxf shuffle(const avxf &a, const avxf &b) -{ - return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); 
-} -template<size_t i0, size_t i1, size_t i2, size_t i3> -__forceinline const avxf shuffle(const avxf &a) -{ - return shuffle<i0, i1, i2, i3>(a, a); -} -template<size_t i0> __forceinline const avxf shuffle(const avxf &a, const avxf &b) -{ - return shuffle<i0, i0, i0, i0>(a, b); -} -template<size_t i0> __forceinline const avxf shuffle(const avxf &a) -{ - return shuffle<i0>(a, a); -} - -template<size_t i> __forceinline float extract(const avxf &a) -{ - __m256 b = shuffle<i, i, i, i>(a).m256; - return _mm256_cvtss_f32(b); -} -template<> __forceinline float extract<0>(const avxf &a) -{ - return _mm256_cvtss_f32(a.m256); -} - -__forceinline ssef low(const avxf &a) -{ - return _mm256_extractf128_ps(a.m256, 0); -} -__forceinline ssef high(const avxf &a) -{ - return _mm256_extractf128_ps(a.m256, 1); -} - -template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> -__forceinline const avxf permute(const avxf &a) -{ -#ifdef __KERNEL_AVX2__ - return _mm256_permutevar8x32_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0)); -#else - float temp[8]; - _mm256_storeu_ps((float *)&temp, a); - return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]); -#endif -} - -template<int S0, int S1, int S2, int S3, int S4, int S5, int S6, int S7> -ccl_device_inline const avxf set_sign_bit(const avxf &a) -{ - return a ^ avxf(S7 << 31, S6 << 31, S5 << 31, S4 << 31, S3 << 31, S2 << 31, S1 << 31, S0 << 31); -} - -template<size_t S0, size_t S1, size_t S2, size_t S3, size_t S4, size_t S5, size_t S6, size_t S7> -ccl_device_inline const avxf blend(const avxf &a, const avxf &b) -{ - return _mm256_blend_ps( - a, b, S7 << 0 | S6 << 1 | S5 << 2 | S4 << 3 | S3 << 4 | S2 << 5 | S1 << 6 | S0 << 7); -} - -template<size_t S0, size_t S1, size_t S2, size_t S3> -ccl_device_inline const avxf blend(const avxf &a, const avxf &b) -{ - return blend<S0, S1, S2, S3, S0, S1, S2, S3>(a, b); -} - -//#if defined(__KERNEL_SSE41__) -__forceinline avxf maxi(const avxf &a, 
const avxf &b) -{ - const avxf ci = _mm256_max_ps(a, b); - return ci; -} - -__forceinline avxf mini(const avxf &a, const avxf &b) -{ - const avxf ci = _mm256_min_ps(a, b); - return ci; -} -//#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Ternary Operators -//////////////////////////////////////////////////////////////////////////////// -__forceinline const avxf madd(const avxf &a, const avxf &b, const avxf &c) -{ -#ifdef __KERNEL_AVX2__ - return _mm256_fmadd_ps(a, b, c); -#else - return c + (a * b); -#endif -} - -__forceinline const avxf nmadd(const avxf &a, const avxf &b, const avxf &c) -{ -#ifdef __KERNEL_AVX2__ - return _mm256_fnmadd_ps(a, b, c); -#else - return c - (a * b); -#endif -} -__forceinline const avxf msub(const avxf &a, const avxf &b, const avxf &c) -{ -#ifdef __KERNEL_AVX2__ - return _mm256_fmsub_ps(a, b, c); -#else - return (a * b) - c; -#endif -} - -//////////////////////////////////////////////////////////////////////////////// -/// Comparison Operators + Select -//////////////////////////////////////////////////////////////////////////////// -__forceinline const avxb operator<=(const avxf &a, const avxf &b) -{ - return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS); -} - -__forceinline const avxf select(const avxb &m, const avxf &t, const avxf &f) -{ - return _mm256_blendv_ps(f, t, m); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Common Functions -//////////////////////////////////////////////////////////////////////////////// - -__forceinline avxf mix(const avxf &a, const avxf &b, const avxf &t) -{ - return madd(t, b, (avxf(1.0f) - t) * a); -} - -#ifndef _mm256_set_m128 -# define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \ - _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1) -#endif - -#define _mm256_loadu2_m128(/* float const* */ hiaddr, /* float const* */ loaddr) \ - _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr)) - 
-CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/util/avxi.h b/intern/cycles/util/avxi.h deleted file mode 100644 index 966a04a6b97..00000000000 --- a/intern/cycles/util/avxi.h +++ /dev/null @@ -1,732 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2009-2013 Intel Corporation */ - -#ifndef __UTIL_AVXI_H__ -#define __UTIL_AVXI_H__ - -CCL_NAMESPACE_BEGIN - -struct avxb; - -struct avxi { - typedef avxb Mask; // mask type for us - enum { size = 8 }; // number of SIMD elements - union { // data - __m256i m256; -#if !defined(__KERNEL_AVX2__) - struct { - __m128i l, h; - }; -#endif - int32_t v[8]; - }; - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline avxi() - { - } - __forceinline avxi(const avxi &a) - { - m256 = a.m256; - } - __forceinline avxi &operator=(const avxi &a) - { - m256 = a.m256; - return *this; - } - - __forceinline avxi(const __m256i a) : m256(a) - { - } - __forceinline operator const __m256i &(void) const - { - return m256; - } - __forceinline operator __m256i &(void) - { - return m256; - } - - __forceinline explicit avxi(const ssei &a) - : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), a, 1)) - { - } - __forceinline avxi(const ssei &a, const ssei &b) - : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1)) - { - } -#if defined(__KERNEL_AVX2__) - __forceinline avxi(const __m128i &a, const __m128i &b) - : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1)) - { - } -#else - __forceinline avxi(const __m128i &a, const __m128i &b) : l(a), h(b) - { - } -#endif - __forceinline explicit avxi(const int32_t *const a) - : m256(_mm256_castps_si256(_mm256_loadu_ps((const float *)a))) - { - } - __forceinline avxi(int32_t a) : m256(_mm256_set1_epi32(a)) - { - } - __forceinline avxi(int32_t a, int32_t b) : m256(_mm256_set_epi32(b, a, b, a, 
b, a, b, a)) - { - } - __forceinline avxi(int32_t a, int32_t b, int32_t c, int32_t d) - : m256(_mm256_set_epi32(d, c, b, a, d, c, b, a)) - { - } - __forceinline avxi( - int32_t a, int32_t b, int32_t c, int32_t d, int32_t e, int32_t f, int32_t g, int32_t h) - : m256(_mm256_set_epi32(h, g, f, e, d, c, b, a)) - { - } - - __forceinline explicit avxi(const __m256 a) : m256(_mm256_cvtps_epi32(a)) - { - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - //////////////////////////////////////////////////////////////////////////////// - - __forceinline avxi(ZeroTy) : m256(_mm256_setzero_si256()) - { - } -#if defined(__KERNEL_AVX2__) - __forceinline avxi(OneTy) : m256(_mm256_set1_epi32(1)) - { - } - __forceinline avxi(PosInfTy) : m256(_mm256_set1_epi32(pos_inf)) - { - } - __forceinline avxi(NegInfTy) : m256(_mm256_set1_epi32(neg_inf)) - { - } -#else - __forceinline avxi(OneTy) : m256(_mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1)) - { - } - __forceinline avxi(PosInfTy) - : m256(_mm256_set_epi32( - pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf)) - { - } - __forceinline avxi(NegInfTy) - : m256(_mm256_set_epi32( - neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf)) - { - } -#endif - __forceinline avxi(StepTy) : m256(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) - { - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const int32_t &operator[](const size_t i) const - { - assert(i < 8); - return v[i]; - } - __forceinline int32_t &operator[](const size_t i) - { - assert(i < 8); - return v[i]; - } -}; - -//////////////////////////////////////////////////////////////////////////////// -/// Unary Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const avxi cast(const __m256 &a) -{ - 
return _mm256_castps_si256(a); -} -__forceinline const avxi operator+(const avxi &a) -{ - return a; -} -#if defined(__KERNEL_AVX2__) -__forceinline const avxi operator-(const avxi &a) -{ - return _mm256_sub_epi32(_mm256_setzero_si256(), a.m256); -} -__forceinline const avxi abs(const avxi &a) -{ - return _mm256_abs_epi32(a.m256); -} -#else -__forceinline const avxi operator-(const avxi &a) -{ - return avxi(_mm_sub_epi32(_mm_setzero_si128(), a.l), _mm_sub_epi32(_mm_setzero_si128(), a.h)); -} -__forceinline const avxi abs(const avxi &a) -{ - return avxi(_mm_abs_epi32(a.l), _mm_abs_epi32(a.h)); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -/// Binary Operators -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__KERNEL_AVX2__) -__forceinline const avxi operator+(const avxi &a, const avxi &b) -{ - return _mm256_add_epi32(a.m256, b.m256); -} -#else -__forceinline const avxi operator+(const avxi &a, const avxi &b) -{ - return avxi(_mm_add_epi32(a.l, b.l), _mm_add_epi32(a.h, b.h)); -} -#endif -__forceinline const avxi operator+(const avxi &a, const int32_t b) -{ - return a + avxi(b); -} -__forceinline const avxi operator+(const int32_t a, const avxi &b) -{ - return avxi(a) + b; -} - -#if defined(__KERNEL_AVX2__) -__forceinline const avxi operator-(const avxi &a, const avxi &b) -{ - return _mm256_sub_epi32(a.m256, b.m256); -} -#else -__forceinline const avxi operator-(const avxi &a, const avxi &b) -{ - return avxi(_mm_sub_epi32(a.l, b.l), _mm_sub_epi32(a.h, b.h)); -} -#endif -__forceinline const avxi operator-(const avxi &a, const int32_t b) -{ - return a - avxi(b); -} -__forceinline const avxi operator-(const int32_t a, const avxi &b) -{ - return avxi(a) - b; -} - -#if defined(__KERNEL_AVX2__) -__forceinline const avxi operator*(const avxi &a, const avxi &b) -{ - return _mm256_mullo_epi32(a.m256, b.m256); -} -#else -__forceinline const avxi operator*(const avxi &a, const avxi 
&b) -{ - return avxi(_mm_mullo_epi32(a.l, b.l), _mm_mullo_epi32(a.h, b.h)); -} -#endif -__forceinline const avxi operator*(const avxi &a, const int32_t b) -{ - return a * avxi(b); -} -__forceinline const avxi operator*(const int32_t a, const avxi &b) -{ - return avxi(a) * b; -} - -#if defined(__KERNEL_AVX2__) -__forceinline const avxi operator&(const avxi &a, const avxi &b) -{ - return _mm256_and_si256(a.m256, b.m256); -} -#else -__forceinline const avxi operator&(const avxi &a, const avxi &b) -{ - return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); -} -#endif -__forceinline const avxi operator&(const avxi &a, const int32_t b) -{ - return a & avxi(b); -} -__forceinline const avxi operator&(const int32_t a, const avxi &b) -{ - return avxi(a) & b; -} - -#if defined(__KERNEL_AVX2__) -__forceinline const avxi operator|(const avxi &a, const avxi &b) -{ - return _mm256_or_si256(a.m256, b.m256); -} -#else -__forceinline const avxi operator|(const avxi &a, const avxi &b) -{ - return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); -} -#endif -__forceinline const avxi operator|(const avxi &a, const int32_t b) -{ - return a | avxi(b); -} -__forceinline const avxi operator|(const int32_t a, const avxi &b) -{ - return avxi(a) | b; -} - -#if defined(__KERNEL_AVX2__) -__forceinline const avxi operator^(const avxi &a, const avxi &b) -{ - return _mm256_xor_si256(a.m256, b.m256); -} -#else -__forceinline const avxi operator^(const avxi &a, const avxi &b) -{ - return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); -} -#endif -__forceinline const avxi operator^(const avxi &a, const int32_t b) -{ - return a ^ avxi(b); -} -__forceinline const avxi operator^(const int32_t a, const avxi &b) -{ - return avxi(a) ^ b; -} - -#if defined(__KERNEL_AVX2__) -__forceinline const avxi operator<<(const avxi &a, const int32_t n) -{ - return _mm256_slli_epi32(a.m256, n); -} -__forceinline 
const avxi operator>>(const avxi &a, const int32_t n) -{ - return _mm256_srai_epi32(a.m256, n); -} - -__forceinline const avxi sra(const avxi &a, const int32_t b) -{ - return _mm256_srai_epi32(a.m256, b); -} -__forceinline const avxi srl(const avxi &a, const int32_t b) -{ - return _mm256_srli_epi32(a.m256, b); -} -#else -__forceinline const avxi operator<<(const avxi &a, const int32_t n) -{ - return avxi(_mm_slli_epi32(a.l, n), _mm_slli_epi32(a.h, n)); -} -__forceinline const avxi operator>>(const avxi &a, const int32_t n) -{ - return avxi(_mm_srai_epi32(a.l, n), _mm_srai_epi32(a.h, n)); -} - -__forceinline const avxi sra(const avxi &a, const int32_t b) -{ - return avxi(_mm_srai_epi32(a.l, b), _mm_srai_epi32(a.h, b)); -} -__forceinline const avxi srl(const avxi &a, const int32_t b) -{ - return avxi(_mm_srli_epi32(a.l, b), _mm_srli_epi32(a.h, b)); -} -#endif - -#if defined(__KERNEL_AVX2__) -__forceinline const avxi min(const avxi &a, const avxi &b) -{ - return _mm256_min_epi32(a.m256, b.m256); -} -#else -__forceinline const avxi min(const avxi &a, const avxi &b) -{ - return avxi(_mm_min_epi32(a.l, b.l), _mm_min_epi32(a.h, b.h)); -} -#endif -__forceinline const avxi min(const avxi &a, const int32_t b) -{ - return min(a, avxi(b)); -} -__forceinline const avxi min(const int32_t a, const avxi &b) -{ - return min(avxi(a), b); -} - -#if defined(__KERNEL_AVX2__) -__forceinline const avxi max(const avxi &a, const avxi &b) -{ - return _mm256_max_epi32(a.m256, b.m256); -} -#else -__forceinline const avxi max(const avxi &a, const avxi &b) -{ - return avxi(_mm_max_epi32(a.l, b.l), _mm_max_epi32(a.h, b.h)); -} -#endif -__forceinline const avxi max(const avxi &a, const int32_t b) -{ - return max(a, avxi(b)); -} -__forceinline const avxi max(const int32_t a, const avxi &b) -{ - return max(avxi(a), b); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Assignment Operators 
-//////////////////////////////////////////////////////////////////////////////// - -__forceinline avxi &operator+=(avxi &a, const avxi &b) -{ - return a = a + b; -} -__forceinline avxi &operator+=(avxi &a, const int32_t b) -{ - return a = a + b; -} - -__forceinline avxi &operator-=(avxi &a, const avxi &b) -{ - return a = a - b; -} -__forceinline avxi &operator-=(avxi &a, const int32_t b) -{ - return a = a - b; -} - -__forceinline avxi &operator*=(avxi &a, const avxi &b) -{ - return a = a * b; -} -__forceinline avxi &operator*=(avxi &a, const int32_t b) -{ - return a = a * b; -} - -__forceinline avxi &operator&=(avxi &a, const avxi &b) -{ - return a = a & b; -} -__forceinline avxi &operator&=(avxi &a, const int32_t b) -{ - return a = a & b; -} - -__forceinline avxi &operator|=(avxi &a, const avxi &b) -{ - return a = a | b; -} -__forceinline avxi &operator|=(avxi &a, const int32_t b) -{ - return a = a | b; -} - -__forceinline avxi &operator^=(avxi &a, const avxi &b) -{ - return a = a ^ b; -} -__forceinline avxi &operator^=(avxi &a, const int32_t b) -{ - return a = a ^ b; -} - -__forceinline avxi &operator<<=(avxi &a, const int32_t b) -{ - return a = a << b; -} -__forceinline avxi &operator>>=(avxi &a, const int32_t b) -{ - return a = a >> b; -} - -//////////////////////////////////////////////////////////////////////////////// -/// Comparison Operators + Select -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__KERNEL_AVX2__) -__forceinline const avxb operator==(const avxi &a, const avxi &b) -{ - return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a.m256, b.m256)); -} -#else -__forceinline const avxb operator==(const avxi &a, const avxi &b) -{ - return avxb(_mm_castsi128_ps(_mm_cmpeq_epi32(a.l, b.l)), - _mm_castsi128_ps(_mm_cmpeq_epi32(a.h, b.h))); -} -#endif -__forceinline const avxb operator==(const avxi &a, const int32_t b) -{ - return a == avxi(b); -} -__forceinline const avxb operator==(const int32_t a, const avxi &b) 
-{ - return avxi(a) == b; -} - -__forceinline const avxb operator!=(const avxi &a, const avxi &b) -{ - return !(a == b); -} -__forceinline const avxb operator!=(const avxi &a, const int32_t b) -{ - return a != avxi(b); -} -__forceinline const avxb operator!=(const int32_t a, const avxi &b) -{ - return avxi(a) != b; -} - -#if defined(__KERNEL_AVX2__) -__forceinline const avxb operator<(const avxi &a, const avxi &b) -{ - return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b.m256, a.m256)); -} -#else -__forceinline const avxb operator<(const avxi &a, const avxi &b) -{ - return avxb(_mm_castsi128_ps(_mm_cmplt_epi32(a.l, b.l)), - _mm_castsi128_ps(_mm_cmplt_epi32(a.h, b.h))); -} -#endif -__forceinline const avxb operator<(const avxi &a, const int32_t b) -{ - return a < avxi(b); -} -__forceinline const avxb operator<(const int32_t a, const avxi &b) -{ - return avxi(a) < b; -} - -__forceinline const avxb operator>=(const avxi &a, const avxi &b) -{ - return !(a < b); -} -__forceinline const avxb operator>=(const avxi &a, const int32_t b) -{ - return a >= avxi(b); -} -__forceinline const avxb operator>=(const int32_t a, const avxi &b) -{ - return avxi(a) >= b; -} - -#if defined(__KERNEL_AVX2__) -__forceinline const avxb operator>(const avxi &a, const avxi &b) -{ - return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a.m256, b.m256)); -} -#else -__forceinline const avxb operator>(const avxi &a, const avxi &b) -{ - return avxb(_mm_castsi128_ps(_mm_cmpgt_epi32(a.l, b.l)), - _mm_castsi128_ps(_mm_cmpgt_epi32(a.h, b.h))); -} -#endif -__forceinline const avxb operator>(const avxi &a, const int32_t b) -{ - return a > avxi(b); -} -__forceinline const avxb operator>(const int32_t a, const avxi &b) -{ - return avxi(a) > b; -} - -__forceinline const avxb operator<=(const avxi &a, const avxi &b) -{ - return !(a > b); -} -__forceinline const avxb operator<=(const avxi &a, const int32_t b) -{ - return a <= avxi(b); -} -__forceinline const avxb operator<=(const int32_t a, const avxi &b) -{ - return 
avxi(a) <= b; -} - -__forceinline const avxi select(const avxb &m, const avxi &t, const avxi &f) -{ - return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Movement/Shifting/Shuffling Functions -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__KERNEL_AVX2__) -__forceinline avxi unpacklo(const avxi &a, const avxi &b) -{ - return _mm256_unpacklo_epi32(a.m256, b.m256); -} -__forceinline avxi unpackhi(const avxi &a, const avxi &b) -{ - return _mm256_unpackhi_epi32(a.m256, b.m256); -} -#else -__forceinline avxi unpacklo(const avxi &a, const avxi &b) -{ - return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); -} -__forceinline avxi unpackhi(const avxi &a, const avxi &b) -{ - return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); -} -#endif - -template<size_t i> __forceinline const avxi shuffle(const avxi &a) -{ - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i))); -} - -template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a) -{ - return _mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0)); -} - -template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a, const avxi &b) -{ - return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); -} - -template<size_t i0, size_t i1, size_t i2, size_t i3> -__forceinline const avxi shuffle(const avxi &a) -{ - return _mm256_castps_si256( - _mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0))); -} - -template<size_t i0, size_t i1, size_t i2, size_t i3> -__forceinline const avxi shuffle(const avxi &a, const avxi &b) -{ - return _mm256_castps_si256(_mm256_shuffle_ps( - _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); -} - -template<> 
__forceinline const avxi shuffle<0, 0, 2, 2>(const avxi &b) -{ - return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b))); -} -template<> __forceinline const avxi shuffle<1, 1, 3, 3>(const avxi &b) -{ - return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b))); -} -template<> __forceinline const avxi shuffle<0, 1, 0, 1>(const avxi &b) -{ - return _mm256_castps_si256( - _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b))))); -} - -__forceinline const avxi broadcast(const int *ptr) -{ - return _mm256_castps_si256(_mm256_broadcast_ss((const float *)ptr)); -} -template<size_t i> __forceinline const avxi insert(const avxi &a, const ssei &b) -{ - return _mm256_insertf128_si256(a, b, i); -} -template<size_t i> __forceinline const ssei extract(const avxi &a) -{ - return _mm256_extractf128_si256(a, i); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Reductions -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const avxi vreduce_min2(const avxi &v) -{ - return min(v, shuffle<1, 0, 3, 2>(v)); -} -__forceinline const avxi vreduce_min4(const avxi &v) -{ - avxi v1 = vreduce_min2(v); - return min(v1, shuffle<2, 3, 0, 1>(v1)); -} -__forceinline const avxi vreduce_min(const avxi &v) -{ - avxi v1 = vreduce_min4(v); - return min(v1, shuffle<1, 0>(v1)); -} - -__forceinline const avxi vreduce_max2(const avxi &v) -{ - return max(v, shuffle<1, 0, 3, 2>(v)); -} -__forceinline const avxi vreduce_max4(const avxi &v) -{ - avxi v1 = vreduce_max2(v); - return max(v1, shuffle<2, 3, 0, 1>(v1)); -} -__forceinline const avxi vreduce_max(const avxi &v) -{ - avxi v1 = vreduce_max4(v); - return max(v1, shuffle<1, 0>(v1)); -} - -__forceinline const avxi vreduce_add2(const avxi &v) -{ - return v + shuffle<1, 0, 3, 2>(v); -} -__forceinline const avxi vreduce_add4(const avxi &v) -{ - avxi v1 = vreduce_add2(v); - return v1 + shuffle<2, 3, 0, 1>(v1); -} 
-__forceinline const avxi vreduce_add(const avxi &v) -{ - avxi v1 = vreduce_add4(v); - return v1 + shuffle<1, 0>(v1); -} - -__forceinline int reduce_min(const avxi &v) -{ - return extract<0>(extract<0>(vreduce_min(v))); -} -__forceinline int reduce_max(const avxi &v) -{ - return extract<0>(extract<0>(vreduce_max(v))); -} -__forceinline int reduce_add(const avxi &v) -{ - return extract<0>(extract<0>(vreduce_add(v))); -} - -__forceinline uint32_t select_min(const avxi &v) -{ - return __bsf(movemask(v == vreduce_min(v))); -} -__forceinline uint32_t select_max(const avxi &v) -{ - return __bsf(movemask(v == vreduce_max(v))); -} - -__forceinline uint32_t select_min(const avxb &valid, const avxi &v) -{ - const avxi a = select(valid, v, avxi(pos_inf)); - return __bsf(movemask(valid & (a == vreduce_min(a)))); -} -__forceinline uint32_t select_max(const avxb &valid, const avxi &v) -{ - const avxi a = select(valid, v, avxi(neg_inf)); - return __bsf(movemask(valid & (a == vreduce_max(a)))); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Output Operators -//////////////////////////////////////////////////////////////////////////////// - -ccl_device_inline void print_avxi(const char *label, const avxi &a) -{ - printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]); -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/util/color.h b/intern/cycles/util/color.h index 537f8ab6771..93e984120f2 100644 --- a/intern/cycles/util/color.h +++ b/intern/cycles/util/color.h @@ -228,28 +228,27 @@ ccl_device float3 xyY_to_xyz(float x, float y, float Y) * exp = exponent, encoded as uint32_t * e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as uint32_t */ -template<unsigned exp, unsigned e2coeff> ccl_device_inline ssef fastpow(const ssef &arg) +template<unsigned exp, unsigned e2coeff> ccl_device_inline float4 fastpow(const float4 &arg) { - ssef ret; - ret = arg * cast(ssei(e2coeff)); - 
ret = ssef(cast(ret)); - ret = ret * cast(ssei(exp)); - ret = cast(ssei(ret)); + float4 ret = arg * cast(make_int4(e2coeff)); + ret = make_float4(cast(ret)); + ret = ret * cast(make_int4(exp)); + ret = cast(make_int4(ret)); return ret; } /* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */ -ccl_device_inline ssef improve_5throot_solution(const ssef &old_result, const ssef &x) +ccl_device_inline float4 improve_5throot_solution(const float4 &old_result, const float4 &x) { - ssef approx2 = old_result * old_result; - ssef approx4 = approx2 * approx2; - ssef t = x / approx4; - ssef summ = madd(ssef(4.0f), old_result, t); - return summ * ssef(1.0f / 5.0f); + float4 approx2 = old_result * old_result; + float4 approx4 = approx2 * approx2; + float4 t = x / approx4; + float4 summ = madd(make_float4(4.0f), old_result, t); + return summ * make_float4(1.0f / 5.0f); } /* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */ -ccl_device_inline ssef fastpow24(const ssef &arg) +ccl_device_inline float4 fastpow24(const float4 &arg) { /* max, avg and |avg| errors were calculated in gcc without FMA instructions * The final precision should be better than powf in glibc */ @@ -257,9 +256,10 @@ ccl_device_inline ssef fastpow24(const ssef &arg) /* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize avg error */ /* 0x3F4CCCCD = 4/5 */ /* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */ - ssef x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05 - ssef arg2 = arg * arg; - ssef arg4 = arg2 * arg2; + float4 x = fastpow<0x3F4CCCCD, 0x4F55A7FB>( + arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05 + float4 arg2 = arg * arg; + float4 arg4 = arg2 * arg2; /* error max = 0.018 avg = 0.0031 |avg| = 0.0031 */ x = improve_5throot_solution(x, arg4); @@ -271,12 +271,12 @@ ccl_device_inline ssef fastpow24(const ssef &arg) return x * (x * x); } -ccl_device ssef color_srgb_to_linear(const ssef &c) +ccl_device float4 
color_srgb_to_linear(const float4 &c) { - sseb cmp = c < ssef(0.04045f); - ssef lt = max(c * ssef(1.0f / 12.92f), ssef(0.0f)); - ssef gtebase = (c + ssef(0.055f)) * ssef(1.0f / 1.055f); /* fma */ - ssef gte = fastpow24(gtebase); + int4 cmp = c < make_float4(0.04045f); + float4 lt = max(c * make_float4(1.0f / 12.92f), make_float4(0.0f)); + float4 gtebase = (c + make_float4(0.055f)) * make_float4(1.0f / 1.055f); /* fma */ + float4 gte = fastpow24(gtebase); return select(cmp, lt, gte); } #endif /* __KERNEL_SSE2__ */ @@ -302,10 +302,8 @@ ccl_device float4 color_linear_to_srgb_v4(float4 c) ccl_device float4 color_srgb_to_linear_v4(float4 c) { #ifdef __KERNEL_SSE2__ - ssef r_ssef; - float4 &r = (float4 &)r_ssef; - r = c; - r_ssef = color_srgb_to_linear(r_ssef); + float4 r = c; + r = color_srgb_to_linear(r); r.w = c.w; return r; #else diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h index 1969529eff0..d5be14c8eba 100644 --- a/intern/cycles/util/defines.h +++ b/intern/cycles/util/defines.h @@ -23,6 +23,7 @@ /* Leave inlining decisions to compiler for these, the inline keyword here * is not about performance but including function definitions in headers. */ # define ccl_device static inline +# define ccl_device_extern extern "C" # define ccl_device_noinline static inline # define ccl_device_noinline_cpu ccl_device_noinline diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h index c668638eb02..5665dd4c075 100644 --- a/intern/cycles/util/half.h +++ b/intern/cycles/util/half.h @@ -154,17 +154,17 @@ ccl_device_inline half float_to_half_display(const float f) ccl_device_inline half4 float4_to_half4_display(const float4 f) { -#ifdef __KERNEL_SSE2__ +#ifdef __KERNEL_SSE__ /* CPU: SSE and AVX. 
*/ - ssef x = min(max(load4f(f), 0.0f), 65504.0f); + float4 x = min(max(f, make_float4(0.0f)), make_float4(65504.0f)); # ifdef __KERNEL_AVX2__ - ssei rpack = _mm_cvtps_ph(x, 0); + int4 rpack = int4(_mm_cvtps_ph(x, 0)); # else - ssei absolute = cast(x) & 0x7FFFFFFF; - ssei Z = absolute + 0xC8000000; - ssei result = andnot(absolute < 0x38800000, Z); - ssei rshift = (result >> 13) & 0x7FFF; - ssei rpack = _mm_packs_epi32(rshift, rshift); + int4 absolute = cast(x) & make_int4(0x7FFFFFFF); + int4 Z = absolute + make_int4(0xC8000000); + int4 result = andnot(absolute < make_int4(0x38800000), Z); + int4 rshift = (result >> 13) & make_int4(0x7FFF); + int4 rpack = int4(_mm_packs_epi32(rshift, rshift)); # endif half4 h; _mm_storel_pi((__m64 *)&h, _mm_castsi128_ps(rpack)); diff --git a/intern/cycles/util/hash.h b/intern/cycles/util/hash.h index 4f83f331229..74210ff020e 100644 --- a/intern/cycles/util/hash.h +++ b/intern/cycles/util/hash.h @@ -222,7 +222,7 @@ ccl_device_inline float3 hash_float4_to_float3(float4 k) /* SSE Versions Of Jenkins Lookup3 Hash Functions */ -#ifdef __KERNEL_SSE2__ +#ifdef __KERNEL_SSE__ # define rot(x, k) (((x) << (k)) | (srl(x, 32 - (k)))) # define mix(a, b, c) \ @@ -265,10 +265,10 @@ ccl_device_inline float3 hash_float4_to_float3(float4 k) c -= rot(b, 24); \ } -ccl_device_inline ssei hash_ssei(ssei kx) +ccl_device_inline int4 hash_int4(int4 kx) { - ssei a, b, c; - a = b = c = ssei(0xdeadbeef + (1 << 2) + 13); + int4 a, b, c; + a = b = c = make_int4(0xdeadbeef + (1 << 2) + 13); a += kx; final(a, b, c); @@ -276,10 +276,10 @@ ccl_device_inline ssei hash_ssei(ssei kx) return c; } -ccl_device_inline ssei hash_ssei2(ssei kx, ssei ky) +ccl_device_inline int4 hash_int4_2(int4 kx, int4 ky) { - ssei a, b, c; - a = b = c = ssei(0xdeadbeef + (2 << 2) + 13); + int4 a, b, c; + a = b = c = make_int4(0xdeadbeef + (2 << 2) + 13); b += ky; a += kx; @@ -288,10 +288,10 @@ ccl_device_inline ssei hash_ssei2(ssei kx, ssei ky) return c; } -ccl_device_inline ssei 
hash_ssei3(ssei kx, ssei ky, ssei kz) +ccl_device_inline int4 hash_int4_3(int4 kx, int4 ky, int4 kz) { - ssei a, b, c; - a = b = c = ssei(0xdeadbeef + (3 << 2) + 13); + int4 a, b, c; + a = b = c = make_int4(0xdeadbeef + (3 << 2) + 13); c += kz; b += ky; @@ -301,10 +301,10 @@ ccl_device_inline ssei hash_ssei3(ssei kx, ssei ky, ssei kz) return c; } -ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw) +ccl_device_inline int4 hash_int4_4(int4 kx, int4 ky, int4 kz, int4 kw) { - ssei a, b, c; - a = b = c = ssei(0xdeadbeef + (4 << 2) + 13); + int4 a, b, c; + a = b = c = make_int4(0xdeadbeef + (4 << 2) + 13); a += kx; b += ky; @@ -317,11 +317,11 @@ ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw) return c; } -# if defined(__KERNEL_AVX__) -ccl_device_inline avxi hash_avxi(avxi kx) +# if defined(__KERNEL_AVX2__) +ccl_device_inline vint8 hash_int8(vint8 kx) { - avxi a, b, c; - a = b = c = avxi(0xdeadbeef + (1 << 2) + 13); + vint8 a, b, c; + a = b = c = make_vint8(0xdeadbeef + (1 << 2) + 13); a += kx; final(a, b, c); @@ -329,10 +329,10 @@ ccl_device_inline avxi hash_avxi(avxi kx) return c; } -ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky) +ccl_device_inline vint8 hash_int8_2(vint8 kx, vint8 ky) { - avxi a, b, c; - a = b = c = avxi(0xdeadbeef + (2 << 2) + 13); + vint8 a, b, c; + a = b = c = make_vint8(0xdeadbeef + (2 << 2) + 13); b += ky; a += kx; @@ -341,10 +341,10 @@ ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky) return c; } -ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz) +ccl_device_inline vint8 hash_int8_3(vint8 kx, vint8 ky, vint8 kz) { - avxi a, b, c; - a = b = c = avxi(0xdeadbeef + (3 << 2) + 13); + vint8 a, b, c; + a = b = c = make_vint8(0xdeadbeef + (3 << 2) + 13); c += kz; b += ky; @@ -354,10 +354,10 @@ ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz) return c; } -ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw) +ccl_device_inline vint8 hash_int8_4(vint8 kx, vint8 ky, vint8 
kz, vint8 kw) { - avxi a, b, c; - a = b = c = avxi(0xdeadbeef + (4 << 2) + 13); + vint8 a, b, c; + a = b = c = make_vint8(0xdeadbeef + (4 << 2) + 13); a += kx; b += ky; diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h index 3a2e0e074a2..0fbe7a67a4f 100644 --- a/intern/cycles/util/math.h +++ b/intern/cycles/util/math.h @@ -532,12 +532,14 @@ CCL_NAMESPACE_END #include "util/math_int2.h" #include "util/math_int3.h" #include "util/math_int4.h" +#include "util/math_int8.h" #include "util/math_float2.h" -#include "util/math_float3.h" #include "util/math_float4.h" #include "util/math_float8.h" +#include "util/math_float3.h" + #include "util/rect.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/math_float2.h b/intern/cycles/util/math_float2.h index 542dad93467..ad806d0f08a 100644 --- a/intern/cycles/util/math_float2.h +++ b/intern/cycles/util/math_float2.h @@ -10,55 +10,6 @@ CCL_NAMESPACE_BEGIN -/******************************************************************************* - * Declaration. 
- */ - -#if !defined(__KERNEL_METAL__) -ccl_device_inline float2 operator-(const float2 &a); -ccl_device_inline float2 operator*(const float2 &a, const float2 &b); -ccl_device_inline float2 operator*(const float2 &a, float f); -ccl_device_inline float2 operator*(float f, const float2 &a); -ccl_device_inline float2 operator/(float f, const float2 &a); -ccl_device_inline float2 operator/(const float2 &a, float f); -ccl_device_inline float2 operator/(const float2 &a, const float2 &b); -ccl_device_inline float2 operator+(const float2 &a, const float f); -ccl_device_inline float2 operator+(const float2 &a, const float2 &b); -ccl_device_inline float2 operator-(const float2 &a, const float f); -ccl_device_inline float2 operator-(const float2 &a, const float2 &b); -ccl_device_inline float2 operator+=(float2 &a, const float2 &b); -ccl_device_inline float2 operator*=(float2 &a, const float2 &b); -ccl_device_inline float2 operator*=(float2 &a, float f); -ccl_device_inline float2 operator/=(float2 &a, const float2 &b); -ccl_device_inline float2 operator/=(float2 &a, float f); - -ccl_device_inline bool operator==(const float2 &a, const float2 &b); -ccl_device_inline bool operator!=(const float2 &a, const float2 &b); - -ccl_device_inline bool is_zero(const float2 &a); -ccl_device_inline float average(const float2 &a); -ccl_device_inline float distance(const float2 &a, const float2 &b); -ccl_device_inline float dot(const float2 &a, const float2 &b); -ccl_device_inline float cross(const float2 &a, const float2 &b); -ccl_device_inline float len(const float2 a); -ccl_device_inline float2 normalize(const float2 &a); -ccl_device_inline float2 normalize_len(const float2 &a, float *t); -ccl_device_inline float2 safe_normalize(const float2 &a); -ccl_device_inline float2 min(const float2 &a, const float2 &b); -ccl_device_inline float2 max(const float2 &a, const float2 &b); -ccl_device_inline float2 clamp(const float2 &a, const float2 &mn, const float2 &mx); -ccl_device_inline float2 
fabs(const float2 &a); -ccl_device_inline float2 as_float2(const float4 &a); -ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t); -ccl_device_inline float2 floor(const float2 &a); -#endif /* !__KERNEL_METAL__ */ - -ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b); - -/******************************************************************************* - * Definition. - */ - ccl_device_inline float2 zero_float2() { return make_float2(0.0f, 0.0f); @@ -75,63 +26,63 @@ ccl_device_inline float2 operator-(const float2 &a) return make_float2(-a.x, -a.y); } -ccl_device_inline float2 operator*(const float2 &a, const float2 &b) +ccl_device_inline float2 operator*(const float2 a, const float2 b) { return make_float2(a.x * b.x, a.y * b.y); } -ccl_device_inline float2 operator*(const float2 &a, float f) +ccl_device_inline float2 operator*(const float2 a, float f) { return make_float2(a.x * f, a.y * f); } -ccl_device_inline float2 operator*(float f, const float2 &a) +ccl_device_inline float2 operator*(float f, const float2 a) { return make_float2(a.x * f, a.y * f); } -ccl_device_inline float2 operator/(float f, const float2 &a) +ccl_device_inline float2 operator/(float f, const float2 a) { return make_float2(f / a.x, f / a.y); } -ccl_device_inline float2 operator/(const float2 &a, float f) +ccl_device_inline float2 operator/(const float2 a, float f) { float invf = 1.0f / f; return make_float2(a.x * invf, a.y * invf); } -ccl_device_inline float2 operator/(const float2 &a, const float2 &b) +ccl_device_inline float2 operator/(const float2 a, const float2 b) { return make_float2(a.x / b.x, a.y / b.y); } -ccl_device_inline float2 operator+(const float2 &a, const float f) +ccl_device_inline float2 operator+(const float2 a, const float2 b) { - return a + make_float2(f, f); + return make_float2(a.x + b.x, a.y + b.y); } -ccl_device_inline float2 operator+(const float2 &a, const float2 &b) +ccl_device_inline float2 operator+(const float2 
a, const float f) { - return make_float2(a.x + b.x, a.y + b.y); + return a + make_float2(f, f); } -ccl_device_inline float2 operator-(const float2 &a, const float f) +ccl_device_inline float2 operator-(const float2 a, const float2 b) { - return a - make_float2(f, f); + return make_float2(a.x - b.x, a.y - b.y); } -ccl_device_inline float2 operator-(const float2 &a, const float2 &b) +ccl_device_inline float2 operator-(const float2 a, const float f) { - return make_float2(a.x - b.x, a.y - b.y); + return a - make_float2(f, f); } -ccl_device_inline float2 operator+=(float2 &a, const float2 &b) +ccl_device_inline float2 operator+=(float2 &a, const float2 b) { return a = a + b; } -ccl_device_inline float2 operator*=(float2 &a, const float2 &b) +ccl_device_inline float2 operator*=(float2 &a, const float2 b) { return a = a * b; } @@ -141,7 +92,7 @@ ccl_device_inline float2 operator*=(float2 &a, float f) return a = a * f; } -ccl_device_inline float2 operator/=(float2 &a, const float2 &b) +ccl_device_inline float2 operator/=(float2 &a, const float2 b) { return a = a / b; } @@ -152,74 +103,81 @@ ccl_device_inline float2 operator/=(float2 &a, float f) return a = a * invf; } -ccl_device_inline bool operator==(const float2 &a, const float2 &b) +ccl_device_inline bool operator==(const float2 a, const float2 b) { return (a.x == b.x && a.y == b.y); } -ccl_device_inline bool operator!=(const float2 &a, const float2 &b) +ccl_device_inline bool operator!=(const float2 a, const float2 b) { return !(a == b); } -ccl_device_inline bool is_zero(const float2 &a) +ccl_device_inline bool is_zero(const float2 a) { return (a.x == 0.0f && a.y == 0.0f); } -ccl_device_inline float average(const float2 &a) +ccl_device_inline float average(const float2 a) { return (a.x + a.y) * (1.0f / 2.0f); } -ccl_device_inline float distance(const float2 &a, const float2 &b) +ccl_device_inline float dot(const float2 a, const float2 b) { - return len(a - b); + return a.x * b.x + a.y * b.y; } +#endif 
-ccl_device_inline float dot(const float2 &a, const float2 &b) +ccl_device_inline float len(const float2 a) { - return a.x * b.x + a.y * b.y; + return sqrtf(dot(a, a)); } -ccl_device_inline float cross(const float2 &a, const float2 &b) +#if !defined(__KERNEL_METAL__) +ccl_device_inline float distance(const float2 a, const float2 b) +{ + return len(a - b); +} + +ccl_device_inline float cross(const float2 a, const float2 b) { return (a.x * b.y - a.y * b.x); } -ccl_device_inline float2 normalize(const float2 &a) +ccl_device_inline float2 normalize(const float2 a) { return a / len(a); } -ccl_device_inline float2 normalize_len(const float2 &a, ccl_private float *t) +ccl_device_inline float2 normalize_len(const float2 a, ccl_private float *t) { *t = len(a); return a / (*t); } -ccl_device_inline float2 safe_normalize(const float2 &a) +ccl_device_inline float2 safe_normalize(const float2 a) { float t = len(a); return (t != 0.0f) ? a / t : a; } -ccl_device_inline float2 min(const float2 &a, const float2 &b) +ccl_device_inline float2 min(const float2 a, const float2 b) { return make_float2(min(a.x, b.x), min(a.y, b.y)); } -ccl_device_inline float2 max(const float2 &a, const float2 &b) +ccl_device_inline float2 max(const float2 a, const float2 b) { return make_float2(max(a.x, b.x), max(a.y, b.y)); } -ccl_device_inline float2 clamp(const float2 &a, const float2 &mn, const float2 &mx) +ccl_device_inline float2 clamp(const float2 a, const float2 mn, const float2 mx) { return min(max(a, mn), mx); } -ccl_device_inline float2 fabs(const float2 &a) +ccl_device_inline float2 fabs(const float2 a) { return make_float2(fabsf(a.x), fabsf(a.y)); } @@ -229,28 +187,23 @@ ccl_device_inline float2 as_float2(const float4 &a) return make_float2(a.x, a.y); } -ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t) +ccl_device_inline float2 interp(const float2 a, const float2 b, float t) { return a + t * (b - a); } -ccl_device_inline float2 mix(const float2 &a, const float2 &b, 
float t) +ccl_device_inline float2 mix(const float2 a, const float2 b, float t) { return a + t * (b - a); } -ccl_device_inline float2 floor(const float2 &a) +ccl_device_inline float2 floor(const float2 a) { return make_float2(floorf(a.x), floorf(a.y)); } #endif /* !__KERNEL_METAL__ */ -ccl_device_inline float len(const float2 a) -{ - return sqrtf(dot(a, a)); -} - ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b) { return (b != 0.0f) ? a / b : zero_float2(); diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h index eec7122b9dc..79ee86d9c82 100644 --- a/intern/cycles/util/math_float3.h +++ b/intern/cycles/util/math_float3.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2013 Intel Corporation * Copyright 2011-2022 Blender Foundation */ #ifndef __UTIL_MATH_FLOAT3_H__ @@ -10,73 +11,6 @@ CCL_NAMESPACE_BEGIN -/******************************************************************************* - * Declaration. - */ - -#if !defined(__KERNEL_METAL__) -ccl_device_inline float3 operator-(const float3 &a); -ccl_device_inline float3 operator*(const float3 &a, const float3 &b); -ccl_device_inline float3 operator*(const float3 &a, const float f); -ccl_device_inline float3 operator*(const float f, const float3 &a); -ccl_device_inline float3 operator/(const float f, const float3 &a); -ccl_device_inline float3 operator/(const float3 &a, const float f); -ccl_device_inline float3 operator/(const float3 &a, const float3 &b); -ccl_device_inline float3 operator+(const float3 &a, const float f); -ccl_device_inline float3 operator+(const float3 &a, const float3 &b); -ccl_device_inline float3 operator-(const float3 &a, const float f); -ccl_device_inline float3 operator-(const float3 &a, const float3 &b); -ccl_device_inline float3 operator+=(float3 &a, const float3 &b); -ccl_device_inline float3 operator-=(float3 &a, const float3 &b); -ccl_device_inline float3 operator*=(float3 &a, const float3 &b); 
-ccl_device_inline float3 operator*=(float3 &a, float f); -ccl_device_inline float3 operator/=(float3 &a, const float3 &b); -ccl_device_inline float3 operator/=(float3 &a, float f); - -ccl_device_inline bool operator==(const float3 &a, const float3 &b); -ccl_device_inline bool operator!=(const float3 &a, const float3 &b); - -ccl_device_inline float distance(const float3 &a, const float3 &b); -ccl_device_inline float dot(const float3 &a, const float3 &b); -ccl_device_inline float dot_xy(const float3 &a, const float3 &b); -ccl_device_inline float3 cross(const float3 &a, const float3 &b); -ccl_device_inline float3 normalize(const float3 &a); -ccl_device_inline float3 min(const float3 &a, const float3 &b); -ccl_device_inline float3 max(const float3 &a, const float3 &b); -ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx); -ccl_device_inline float3 fabs(const float3 &a); -ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t); -ccl_device_inline float3 rcp(const float3 &a); -ccl_device_inline float3 sqrt(const float3 &a); -ccl_device_inline float3 floor(const float3 &a); -ccl_device_inline float3 ceil(const float3 &a); -ccl_device_inline float3 reflect(const float3 incident, const float3 normal); -#endif /* !defined(__KERNEL_METAL__) */ - -ccl_device_inline float reduce_min(float3 a); -ccl_device_inline float reduce_max(float3 a); -ccl_device_inline float len(const float3 a); -ccl_device_inline float len_squared(const float3 a); - -ccl_device_inline float3 project(const float3 v, const float3 v_proj); - -ccl_device_inline float3 safe_normalize(const float3 a); -ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t); -ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t); -ccl_device_inline float3 safe_divide(const float3 a, const float3 b); -ccl_device_inline float3 safe_divide(const float3 a, const float b); -ccl_device_inline float3 interp(float3 a, float3 b, float t); 
-ccl_device_inline float3 sqr(float3 a); - -ccl_device_inline bool is_zero(const float3 a); -ccl_device_inline float reduce_add(const float3 a); -ccl_device_inline float average(const float3 a); -ccl_device_inline bool isequal(const float3 a, const float3 b); - -/******************************************************************************* - * Definition. - */ - ccl_device_inline float3 zero_float3() { #ifdef __KERNEL_SSE__ @@ -109,7 +43,7 @@ ccl_device_inline float3 operator-(const float3 &a) # endif } -ccl_device_inline float3 operator*(const float3 &a, const float3 &b) +ccl_device_inline float3 operator*(const float3 a, const float3 b) { # ifdef __KERNEL_SSE__ return float3(_mm_mul_ps(a.m128, b.m128)); @@ -118,7 +52,7 @@ ccl_device_inline float3 operator*(const float3 &a, const float3 &b) # endif } -ccl_device_inline float3 operator*(const float3 &a, const float f) +ccl_device_inline float3 operator*(const float3 a, const float f) { # ifdef __KERNEL_SSE__ return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f))); @@ -127,7 +61,7 @@ ccl_device_inline float3 operator*(const float3 &a, const float f) # endif } -ccl_device_inline float3 operator*(const float f, const float3 &a) +ccl_device_inline float3 operator*(const float f, const float3 a) { # if defined(__KERNEL_SSE__) return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); @@ -136,7 +70,7 @@ ccl_device_inline float3 operator*(const float f, const float3 &a) # endif } -ccl_device_inline float3 operator/(const float f, const float3 &a) +ccl_device_inline float3 operator/(const float f, const float3 a) { # if defined(__KERNEL_SSE__) return float3(_mm_div_ps(_mm_set1_ps(f), a.m128)); @@ -145,7 +79,7 @@ ccl_device_inline float3 operator/(const float f, const float3 &a) # endif } -ccl_device_inline float3 operator/(const float3 &a, const float f) +ccl_device_inline float3 operator/(const float3 a, const float f) { # if defined(__KERNEL_SSE__) return float3(_mm_div_ps(a.m128, _mm_set1_ps(f))); @@ -154,7 +88,7 @@ 
ccl_device_inline float3 operator/(const float3 &a, const float f) # endif } -ccl_device_inline float3 operator/(const float3 &a, const float3 &b) +ccl_device_inline float3 operator/(const float3 a, const float3 b) { # if defined(__KERNEL_SSE__) return float3(_mm_div_ps(a.m128, b.m128)); @@ -163,12 +97,7 @@ ccl_device_inline float3 operator/(const float3 &a, const float3 &b) # endif } -ccl_device_inline float3 operator+(const float3 &a, const float f) -{ - return a + make_float3(f, f, f); -} - -ccl_device_inline float3 operator+(const float3 &a, const float3 &b) +ccl_device_inline float3 operator+(const float3 a, const float3 b) { # ifdef __KERNEL_SSE__ return float3(_mm_add_ps(a.m128, b.m128)); @@ -177,12 +106,12 @@ ccl_device_inline float3 operator+(const float3 &a, const float3 &b) # endif } -ccl_device_inline float3 operator-(const float3 &a, const float f) +ccl_device_inline float3 operator+(const float3 a, const float f) { - return a - make_float3(f, f, f); + return a + make_float3(f, f, f); } -ccl_device_inline float3 operator-(const float3 &a, const float3 &b) +ccl_device_inline float3 operator-(const float3 a, const float3 b) { # ifdef __KERNEL_SSE__ return float3(_mm_sub_ps(a.m128, b.m128)); @@ -191,17 +120,22 @@ ccl_device_inline float3 operator-(const float3 &a, const float3 &b) # endif } -ccl_device_inline float3 operator+=(float3 &a, const float3 &b) +ccl_device_inline float3 operator-(const float3 a, const float f) +{ + return a - make_float3(f, f, f); +} + +ccl_device_inline float3 operator+=(float3 &a, const float3 b) { return a = a + b; } -ccl_device_inline float3 operator-=(float3 &a, const float3 &b) +ccl_device_inline float3 operator-=(float3 &a, const float3 b) { return a = a - b; } -ccl_device_inline float3 operator*=(float3 &a, const float3 &b) +ccl_device_inline float3 operator*=(float3 &a, const float3 b) { return a = a * b; } @@ -211,7 +145,7 @@ ccl_device_inline float3 operator*=(float3 &a, float f) return a = a * f; } -ccl_device_inline 
float3 operator/=(float3 &a, const float3 &b) +ccl_device_inline float3 operator/=(float3 &a, const float3 b) { return a = a / b; } @@ -223,7 +157,7 @@ ccl_device_inline float3 operator/=(float3 &a, float f) } # if !(defined(__KERNEL_METAL__) || defined(__KERNEL_CUDA__)) -ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 &b) +ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 b) { a = float3(a) * b; return a; @@ -235,7 +169,7 @@ ccl_device_inline packed_float3 operator*=(packed_float3 &a, float f) return a; } -ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 &b) +ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 b) { a = float3(a) / b; return a; @@ -248,7 +182,7 @@ ccl_device_inline packed_float3 operator/=(packed_float3 &a, float f) } # endif -ccl_device_inline bool operator==(const float3 &a, const float3 &b) +ccl_device_inline bool operator==(const float3 a, const float3 b) { # ifdef __KERNEL_SSE__ return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7; @@ -257,17 +191,12 @@ ccl_device_inline bool operator==(const float3 &a, const float3 &b) # endif } -ccl_device_inline bool operator!=(const float3 &a, const float3 &b) +ccl_device_inline bool operator!=(const float3 a, const float3 b) { return !(a == b); } -ccl_device_inline float distance(const float3 &a, const float3 &b) -{ - return len(a - b); -} - -ccl_device_inline float dot(const float3 &a, const float3 &b) +ccl_device_inline float dot(const float3 a, const float3 b) { # if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); @@ -276,26 +205,62 @@ ccl_device_inline float dot(const float3 &a, const float3 &b) # endif } -ccl_device_inline float dot_xy(const float3 &a, const float3 &b) +#endif + +ccl_device_inline float dot_xy(const float3 a, const float3 b) { -# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE41__) && 
defined(__KERNEL_SSE__) return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b)); -# else +#else return a.x * b.x + a.y * b.y; -# endif +#endif +} + +ccl_device_inline float len(const float3 a) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F))); +#else + return sqrtf(dot(a, a)); +#endif +} + +ccl_device_inline float reduce_min(float3 a) +{ + return min(min(a.x, a.y), a.z); +} + +ccl_device_inline float reduce_max(float3 a) +{ + return max(max(a.x, a.y), a.z); +} + +ccl_device_inline float len_squared(const float3 a) +{ + return dot(a, a); +} + +#ifndef __KERNEL_METAL__ + +ccl_device_inline float distance(const float3 a, const float3 b) +{ + return len(a - b); } -ccl_device_inline float3 cross(const float3 &a, const float3 &b) +ccl_device_inline float3 cross(const float3 a, const float3 b) { # ifdef __KERNEL_SSE__ - return float3(shuffle<1, 2, 0, 3>( - msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b)))); + const float4 x = float4(a.m128); + const float4 y = shuffle<1, 2, 0, 3>(float4(b.m128)); + const float4 z = float4(_mm_mul_ps(shuffle<1, 2, 0, 3>(float4(a.m128)), float4(b.m128))); + + return float3(shuffle<1, 2, 0, 3>(msub(x, y, z)).m128); # else return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); # endif } -ccl_device_inline float3 normalize(const float3 &a) +ccl_device_inline float3 normalize(const float3 a) { # if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F)); @@ -305,7 +270,7 @@ ccl_device_inline float3 normalize(const float3 &a) # endif } -ccl_device_inline float3 min(const float3 &a, const float3 &b) +ccl_device_inline float3 min(const float3 a, const float3 b) { # ifdef __KERNEL_SSE__ return float3(_mm_min_ps(a.m128, b.m128)); @@ -314,7 +279,7 @@ ccl_device_inline float3 min(const float3 &a, const float3 &b) # endif } -ccl_device_inline float3 
max(const float3 &a, const float3 &b) +ccl_device_inline float3 max(const float3 a, const float3 b) { # ifdef __KERNEL_SSE__ return float3(_mm_max_ps(a.m128, b.m128)); @@ -323,12 +288,12 @@ ccl_device_inline float3 max(const float3 &a, const float3 &b) # endif } -ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx) +ccl_device_inline float3 clamp(const float3 a, const float3 mn, const float3 mx) { return min(max(a, mn), mx); } -ccl_device_inline float3 fabs(const float3 &a) +ccl_device_inline float3 fabs(const float3 a) { # ifdef __KERNEL_SSE__ # ifdef __KERNEL_NEON__ @@ -342,7 +307,7 @@ ccl_device_inline float3 fabs(const float3 &a) # endif } -ccl_device_inline float3 sqrt(const float3 &a) +ccl_device_inline float3 sqrt(const float3 a) { # ifdef __KERNEL_SSE__ return float3(_mm_sqrt_ps(a)); @@ -351,7 +316,7 @@ ccl_device_inline float3 sqrt(const float3 &a) # endif } -ccl_device_inline float3 floor(const float3 &a) +ccl_device_inline float3 floor(const float3 a) { # ifdef __KERNEL_SSE__ return float3(_mm_floor_ps(a)); @@ -360,7 +325,7 @@ ccl_device_inline float3 floor(const float3 &a) # endif } -ccl_device_inline float3 ceil(const float3 &a) +ccl_device_inline float3 ceil(const float3 a) { # ifdef __KERNEL_SSE__ return float3(_mm_ceil_ps(a)); @@ -369,12 +334,12 @@ ccl_device_inline float3 ceil(const float3 &a) # endif } -ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t) +ccl_device_inline float3 mix(const float3 a, const float3 b, float t) { return a + t * (b - a); } -ccl_device_inline float3 rcp(const float3 &a) +ccl_device_inline float3 rcp(const float3 a) { # ifdef __KERNEL_SSE__ /* Don't use _mm_rcp_ps due to poor precision. 
*/ @@ -399,33 +364,6 @@ ccl_device_inline float3 log(float3 v) return make_float3(logf(v.x), logf(v.y), logf(v.z)); } -#endif /* !__KERNEL_METAL__ */ - -ccl_device_inline float reduce_min(float3 a) -{ - return min(min(a.x, a.y), a.z); -} - -ccl_device_inline float reduce_max(float3 a) -{ - return max(max(a.x, a.y), a.z); -} - -ccl_device_inline float len(const float3 a) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F))); -#else - return sqrtf(dot(a, a)); -#endif -} - -ccl_device_inline float len_squared(const float3 a) -{ - return dot(a, a); -} - -#if !defined(__KERNEL_METAL__) ccl_device_inline float3 reflect(const float3 incident, const float3 normal) { float3 unit_normal = normalize(normal); diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h index c2721873037..301d2d789c0 100644 --- a/intern/cycles/util/math_float4.h +++ b/intern/cycles/util/math_float4.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2013 Intel Corporation * Copyright 2011-2022 Blender Foundation */ #ifndef __UTIL_MATH_FLOAT4_H__ @@ -10,85 +11,6 @@ CCL_NAMESPACE_BEGIN -/******************************************************************************* - * Declaration. 
- */ - -#if !defined(__KERNEL_METAL__) -ccl_device_inline float4 operator-(const float4 &a); -ccl_device_inline float4 operator*(const float4 &a, const float4 &b); -ccl_device_inline float4 operator*(const float4 &a, float f); -ccl_device_inline float4 operator*(float f, const float4 &a); -ccl_device_inline float4 operator/(const float4 &a, float f); -ccl_device_inline float4 operator/(const float4 &a, const float4 &b); -ccl_device_inline float4 operator+(const float4 &a, const float f); -ccl_device_inline float4 operator+(const float4 &a, const float4 &b); -ccl_device_inline float4 operator-(const float4 &a, const float f); -ccl_device_inline float4 operator-(const float4 &a, const float4 &b); -ccl_device_inline float4 operator+=(float4 &a, const float4 &b); -ccl_device_inline float4 operator*=(float4 &a, const float4 &b); -ccl_device_inline float4 operator*=(float4 &a, float f); -ccl_device_inline float4 operator/=(float4 &a, float f); - -ccl_device_inline int4 operator<(const float4 &a, const float4 &b); -ccl_device_inline int4 operator>=(const float4 &a, const float4 &b); -ccl_device_inline int4 operator<=(const float4 &a, const float4 &b); -ccl_device_inline bool operator==(const float4 &a, const float4 &b); - -ccl_device_inline float distance(const float4 &a, const float4 &b); -ccl_device_inline float dot(const float4 &a, const float4 &b); -ccl_device_inline float len_squared(const float4 &a); -ccl_device_inline float4 rcp(const float4 &a); -ccl_device_inline float4 sqrt(const float4 &a); -ccl_device_inline float4 sqr(const float4 &a); -ccl_device_inline float4 cross(const float4 &a, const float4 &b); -ccl_device_inline bool is_zero(const float4 &a); -ccl_device_inline float average(const float4 &a); -ccl_device_inline float len(const float4 &a); -ccl_device_inline float4 normalize(const float4 &a); -ccl_device_inline float4 safe_normalize(const float4 &a); -ccl_device_inline float4 min(const float4 &a, const float4 &b); -ccl_device_inline float4 max(const 
float4 &a, const float4 &b); -ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx); -ccl_device_inline float4 fabs(const float4 &a); -ccl_device_inline float4 floor(const float4 &a); -ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t); -#endif /* !__KERNEL_METAL__*/ - -ccl_device_inline float4 safe_divide(const float4 a, const float4 b); -ccl_device_inline float4 safe_divide(const float4 a, const float b); - -#ifdef __KERNEL_SSE__ -template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> -__forceinline const float4 shuffle(const float4 &b); -template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> -__forceinline const float4 shuffle(const float4 &a, const float4 &b); - -template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b); - -template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &a, const float4 &b); -template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 &a, const float4 &b); - -# ifdef __KERNEL_SSE3__ -template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b); -template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b); -# endif -#endif /* __KERNEL_SSE__ */ - -ccl_device_inline float reduce_min(const float4 a); -ccl_device_inline float reduce_max(const float4 a); -ccl_device_inline float reduce_add(const float4 a); - -ccl_device_inline bool isequal(const float4 a, const float4 b); - -#ifndef __KERNEL_GPU__ -ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b); -#endif /* !__KERNEL_GPU__ */ - -/******************************************************************************* - * Definition. 
- */ - ccl_device_inline float4 zero_float4() { #ifdef __KERNEL_SSE__ @@ -103,6 +25,16 @@ ccl_device_inline float4 one_float4() return make_float4(1.0f, 1.0f, 1.0f, 1.0f); } +ccl_device_inline int4 cast(const float4 a) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_castps_si128(a)); +#else + return make_int4( + __float_as_int(a.x), __float_as_int(a.y), __float_as_int(a.z), __float_as_int(a.w)); +#endif +} + #if !defined(__KERNEL_METAL__) ccl_device_inline float4 operator-(const float4 &a) { @@ -114,7 +46,7 @@ ccl_device_inline float4 operator-(const float4 &a) # endif } -ccl_device_inline float4 operator*(const float4 &a, const float4 &b) +ccl_device_inline float4 operator*(const float4 a, const float4 b) { # ifdef __KERNEL_SSE__ return float4(_mm_mul_ps(a.m128, b.m128)); @@ -123,7 +55,7 @@ ccl_device_inline float4 operator*(const float4 &a, const float4 &b) # endif } -ccl_device_inline float4 operator*(const float4 &a, float f) +ccl_device_inline float4 operator*(const float4 a, float f) { # if defined(__KERNEL_SSE__) return a * make_float4(f); @@ -132,17 +64,17 @@ ccl_device_inline float4 operator*(const float4 &a, float f) # endif } -ccl_device_inline float4 operator*(float f, const float4 &a) +ccl_device_inline float4 operator*(float f, const float4 a) { return a * f; } -ccl_device_inline float4 operator/(const float4 &a, float f) +ccl_device_inline float4 operator/(const float4 a, float f) { return a * (1.0f / f); } -ccl_device_inline float4 operator/(const float4 &a, const float4 &b) +ccl_device_inline float4 operator/(const float4 a, const float4 b) { # ifdef __KERNEL_SSE__ return float4(_mm_div_ps(a.m128, b.m128)); @@ -151,12 +83,7 @@ ccl_device_inline float4 operator/(const float4 &a, const float4 &b) # endif } -ccl_device_inline float4 operator+(const float4 &a, const float f) -{ - return a + make_float4(f, f, f, f); -} - -ccl_device_inline float4 operator+(const float4 &a, const float4 &b) +ccl_device_inline float4 operator+(const float4 a, const float4 b) 
{ # ifdef __KERNEL_SSE__ return float4(_mm_add_ps(a.m128, b.m128)); @@ -165,12 +92,12 @@ ccl_device_inline float4 operator+(const float4 &a, const float4 &b) # endif } -ccl_device_inline float4 operator-(const float4 &a, const float f) +ccl_device_inline float4 operator+(const float4 a, const float f) { - return a - make_float4(f, f, f, f); + return a + make_float4(f); } -ccl_device_inline float4 operator-(const float4 &a, const float4 &b) +ccl_device_inline float4 operator-(const float4 a, const float4 b) { # ifdef __KERNEL_SSE__ return float4(_mm_sub_ps(a.m128, b.m128)); @@ -179,17 +106,22 @@ ccl_device_inline float4 operator-(const float4 &a, const float4 &b) # endif } -ccl_device_inline float4 operator+=(float4 &a, const float4 &b) +ccl_device_inline float4 operator-(const float4 a, const float f) +{ + return a - make_float4(f); +} + +ccl_device_inline float4 operator+=(float4 &a, const float4 b) { return a = a + b; } -ccl_device_inline float4 operator-=(float4 &a, const float4 &b) +ccl_device_inline float4 operator-=(float4 &a, const float4 b) { return a = a - b; } -ccl_device_inline float4 operator*=(float4 &a, const float4 &b) +ccl_device_inline float4 operator*=(float4 &a, const float4 b) { return a = a * b; } @@ -204,7 +136,7 @@ ccl_device_inline float4 operator/=(float4 &a, float f) return a = a / f; } -ccl_device_inline int4 operator<(const float4 &a, const float4 &b) +ccl_device_inline int4 operator<(const float4 a, const float4 b) { # ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128))); @@ -213,7 +145,7 @@ ccl_device_inline int4 operator<(const float4 &a, const float4 &b) # endif } -ccl_device_inline int4 operator>=(const float4 &a, const float4 &b) +ccl_device_inline int4 operator>=(const float4 a, const float4 b) { # ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128))); @@ -222,7 +154,7 @@ ccl_device_inline int4 operator>=(const float4 &a, const float4 &b) # endif } -ccl_device_inline int4 
operator<=(const float4 &a, const float4 &b) +ccl_device_inline int4 operator<=(const float4 a, const float4 b) { # ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128))); @@ -231,7 +163,7 @@ ccl_device_inline int4 operator<=(const float4 &a, const float4 &b) # endif } -ccl_device_inline bool operator==(const float4 &a, const float4 &b) +ccl_device_inline bool operator==(const float4 a, const float4 b) { # ifdef __KERNEL_SSE__ return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15; @@ -240,160 +172,148 @@ ccl_device_inline bool operator==(const float4 &a, const float4 &b) # endif } -ccl_device_inline float distance(const float4 &a, const float4 &b) -{ - return len(a - b); -} - -ccl_device_inline float dot(const float4 &a, const float4 &b) +ccl_device_inline const float4 operator^(const float4 a, const float4 b) { -# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) - __m128 t = vmulq_f32(a, b); - return vaddvq_f32(t); -# else - return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); -# endif +# ifdef __KERNEL_SSE__ + return float4(_mm_xor_ps(a.m128, b.m128)); # else - return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w); + return make_float4(__uint_as_float(__float_as_uint(a.x) ^ __float_as_uint(b.x)), + __uint_as_float(__float_as_uint(a.y) ^ __float_as_uint(b.y)), + __uint_as_float(__float_as_uint(a.z) ^ __float_as_uint(b.z)), + __uint_as_float(__float_as_uint(a.w) ^ __float_as_uint(b.w))); # endif } -ccl_device_inline float len_squared(const float4 &a) -{ - return dot(a, a); -} - -ccl_device_inline float4 rcp(const float4 &a) +ccl_device_inline float4 min(const float4 a, const float4 b) { # ifdef __KERNEL_SSE__ - /* Don't use _mm_rcp_ps due to poor precision. 
*/ - return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); + return float4(_mm_min_ps(a.m128, b.m128)); # else - return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w); + return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); # endif } -ccl_device_inline float4 sqrt(const float4 &a) +ccl_device_inline float4 max(const float4 a, const float4 b) { # ifdef __KERNEL_SSE__ - return float4(_mm_sqrt_ps(a.m128)); + return float4(_mm_max_ps(a.m128, b.m128)); # else - return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w)); + return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); # endif } -ccl_device_inline float4 sqr(const float4 &a) +ccl_device_inline float4 clamp(const float4 a, const float4 mn, const float4 mx) { - return a * a; + return min(max(a, mn), mx); } +#endif /* !__KERNEL_METAL__*/ -ccl_device_inline float4 cross(const float4 &a, const float4 &b) +ccl_device_inline const float4 madd(const float4 a, const float4 b, const float4 c) { -# ifdef __KERNEL_SSE__ - return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) - - (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b)); +#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_NEON__ + return float4(vfmaq_f32(c, a, b)); +# elif defined(__KERNEL_AVX2__) + return float4(_mm_fmadd_ps(a, b, c)); # else - return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f); + return a * b + c; # endif +#else + return a * b + c; +#endif } -ccl_device_inline bool is_zero(const float4 &a) +ccl_device_inline float4 msub(const float4 a, const float4 b, const float4 c) { -# ifdef __KERNEL_SSE__ - return a == zero_float4(); +#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_NEON__ + return float4(vfmaq_f32(vnegq_f32(c), a, b)); +# elif defined(__KERNEL_AVX2__) + return float4(_mm_fmsub_ps(a, b, c)); # else - return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f); + return a * b - c; # endif +#else + return a * b - c; +#endif } -ccl_device_inline 
float average(const float4 &a) +#ifdef __KERNEL_SSE__ +template<size_t i0, size_t i1, size_t i2, size_t i3> +__forceinline const float4 shuffle(const float4 b) { - return reduce_add(a) * 0.25f; +# ifdef __KERNEL_NEON__ + return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128)); +# else + return float4( + _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)))); +# endif } -ccl_device_inline float len(const float4 &a) +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a) { - return sqrtf(dot(a, a)); + return float4(_mm_movelh_ps(a, a)); } -ccl_device_inline float4 normalize(const float4 &a) +template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a) { - return a / len(a); + return float4(_mm_movehl_ps(a, a)); } -ccl_device_inline float4 safe_normalize(const float4 &a) +# ifdef __KERNEL_SSE3__ +template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 b) { - float t = len(a); - return (t != 0.0f) ? a / t : a; + return float4(_mm_moveldup_ps(b)); } -ccl_device_inline float4 min(const float4 &a, const float4 &b) +template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 b) { -# ifdef __KERNEL_SSE__ - return float4(_mm_min_ps(a.m128, b.m128)); -# else - return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); -# endif + return float4(_mm_movehdup_ps(b)); } +# endif /* __KERNEL_SSE3__ */ -ccl_device_inline float4 max(const float4 &a, const float4 &b) +template<size_t i0, size_t i1, size_t i2, size_t i3> +__forceinline const float4 shuffle(const float4 a, const float4 b) { -# ifdef __KERNEL_SSE__ - return float4(_mm_max_ps(a.m128, b.m128)); +# ifdef __KERNEL_NEON__ + return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b)); # else - return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); + return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0))); # endif } -ccl_device_inline float4 clamp(const float4 &a, const 
float4 &mn, const float4 &mx) +template<size_t i0> __forceinline const float4 shuffle(const float4 b) { - return min(max(a, mn), mx); + return shuffle<i0, i0, i0, i0>(b); } - -ccl_device_inline float4 fabs(const float4 &a) +template<size_t i0> __forceinline const float4 shuffle(const float4 a, const float4 b) { -# if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) - return float4(vabsq_f32(a)); -# else - return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); -# endif -# else - return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); -# endif -} - -ccl_device_inline float4 floor(const float4 &a) -{ -# ifdef __KERNEL_SSE__ - return float4(_mm_floor_ps(a)); +# ifdef __KERNEL_NEON__ + return float4(shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b)); # else - return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w)); + return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0))); # endif } -ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t) +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a, const float4 b) { - return a + t * (b - a); + return float4(_mm_movelh_ps(a, b)); } -ccl_device_inline float4 saturate(const float4 &a) +template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a, const float4 b) { - return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w)); + return float4(_mm_movehl_ps(b, a)); } -ccl_device_inline float4 exp(float4 v) +template<size_t i> __forceinline float extract(const float4 a) { - return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z)); + return _mm_cvtss_f32(shuffle<i, i, i, i>(a)); } - -ccl_device_inline float4 log(float4 v) +template<> __forceinline float extract<0>(const float4 a) { - return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z)); + return _mm_cvtss_f32(a); } - -#endif /* !__KERNEL_METAL__*/ +#endif ccl_device_inline float reduce_add(const float4 a) { @@ -440,77 +360,192 @@ 
ccl_device_inline float reduce_max(const float4 a) #endif } -ccl_device_inline bool isequal(const float4 a, const float4 b) +#if !defined(__KERNEL_METAL__) +ccl_device_inline float dot(const float4 a, const float4 b) { -#if defined(__KERNEL_METAL__) - return all(a == b); -#else - return a == b; -#endif +# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) + __m128 t = vmulq_f32(a, b); + return vaddvq_f32(t); +# else + return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); +# endif +# else + return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w); +# endif } +#endif /* !defined(__KERNEL_METAL__) */ -#ifdef __KERNEL_SSE__ -template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> -__forceinline const float4 shuffle(const float4 &b) +ccl_device_inline float len(const float4 a) { -# if defined(__KERNEL_NEON__) - return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128)); + return sqrtf(dot(a, a)); +} + +ccl_device_inline float len_squared(const float4 a) +{ + return dot(a, a); +} + +#if !defined(__KERNEL_METAL__) +ccl_device_inline float distance(const float4 a, const float4 b) +{ + return len(a - b); +} + +ccl_device_inline float4 rcp(const float4 a) +{ +# ifdef __KERNEL_SSE__ + /* Don't use _mm_rcp_ps due to poor precision. 
*/ + return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); # else - return float4(_mm_castsi128_ps( - _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0)))); + return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w); # endif } -template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> -__forceinline const float4 shuffle(const float4 &a, const float4 &b) +ccl_device_inline float4 sqrt(const float4 a) { -# if defined(__KERNEL_NEON__) - return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(a.m128, b.m128)); +# ifdef __KERNEL_SSE__ + return float4(_mm_sqrt_ps(a.m128)); # else - return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0))); + return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w)); # endif } -template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b) +ccl_device_inline float4 sqr(const float4 a) { - return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)))); + return a * a; } -template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &a, const float4 &b) +ccl_device_inline float4 cross(const float4 a, const float4 b) { - return float4(_mm_movelh_ps(a.m128, b.m128)); +# ifdef __KERNEL_SSE__ + return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) - + (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b)); +# else + return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f); +# endif } -template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 &a, const float4 &b) +ccl_device_inline bool is_zero(const float4 a) { - return float4(_mm_movehl_ps(b.m128, a.m128)); +# ifdef __KERNEL_SSE__ + return a == zero_float4(); +# else + return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f); +# endif } -# ifdef __KERNEL_SSE3__ -template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b) +ccl_device_inline float average(const float4 a) { - return 
float4(_mm_moveldup_ps(b)); + return reduce_add(a) * 0.25f; } -template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b) +ccl_device_inline float4 normalize(const float4 a) { - return float4(_mm_movehdup_ps(b)); + return a / len(a); +} + +ccl_device_inline float4 safe_normalize(const float4 a) +{ + float t = len(a); + return (t != 0.0f) ? a / t : a; +} + +ccl_device_inline float4 fabs(const float4 a) +{ +# if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) + return float4(vabsq_f32(a)); +# else + return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); +# endif +# else + return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); +# endif +} + +ccl_device_inline float4 floor(const float4 a) +{ +# ifdef __KERNEL_SSE__ +# if defined(__KERNEL_NEON__) + return float4(vrndmq_f32(a)); +# else + return float4(_mm_floor_ps(a)); +# endif +# else + return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w)); +# endif +} + +ccl_device_inline float4 floorfrac(const float4 x, ccl_private int4 *i) +{ +# ifdef __KERNEL_SSE__ + const float4 f = floor(x); + *i = int4(_mm_cvttps_epi32(f.m128)); + return x - f; +# else + float4 r; + r.x = floorfrac(x.x, &i->x); + r.y = floorfrac(x.y, &i->y); + r.z = floorfrac(x.z, &i->z); + r.w = floorfrac(x.w, &i->w); + return r; +# endif +} + +ccl_device_inline float4 mix(const float4 a, const float4 b, float t) +{ + return a + t * (b - a); +} + +ccl_device_inline float4 mix(const float4 a, const float4 b, const float4 t) +{ + return a + t * (b - a); +} + +ccl_device_inline float4 saturate(const float4 a) +{ + return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w)); +} + +/* Component-wise exponential: applies expf() to each of x, y, z and w. */ +ccl_device_inline float4 exp(float4 v) +{ + return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.w)); +} + +/* Component-wise natural logarithm: applies logf() to each of x, y, z and w. */ +ccl_device_inline float4 log(float4 v) +{ + return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.w)); +} + +#endif /* !__KERNEL_METAL__*/ + +ccl_device_inline bool
isequal(const float4 a, const float4 b) +{ +#if defined(__KERNEL_METAL__) + return all(a == b); +#else + return a == b; +#endif } -# endif /* __KERNEL_SSE3__ */ -#endif /* __KERNEL_SSE__ */ #ifndef __KERNEL_GPU__ -ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b) +ccl_device_inline float4 select(const int4 mask, const float4 a, const float4 b) { # ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE41__ return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128))); +# else + return float4( + _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), a), _mm_andnot_ps(_mm_castsi128_ps(mask), b))); +# endif # else return make_float4( (mask.x) ? a.x : b.x, (mask.y) ? a.y : b.y, (mask.z) ? a.z : b.z, (mask.w) ? a.w : b.w); # endif } -ccl_device_inline float4 mask(const int4 &mask, const float4 &a) +ccl_device_inline float4 mask(const int4 mask, const float4 a) { /* Replace elements of x with zero where mask isn't set. */ return select(mask, a, zero_float4()); diff --git a/intern/cycles/util/math_float8.h b/intern/cycles/util/math_float8.h index b538cfbe70b..755a720a10b 100644 --- a/intern/cycles/util/math_float8.h +++ b/intern/cycles/util/math_float8.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2013 Intel Corporation * Copyright 2022 Blender Foundation */ #ifndef __UTIL_MATH_FLOAT8_H__ @@ -10,193 +11,138 @@ CCL_NAMESPACE_BEGIN -/******************************************************************************* - * Declaration. 
- */ - -ccl_device_inline float8_t operator+(const float8_t a, const float8_t b); -ccl_device_inline float8_t operator+(const float8_t a, const float f); -ccl_device_inline float8_t operator+(const float f, const float8_t a); - -ccl_device_inline float8_t operator-(const float8_t a); -ccl_device_inline float8_t operator-(const float8_t a, const float8_t b); -ccl_device_inline float8_t operator-(const float8_t a, const float f); -ccl_device_inline float8_t operator-(const float f, const float8_t a); - -ccl_device_inline float8_t operator*(const float8_t a, const float8_t b); -ccl_device_inline float8_t operator*(const float8_t a, const float f); -ccl_device_inline float8_t operator*(const float f, const float8_t a); - -ccl_device_inline float8_t operator/(const float8_t a, const float8_t b); -ccl_device_inline float8_t operator/(const float8_t a, float f); -ccl_device_inline float8_t operator/(const float f, const float8_t a); - -ccl_device_inline float8_t operator+=(float8_t a, const float8_t b); - -ccl_device_inline float8_t operator*=(float8_t a, const float8_t b); -ccl_device_inline float8_t operator*=(float8_t a, float f); - -ccl_device_inline float8_t operator/=(float8_t a, float f); - -ccl_device_inline bool operator==(const float8_t a, const float8_t b); - -ccl_device_inline float8_t rcp(const float8_t a); -ccl_device_inline float8_t sqrt(const float8_t a); -ccl_device_inline float8_t sqr(const float8_t a); -ccl_device_inline bool is_zero(const float8_t a); -ccl_device_inline float average(const float8_t a); -ccl_device_inline float8_t min(const float8_t a, const float8_t b); -ccl_device_inline float8_t max(const float8_t a, const float8_t b); -ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx); -ccl_device_inline float8_t fabs(const float8_t a); -ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t); -ccl_device_inline float8_t saturate(const float8_t a); - -ccl_device_inline float8_t 
safe_divide(const float8_t a, const float b); -ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b); - -ccl_device_inline float reduce_min(const float8_t a); -ccl_device_inline float reduce_max(const float8_t a); -ccl_device_inline float reduce_add(const float8_t a); - -ccl_device_inline bool isequal(const float8_t a, const float8_t b); - -/******************************************************************************* - * Definition. - */ - -ccl_device_inline float8_t zero_float8_t() +ccl_device_inline vfloat8 zero_vfloat8() { -#ifdef __KERNEL_AVX2__ - return float8_t(_mm256_setzero_ps()); +#ifdef __KERNEL_AVX__ + return vfloat8(_mm256_setzero_ps()); #else - return make_float8_t(0.0f); + return make_vfloat8(0.0f); #endif } -ccl_device_inline float8_t one_float8_t() +ccl_device_inline vfloat8 one_vfloat8() { - return make_float8_t(1.0f); + return make_vfloat8(1.0f); } -ccl_device_inline float8_t operator+(const float8_t a, const float8_t b) +ccl_device_inline vfloat8 operator+(const vfloat8 a, const vfloat8 b) { -#ifdef __KERNEL_AVX2__ - return float8_t(_mm256_add_ps(a.m256, b.m256)); +#ifdef __KERNEL_AVX__ + return vfloat8(_mm256_add_ps(a.m256, b.m256)); #else - return make_float8_t( + return make_vfloat8( a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h); #endif } -ccl_device_inline float8_t operator+(const float8_t a, const float f) +ccl_device_inline vfloat8 operator+(const vfloat8 a, const float f) { - return a + make_float8_t(f); + return a + make_vfloat8(f); } -ccl_device_inline float8_t operator+(const float f, const float8_t a) +ccl_device_inline vfloat8 operator+(const float f, const vfloat8 a) { - return make_float8_t(f) + a; + return make_vfloat8(f) + a; } -ccl_device_inline float8_t operator-(const float8_t a) +ccl_device_inline vfloat8 operator-(const vfloat8 a) { -#ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX__ __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - return 
float8_t(_mm256_xor_ps(a.m256, mask)); + return vfloat8(_mm256_xor_ps(a.m256, mask)); #else - return make_float8_t(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h); + return make_vfloat8(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h); #endif } -ccl_device_inline float8_t operator-(const float8_t a, const float8_t b) +ccl_device_inline vfloat8 operator-(const vfloat8 a, const vfloat8 b) { -#ifdef __KERNEL_AVX2__ - return float8_t(_mm256_sub_ps(a.m256, b.m256)); +#ifdef __KERNEL_AVX__ + return vfloat8(_mm256_sub_ps(a.m256, b.m256)); #else - return make_float8_t( + return make_vfloat8( a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h); #endif } -ccl_device_inline float8_t operator-(const float8_t a, const float f) +ccl_device_inline vfloat8 operator-(const vfloat8 a, const float f) { - return a - make_float8_t(f); + return a - make_vfloat8(f); } -ccl_device_inline float8_t operator-(const float f, const float8_t a) +ccl_device_inline vfloat8 operator-(const float f, const vfloat8 a) { - return make_float8_t(f) - a; + return make_vfloat8(f) - a; } -ccl_device_inline float8_t operator*(const float8_t a, const float8_t b) +ccl_device_inline vfloat8 operator*(const vfloat8 a, const vfloat8 b) { -#ifdef __KERNEL_AVX2__ - return float8_t(_mm256_mul_ps(a.m256, b.m256)); +#ifdef __KERNEL_AVX__ + return vfloat8(_mm256_mul_ps(a.m256, b.m256)); #else - return make_float8_t( + return make_vfloat8( a.a * b.a, a.b * b.b, a.c * b.c, a.d * b.d, a.e * b.e, a.f * b.f, a.g * b.g, a.h * b.h); #endif } -ccl_device_inline float8_t operator*(const float8_t a, const float f) +ccl_device_inline vfloat8 operator*(const vfloat8 a, const float f) { - return a * make_float8_t(f); + return a * make_vfloat8(f); } -ccl_device_inline float8_t operator*(const float f, const float8_t a) +ccl_device_inline vfloat8 operator*(const float f, const vfloat8 a) { - return make_float8_t(f) * a; + return make_vfloat8(f) * a; } -ccl_device_inline float8_t operator/(const 
float8_t a, const float8_t b) +ccl_device_inline vfloat8 operator/(const vfloat8 a, const vfloat8 b) { -#ifdef __KERNEL_AVX2__ - return float8_t(_mm256_div_ps(a.m256, b.m256)); +#ifdef __KERNEL_AVX__ + return vfloat8(_mm256_div_ps(a.m256, b.m256)); #else - return make_float8_t( + return make_vfloat8( a.a / b.a, a.b / b.b, a.c / b.c, a.d / b.d, a.e / b.e, a.f / b.f, a.g / b.g, a.h / b.h); #endif } -ccl_device_inline float8_t operator/(const float8_t a, const float f) +ccl_device_inline vfloat8 operator/(const vfloat8 a, const float f) { - return a / make_float8_t(f); + return a / make_vfloat8(f); } -ccl_device_inline float8_t operator/(const float f, const float8_t a) +ccl_device_inline vfloat8 operator/(const float f, const vfloat8 a) { - return make_float8_t(f) / a; + return make_vfloat8(f) / a; } -ccl_device_inline float8_t operator+=(float8_t a, const float8_t b) +/* Compound assignments take the left operand by reference so the caller's value is updated. */ +ccl_device_inline vfloat8 operator+=(ccl_private vfloat8 &a, const vfloat8 b) { return a = a + b; } -ccl_device_inline float8_t operator-=(float8_t a, const float8_t b) +ccl_device_inline vfloat8 operator-=(ccl_private vfloat8 &a, const vfloat8 b) { return a = a - b; } -ccl_device_inline float8_t operator*=(float8_t a, const float8_t b) +ccl_device_inline vfloat8 operator*=(ccl_private vfloat8 &a, const vfloat8 b) { return a = a * b; } -ccl_device_inline float8_t operator*=(float8_t a, float f) +ccl_device_inline vfloat8 operator*=(ccl_private vfloat8 &a, float f) { return a = a * f; } -ccl_device_inline float8_t operator/=(float8_t a, float f) +ccl_device_inline vfloat8 operator/=(ccl_private vfloat8 &a, float f) { return a = a / f; } -ccl_device_inline bool operator==(const float8_t a, const float8_t b) +ccl_device_inline bool operator==(const vfloat8 a, const vfloat8 b) { -#ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX__ return (_mm256_movemask_ps(_mm256_castsi256_ps( _mm256_cmpeq_epi32(_mm256_castps_si256(a.m256), _mm256_castps_si256(b.m256)))) & 0b11111111) == 0b11111111; @@ -206,132 +152,180 @@ ccl_device_inline bool operator==(const float8_t a, const float8_t b)
#endif } -ccl_device_inline float8_t rcp(const float8_t a) +ccl_device_inline const vfloat8 operator^(const vfloat8 a, const vfloat8 b) { -#ifdef __KERNEL_AVX2__ - return float8_t(_mm256_rcp_ps(a.m256)); +#ifdef __KERNEL_AVX__ + return vfloat8(_mm256_xor_ps(a.m256, b.m256)); #else - return make_float8_t(1.0f / a.a, - 1.0f / a.b, - 1.0f / a.c, - 1.0f / a.d, - 1.0f / a.e, - 1.0f / a.f, - 1.0f / a.g, - 1.0f / a.h); + return make_vfloat8(__uint_as_float(__float_as_uint(a.a) ^ __float_as_uint(b.a)), + __uint_as_float(__float_as_uint(a.b) ^ __float_as_uint(b.b)), + __uint_as_float(__float_as_uint(a.c) ^ __float_as_uint(b.c)), + __uint_as_float(__float_as_uint(a.d) ^ __float_as_uint(b.d)), + __uint_as_float(__float_as_uint(a.e) ^ __float_as_uint(b.e)), + __uint_as_float(__float_as_uint(a.f) ^ __float_as_uint(b.f)), + __uint_as_float(__float_as_uint(a.g) ^ __float_as_uint(b.g)), + __uint_as_float(__float_as_uint(a.h) ^ __float_as_uint(b.h))); #endif } -ccl_device_inline float8_t sqrt(const float8_t a) +ccl_device_inline vfloat8 rcp(const vfloat8 a) { -#ifdef __KERNEL_AVX2__ - return float8_t(_mm256_sqrt_ps(a.m256)); +#ifdef __KERNEL_AVX__ + return vfloat8(_mm256_rcp_ps(a.m256)); #else - return make_float8_t(sqrtf(a.a), - sqrtf(a.b), - sqrtf(a.c), - sqrtf(a.d), - sqrtf(a.e), - sqrtf(a.f), - sqrtf(a.g), - sqrtf(a.h)); + return make_vfloat8(1.0f / a.a, + 1.0f / a.b, + 1.0f / a.c, + 1.0f / a.d, + 1.0f / a.e, + 1.0f / a.f, + 1.0f / a.g, + 1.0f / a.h); #endif } -ccl_device_inline float8_t sqr(const float8_t a) +ccl_device_inline vfloat8 sqrt(const vfloat8 a) +{ +#ifdef __KERNEL_AVX__ + return vfloat8(_mm256_sqrt_ps(a.m256)); +#else + return make_vfloat8(sqrtf(a.a), + sqrtf(a.b), + sqrtf(a.c), + sqrtf(a.d), + sqrtf(a.e), + sqrtf(a.f), + sqrtf(a.g), + sqrtf(a.h)); +#endif +} + +ccl_device_inline vfloat8 sqr(const vfloat8 a) { return a * a; } -ccl_device_inline bool is_zero(const float8_t a) +ccl_device_inline bool is_zero(const vfloat8 a) { - return a == make_float8_t(0.0f); + 
return a == make_vfloat8(0.0f); } -ccl_device_inline float average(const float8_t a) +ccl_device_inline float reduce_add(const vfloat8 a) +{ +#ifdef __KERNEL_AVX__ + vfloat8 b(_mm256_hadd_ps(a.m256, a.m256)); + vfloat8 h(_mm256_hadd_ps(b.m256, b.m256)); + return h[0] + h[4]; +#else + return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h; +#endif +} + +ccl_device_inline float average(const vfloat8 a) { return reduce_add(a) / 8.0f; } -ccl_device_inline float8_t min(const float8_t a, const float8_t b) +ccl_device_inline vfloat8 min(const vfloat8 a, const vfloat8 b) { -#ifdef __KERNEL_AVX2__ - return float8_t(_mm256_min_ps(a.m256, b.m256)); +#ifdef __KERNEL_AVX__ + return vfloat8(_mm256_min_ps(a.m256, b.m256)); #else - return make_float8_t(min(a.a, b.a), - min(a.b, b.b), - min(a.c, b.c), - min(a.d, b.d), - min(a.e, b.e), - min(a.f, b.f), - min(a.g, b.g), - min(a.h, b.h)); + return make_vfloat8(min(a.a, b.a), + min(a.b, b.b), + min(a.c, b.c), + min(a.d, b.d), + min(a.e, b.e), + min(a.f, b.f), + min(a.g, b.g), + min(a.h, b.h)); #endif } -ccl_device_inline float8_t max(const float8_t a, const float8_t b) +ccl_device_inline vfloat8 max(const vfloat8 a, const vfloat8 b) { -#ifdef __KERNEL_AVX2__ - return float8_t(_mm256_max_ps(a.m256, b.m256)); +#ifdef __KERNEL_AVX__ + return vfloat8(_mm256_max_ps(a.m256, b.m256)); #else - return make_float8_t(max(a.a, b.a), - max(a.b, b.b), - max(a.c, b.c), - max(a.d, b.d), - max(a.e, b.e), - max(a.f, b.f), - max(a.g, b.g), - max(a.h, b.h)); + return make_vfloat8(max(a.a, b.a), + max(a.b, b.b), + max(a.c, b.c), + max(a.d, b.d), + max(a.e, b.e), + max(a.f, b.f), + max(a.g, b.g), + max(a.h, b.h)); #endif } -ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx) +ccl_device_inline vfloat8 clamp(const vfloat8 a, const vfloat8 mn, const vfloat8 mx) { return min(max(a, mn), mx); } -ccl_device_inline float8_t fabs(const float8_t a) +ccl_device_inline vfloat8 select(const vint8 mask, const vfloat8 a, const vfloat8 
b) { -#ifdef __KERNEL_AVX2__ - return float8_t(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); +#ifdef __KERNEL_AVX__ + return vfloat8(_mm256_blendv_ps(b, a, _mm256_castsi256_ps(mask))); #else - return make_float8_t(fabsf(a.a), - fabsf(a.b), - fabsf(a.c), - fabsf(a.d), - fabsf(a.e), - fabsf(a.f), - fabsf(a.g), - fabsf(a.h)); + return make_vfloat8((mask.a) ? a.a : b.a, + (mask.b) ? a.b : b.b, + (mask.c) ? a.c : b.c, + (mask.d) ? a.d : b.d, + (mask.e) ? a.e : b.e, + (mask.f) ? a.f : b.f, + (mask.g) ? a.g : b.g, + (mask.h) ? a.h : b.h); #endif } -ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t) +ccl_device_inline vfloat8 fabs(const vfloat8 a) +{ +#ifdef __KERNEL_AVX__ + return vfloat8(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); +#else + return make_vfloat8(fabsf(a.a), + fabsf(a.b), + fabsf(a.c), + fabsf(a.d), + fabsf(a.e), + fabsf(a.f), + fabsf(a.g), + fabsf(a.h)); +#endif +} + +ccl_device_inline vfloat8 mix(const vfloat8 a, const vfloat8 b, float t) +{ + return a + t * (b - a); +} + +ccl_device_inline vfloat8 mix(const vfloat8 a, const vfloat8 b, vfloat8 t) { return a + t * (b - a); } -ccl_device_inline float8_t saturate(const float8_t a) +ccl_device_inline vfloat8 saturate(const vfloat8 a) { - return clamp(a, make_float8_t(0.0f), make_float8_t(1.0f)); + return clamp(a, make_vfloat8(0.0f), make_vfloat8(1.0f)); } -ccl_device_inline float8_t exp(float8_t v) +ccl_device_inline vfloat8 exp(vfloat8 v) { - return make_float8_t( + return make_vfloat8( expf(v.a), expf(v.b), expf(v.c), expf(v.d), expf(v.e), expf(v.f), expf(v.g), expf(v.h)); } -ccl_device_inline float8_t log(float8_t v) +ccl_device_inline vfloat8 log(vfloat8 v) { - return make_float8_t( + return make_vfloat8( logf(v.a), logf(v.b), logf(v.c), logf(v.d), logf(v.e), logf(v.f), logf(v.g), logf(v.h)); } -ccl_device_inline float dot(const float8_t a, const float8_t b) +ccl_device_inline float dot(const vfloat8 a, const vfloat8 b) 
{ -#ifdef __KERNEL_AVX2__ - float8_t t(_mm256_dp_ps(a.m256, b.m256, 0xFF)); +#ifdef __KERNEL_AVX__ + vfloat8 t(_mm256_dp_ps(a.m256, b.m256, 0xFF)); return t[0] + t[4]; #else return (a.a * b.a) + (a.b * b.b) + (a.c * b.c) + (a.d * b.d) + (a.e * b.e) + (a.f * b.f) + @@ -339,62 +333,51 @@ ccl_device_inline float dot(const float8_t a, const float8_t b) #endif } -ccl_device_inline float8_t pow(float8_t v, float e) +ccl_device_inline vfloat8 pow(vfloat8 v, float e) { - return make_float8_t(powf(v.a, e), - powf(v.b, e), - powf(v.c, e), - powf(v.d, e), - powf(v.e, e), - powf(v.f, e), - powf(v.g, e), - powf(v.h, e)); + return make_vfloat8(powf(v.a, e), + powf(v.b, e), + powf(v.c, e), + powf(v.d, e), + powf(v.e, e), + powf(v.f, e), + powf(v.g, e), + powf(v.h, e)); } -ccl_device_inline float reduce_min(const float8_t a) +ccl_device_inline float reduce_min(const vfloat8 a) { return min(min(min(a.a, a.b), min(a.c, a.d)), min(min(a.e, a.f), min(a.g, a.h))); } -ccl_device_inline float reduce_max(const float8_t a) +ccl_device_inline float reduce_max(const vfloat8 a) { return max(max(max(a.a, a.b), max(a.c, a.d)), max(max(a.e, a.f), max(a.g, a.h))); } -ccl_device_inline float reduce_add(const float8_t a) -{ -#ifdef __KERNEL_AVX2__ - float8_t b(_mm256_hadd_ps(a.m256, a.m256)); - float8_t h(_mm256_hadd_ps(b.m256, b.m256)); - return h[0] + h[4]; -#else - return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h; -#endif -} - -ccl_device_inline bool isequal(const float8_t a, const float8_t b) +ccl_device_inline bool isequal(const vfloat8 a, const vfloat8 b) { return a == b; } -ccl_device_inline float8_t safe_divide(const float8_t a, const float b) +ccl_device_inline vfloat8 safe_divide(const vfloat8 a, const float b) { - return (b != 0.0f) ? a / b : make_float8_t(0.0f); + return (b != 0.0f) ? 
a / b : make_vfloat8(0.0f); } -ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b) +ccl_device_inline vfloat8 safe_divide(const vfloat8 a, const vfloat8 b) { - return make_float8_t((b.a != 0.0f) ? a.a / b.a : 0.0f, - (b.b != 0.0f) ? a.b / b.b : 0.0f, - (b.c != 0.0f) ? a.c / b.c : 0.0f, - (b.d != 0.0f) ? a.d / b.d : 0.0f, - (b.e != 0.0f) ? a.e / b.e : 0.0f, - (b.f != 0.0f) ? a.f / b.f : 0.0f, - (b.g != 0.0f) ? a.g / b.g : 0.0f, - (b.h != 0.0f) ? a.h / b.h : 0.0f); + return make_vfloat8((b.a != 0.0f) ? a.a / b.a : 0.0f, + (b.b != 0.0f) ? a.b / b.b : 0.0f, + (b.c != 0.0f) ? a.c / b.c : 0.0f, + (b.d != 0.0f) ? a.d / b.d : 0.0f, + (b.e != 0.0f) ? a.e / b.e : 0.0f, + (b.f != 0.0f) ? a.f / b.f : 0.0f, + (b.g != 0.0f) ? a.g / b.g : 0.0f, + (b.h != 0.0f) ? a.h / b.h : 0.0f); } -ccl_device_inline float8_t ensure_finite(float8_t v) +ccl_device_inline vfloat8 ensure_finite(vfloat8 v) { v.a = ensure_finite(v.a); v.b = ensure_finite(v.b); @@ -408,12 +391,92 @@ ccl_device_inline float8_t ensure_finite(float8_t v) return v; } -ccl_device_inline bool isfinite_safe(float8_t v) +ccl_device_inline bool isfinite_safe(vfloat8 v) { return isfinite_safe(v.a) && isfinite_safe(v.b) && isfinite_safe(v.c) && isfinite_safe(v.d) && isfinite_safe(v.e) && isfinite_safe(v.f) && isfinite_safe(v.g) && isfinite_safe(v.h); } +ccl_device_inline vint8 cast(const vfloat8 a) +{ +#ifdef __KERNEL_AVX__ + return vint8(_mm256_castps_si256(a)); +#else + return make_vint8(__float_as_int(a.a), + __float_as_int(a.b), + __float_as_int(a.c), + __float_as_int(a.d), + __float_as_int(a.e), + __float_as_int(a.f), + __float_as_int(a.g), + __float_as_int(a.h)); +#endif +} + +#ifdef __KERNEL_SSE__ +ccl_device_forceinline float4 low(const vfloat8 a) +{ +# ifdef __KERNEL_AVX__ + return float4(_mm256_extractf128_ps(a.m256, 0)); +# else + return make_float4(a.e, a.f, a.g, a.h); +# endif +} +ccl_device_forceinline float4 high(const vfloat8 a) +{ +# ifdef __KERNEL_AVX__ + return 
float4(_mm256_extractf128_ps(a.m256, 1)); +# else + return make_float4(a.a, a.b, a.c, a.d); +# endif +} + +template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> +ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a) +{ +# ifdef __KERNEL_AVX__ + return vfloat8(_mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0))); +# else + return make_vfloat8(a[i0], a[i1], a[i2], a[i3], a[i4 + 4], a[i5 + 4], a[i6 + 4], a[i7 + 4]); +# endif +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> +ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a, const vfloat8 b) +{ +# ifdef __KERNEL_AVX__ + return vfloat8(_mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0))); +# else + return make_vfloat8(shuffle<i0, i1, i2, i3>(high(a), high(b)), + shuffle<i0, i1, i2, i3>(low(a), low(b))); +# endif +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> +ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a) +{ + return shuffle<i0, i1, i2, i3>(a, a); +} +template<size_t i0> ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a, const vfloat8 b) +{ + return shuffle<i0, i0, i0, i0>(a, b); +} +template<size_t i0> ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a) +{ + return shuffle<i0>(a, a); +} + +template<size_t i> ccl_device_forceinline float extract(const vfloat8 a) +{ +# ifdef __KERNEL_AVX__ + __m256 b = shuffle<i, i, i, i>(a).m256; + return _mm256_cvtss_f32(b); +# else + return a[i]; +# endif +} +#endif + CCL_NAMESPACE_END #endif /* __UTIL_MATH_FLOAT8_H__ */ diff --git a/intern/cycles/util/math_int2.h b/intern/cycles/util/math_int2.h index f4d8a71221a..2df2ec5505b 100644 --- a/intern/cycles/util/math_int2.h +++ b/intern/cycles/util/math_int2.h @@ -10,23 +10,6 @@ CCL_NAMESPACE_BEGIN -/******************************************************************************* - * Declaration. 
- */ - -#if !defined(__KERNEL_METAL__) -ccl_device_inline bool operator==(const int2 a, const int2 b); -ccl_device_inline int2 operator+(const int2 &a, const int2 &b); -ccl_device_inline int2 operator+=(int2 &a, const int2 &b); -ccl_device_inline int2 operator-(const int2 &a, const int2 &b); -ccl_device_inline int2 operator*(const int2 &a, const int2 &b); -ccl_device_inline int2 operator/(const int2 &a, const int2 &b); -#endif /* !__KERNEL_METAL__ */ - -/******************************************************************************* - * Definition. - */ - #if !defined(__KERNEL_METAL__) ccl_device_inline bool operator==(const int2 a, const int2 b) { diff --git a/intern/cycles/util/math_int3.h b/intern/cycles/util/math_int3.h index 48bffeaf553..b5b972ddfb5 100644 --- a/intern/cycles/util/math_int3.h +++ b/intern/cycles/util/math_int3.h @@ -10,21 +10,6 @@ CCL_NAMESPACE_BEGIN -/******************************************************************************* - * Declaration. - */ - -#if !defined(__KERNEL_METAL__) -ccl_device_inline int3 min(int3 a, int3 b); -ccl_device_inline int3 max(int3 a, int3 b); -ccl_device_inline int3 clamp(const int3 &a, int mn, int mx); -ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx); -#endif /* !defined(__KERNEL_METAL__) */ - -/******************************************************************************* - * Definition. 
- */ - #if !defined(__KERNEL_METAL__) ccl_device_inline int3 min(int3 a, int3 b) { @@ -44,7 +29,7 @@ ccl_device_inline int3 max(int3 a, int3 b) # endif } -ccl_device_inline int3 clamp(const int3 &a, int mn, int mx) +ccl_device_inline int3 clamp(const int3 a, int mn, int mx) { # ifdef __KERNEL_SSE__ return min(max(a, make_int3(mn)), make_int3(mx)); @@ -53,7 +38,7 @@ ccl_device_inline int3 clamp(const int3 &a, int mn, int mx) # endif } -ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx) +ccl_device_inline int3 clamp(const int3 a, int3 &mn, int mx) { # ifdef __KERNEL_SSE__ return min(max(a, mn), make_int3(mx)); @@ -62,22 +47,22 @@ ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx) # endif } -ccl_device_inline bool operator==(const int3 &a, const int3 &b) +ccl_device_inline bool operator==(const int3 a, const int3 b) { return a.x == b.x && a.y == b.y && a.z == b.z; } -ccl_device_inline bool operator!=(const int3 &a, const int3 &b) +ccl_device_inline bool operator!=(const int3 a, const int3 b) { return !(a == b); } -ccl_device_inline bool operator<(const int3 &a, const int3 &b) +ccl_device_inline bool operator<(const int3 a, const int3 b) { return a.x < b.x && a.y < b.y && a.z < b.z; } -ccl_device_inline int3 operator+(const int3 &a, const int3 &b) +ccl_device_inline int3 operator+(const int3 a, const int3 b) { # ifdef __KERNEL_SSE__ return int3(_mm_add_epi32(a.m128, b.m128)); @@ -86,7 +71,7 @@ ccl_device_inline int3 operator+(const int3 &a, const int3 &b) # endif } -ccl_device_inline int3 operator-(const int3 &a, const int3 &b) +ccl_device_inline int3 operator-(const int3 a, const int3 b) { # ifdef __KERNEL_SSE__ return int3(_mm_sub_epi32(a.m128, b.m128)); diff --git a/intern/cycles/util/math_int4.h b/intern/cycles/util/math_int4.h index fbdada223cb..c6d767d7587 100644 --- a/intern/cycles/util/math_int4.h +++ b/intern/cycles/util/math_int4.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2013 Intel Corporation * 
Copyright 2011-2022 Blender Foundation */ #ifndef __UTIL_MATH_INT4_H__ @@ -10,30 +11,8 @@ CCL_NAMESPACE_BEGIN -/******************************************************************************* - * Declaration. - */ - #ifndef __KERNEL_GPU__ -ccl_device_inline int4 operator+(const int4 &a, const int4 &b); -ccl_device_inline int4 operator+=(int4 &a, const int4 &b); -ccl_device_inline int4 operator>>(const int4 &a, int i); -ccl_device_inline int4 operator<<(const int4 &a, int i); -ccl_device_inline int4 operator<(const int4 &a, const int4 &b); -ccl_device_inline int4 operator>=(const int4 &a, const int4 &b); -ccl_device_inline int4 operator&(const int4 &a, const int4 &b); -ccl_device_inline int4 min(int4 a, int4 b); -ccl_device_inline int4 max(int4 a, int4 b); -ccl_device_inline int4 clamp(const int4 &a, const int4 &mn, const int4 &mx); -ccl_device_inline int4 select(const int4 &mask, const int4 &a, const int4 &b); -#endif /* __KERNEL_GPU__ */ - -/******************************************************************************* - * Definition. 
- */ - -#ifndef __KERNEL_GPU__ -ccl_device_inline int4 operator+(const int4 &a, const int4 &b) +ccl_device_inline int4 operator+(const int4 a, const int4 b) { # ifdef __KERNEL_SSE__ return int4(_mm_add_epi32(a.m128, b.m128)); @@ -42,12 +21,26 @@ ccl_device_inline int4 operator+(const int4 &a, const int4 &b) # endif } -ccl_device_inline int4 operator+=(int4 &a, const int4 &b) +ccl_device_inline int4 operator+=(int4 &a, const int4 b) { return a = a + b; } -ccl_device_inline int4 operator>>(const int4 &a, int i) +ccl_device_inline int4 operator-(const int4 a, const int4 b) +{ +# ifdef __KERNEL_SSE__ + return int4(_mm_sub_epi32(a.m128, b.m128)); +# else + return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +# endif +} + +ccl_device_inline int4 operator-=(int4 &a, const int4 b) +{ + return a = a - b; +} + +ccl_device_inline int4 operator>>(const int4 a, int i) { # ifdef __KERNEL_SSE__ return int4(_mm_srai_epi32(a.m128, i)); @@ -56,7 +49,7 @@ ccl_device_inline int4 operator>>(const int4 &a, int i) # endif } -ccl_device_inline int4 operator<<(const int4 &a, int i) +ccl_device_inline int4 operator<<(const int4 a, int i) { # ifdef __KERNEL_SSE__ return int4(_mm_slli_epi32(a.m128, i)); @@ -65,7 +58,7 @@ ccl_device_inline int4 operator<<(const int4 &a, int i) # endif } -ccl_device_inline int4 operator<(const int4 &a, const int4 &b) +ccl_device_inline int4 operator<(const int4 a, const int4 b) { # ifdef __KERNEL_SSE__ return int4(_mm_cmplt_epi32(a.m128, b.m128)); @@ -74,7 +67,26 @@ ccl_device_inline int4 operator<(const int4 &a, const int4 &b) # endif } -ccl_device_inline int4 operator>=(const int4 &a, const int4 &b) +ccl_device_inline int4 operator<(const int4 a, const int b) +{ + return a < make_int4(b); +} + +ccl_device_inline int4 operator==(const int4 a, const int4 b) +{ +# ifdef __KERNEL_SSE__ + return int4(_mm_cmpeq_epi32(a.m128, b.m128)); +# else + return make_int4(a.x == b.x, a.y == b.y, a.z == b.z, a.w == b.w); +# endif +} + +ccl_device_inline int4 
operator==(const int4 a, const int b) +{ + return a == make_int4(b); +} + +ccl_device_inline int4 operator>=(const int4 a, const int4 b) { # ifdef __KERNEL_SSE__ return int4(_mm_xor_si128(_mm_set1_epi32(0xffffffff), _mm_cmplt_epi32(a.m128, b.m128))); @@ -83,7 +95,12 @@ ccl_device_inline int4 operator>=(const int4 &a, const int4 &b) # endif } -ccl_device_inline int4 operator&(const int4 &a, const int4 &b) +ccl_device_inline int4 operator>=(const int4 a, const int b) +{ + return a >= make_int4(b); +} + +ccl_device_inline int4 operator&(const int4 a, const int4 b) { # ifdef __KERNEL_SSE__ return int4(_mm_and_si128(a.m128, b.m128)); @@ -92,6 +109,97 @@ ccl_device_inline int4 operator&(const int4 &a, const int4 &b) # endif } +ccl_device_inline int4 operator|(const int4 a, const int4 b) +{ +# ifdef __KERNEL_SSE__ + return int4(_mm_or_si128(a.m128, b.m128)); +# else + return make_int4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w); +# endif +} + +ccl_device_inline int4 operator^(const int4 a, const int4 b) +{ +# ifdef __KERNEL_SSE__ + return int4(_mm_xor_si128(a.m128, b.m128)); +# else + return make_int4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +# endif +} + +ccl_device_inline int4 operator&(const int32_t a, const int4 b) +{ + return make_int4(a) & b; +} + +ccl_device_inline int4 operator&(const int4 a, const int32_t b) +{ + return a & make_int4(b); +} + +ccl_device_inline int4 operator|(const int32_t a, const int4 b) +{ + return make_int4(a) | b; +} + +ccl_device_inline int4 operator|(const int4 a, const int32_t b) +{ + return a | make_int4(b); +} + +ccl_device_inline int4 operator^(const int32_t a, const int4 b) +{ + return make_int4(a) ^ b; +} + +ccl_device_inline int4 operator^(const int4 a, const int32_t b) +{ + return a ^ make_int4(b); +} + +ccl_device_inline int4 &operator&=(int4 &a, const int4 b) +{ + return a = a & b; +} +ccl_device_inline int4 &operator&=(int4 &a, const int32_t b) +{ + return a = a & b; +} + +ccl_device_inline int4 &operator|=(int4 &a, const 
int4 b) +{ + return a = a | b; +} +ccl_device_inline int4 &operator|=(int4 &a, const int32_t b) +{ + return a = a | b; +} + +ccl_device_inline int4 &operator^=(int4 &a, const int4 b) +{ + return a = a ^ b; +} +ccl_device_inline int4 &operator^=(int4 &a, const int32_t b) +{ + return a = a ^ b; +} + +ccl_device_inline int4 &operator<<=(int4 &a, const int32_t b) +{ + return a = a << b; +} +ccl_device_inline int4 &operator>>=(int4 &a, const int32_t b) +{ + return a = a >> b; +} + +# ifdef __KERNEL_SSE__ +ccl_device_forceinline const int4 srl(const int4 a, const int32_t b) +{ + return int4(_mm_srli_epi32(a.m128, b)); +} +# endif + ccl_device_inline int4 min(int4 a, int4 b) { # if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) @@ -110,12 +218,12 @@ ccl_device_inline int4 max(int4 a, int4 b) # endif } -ccl_device_inline int4 clamp(const int4 &a, const int4 &mn, const int4 &mx) +ccl_device_inline int4 clamp(const int4 a, const int4 mn, const int4 mx) { return min(max(a, mn), mx); } -ccl_device_inline int4 select(const int4 &mask, const int4 &a, const int4 &b) +ccl_device_inline int4 select(const int4 mask, const int4 a, const int4 b) { # ifdef __KERNEL_SSE__ return int4(_mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b))); @@ -135,6 +243,52 @@ ccl_device_inline int4 load_int4(const int *v) } #endif /* __KERNEL_GPU__ */ +ccl_device_inline float4 cast(const int4 a) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_castsi128_ps(a)); +#else + return make_float4( + __int_as_float(a.x), __int_as_float(a.y), __int_as_float(a.z), __int_as_float(a.w)); +#endif +} + +#ifdef __KERNEL_SSE__ +ccl_device_forceinline int4 andnot(const int4 a, const int4 b) +{ + return int4(_mm_andnot_si128(a.m128, b.m128)); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> +ccl_device_forceinline int4 shuffle(const int4 a) +{ +# ifdef __KERNEL_NEON__ + int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a)); + return 
int4(vreinterpretq_m128i_s32(result)); +# else + return int4(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0))); +# endif +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> +ccl_device_forceinline int4 shuffle(const int4 a, const int4 b) +{ +# ifdef __KERNEL_NEON__ + int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a), + vreinterpretq_s32_m128i(b)); + return int4(vreinterpretq_m128i_s32(result)); +# else + return int4(_mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)))); +# endif +} + +template<size_t i0> ccl_device_forceinline int4 shuffle(const int4 b) +{ + return shuffle<i0, i0, i0, i0>(b); +} +#endif + CCL_NAMESPACE_END #endif /* __UTIL_MATH_INT4_H__ */ diff --git a/intern/cycles/util/math_int8.h b/intern/cycles/util/math_int8.h new file mode 100644 index 00000000000..d150b0b74ec --- /dev/null +++ b/intern/cycles/util/math_int8.h @@ -0,0 +1,355 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2013 Intel Corporation + * Copyright 2011-2022 Blender Foundation */ + +#ifndef __UTIL_MATH_INT8_H__ +#define __UTIL_MATH_INT8_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util/types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +ccl_device_inline vint8 operator+(const vint8 a, const vint8 b) +{ +# ifdef __KERNEL_AVX__ + return vint8(_mm256_add_epi32(a.m256, b.m256)); +# else + return make_vint8( + a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h); +# endif +} + +ccl_device_inline vint8 operator+=(vint8 &a, const vint8 b) +{ + return a = a + b; +} + +ccl_device_inline vint8 operator-(const vint8 a, const vint8 b) +{ +# ifdef __KERNEL_AVX__ + return vint8(_mm256_sub_epi32(a.m256, b.m256)); +# else + return make_vint8( + a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h); +# endif +} + +ccl_device_inline vint8 operator-=(vint8 &a, const vint8 b) +{ + return a = a - b; +} + +ccl_device_inline vint8 operator>>(const vint8 a, int i) +{ +# ifdef __KERNEL_AVX__ + return vint8(_mm256_srai_epi32(a.m256, i)); +# else + return make_vint8( + a.a >> i, a.b >> i, a.c >> i, a.d >> i, a.e >> i, a.f >> i, a.g >> i, a.h >> i); +# endif +} + +ccl_device_inline vint8 operator<<(const vint8 a, int i) +{ +# ifdef __KERNEL_AVX__ + return vint8(_mm256_slli_epi32(a.m256, i)); +# else + return make_vint8( + a.a << i, a.b << i, a.c << i, a.d << i, a.e << i, a.f << i, a.g << i, a.h << i); +# endif +} + +ccl_device_inline vint8 operator<(const vint8 a, const vint8 b) +{ +# ifdef __KERNEL_AVX__ + return vint8(_mm256_cmpgt_epi32(b.m256, a.m256)); +# else + return make_vint8( + a.a < b.a, a.b < b.b, a.c < b.c, a.d < b.d, a.e < b.e, a.f < b.f, a.g < b.g, a.h < b.h); +# endif +} + +ccl_device_inline vint8 operator<(const vint8 a, const int b) +{ + return a < make_vint8(b); +} + +ccl_device_inline vint8 operator==(const vint8 a, const vint8 b) +{ +# ifdef __KERNEL_AVX__ + return vint8(_mm256_cmpeq_epi32(a.m256, b.m256)); +# else + return make_vint8(a.a == b.a, + a.b == b.b, + a.c == b.c, + a.d == b.d, + a.e == b.e, + a.f == b.f, + a.g == b.g, + a.h == b.h); +# endif +} + +ccl_device_inline 
vint8 operator==(const vint8 a, const int b) +{ + return a == make_vint8(b); +} + +ccl_device_inline vint8 operator>=(const vint8 a, const vint8 b) +{ +# ifdef __KERNEL_AVX__ + return vint8( + _mm256_xor_si256(_mm256_set1_epi32(0xffffffff), _mm256_cmpgt_epi32(b.m256, a.m256))); +# else + return make_vint8(a.a >= b.a, + a.b >= b.b, + a.c >= b.c, + a.d >= b.d, + a.e >= b.e, + a.f >= b.f, + a.g >= b.g, + a.h >= b.h); +# endif +} + +ccl_device_inline vint8 operator>=(const vint8 a, const int b) +{ + return a >= make_vint8(b); +} + +ccl_device_inline vint8 operator&(const vint8 a, const vint8 b) +{ +# ifdef __KERNEL_AVX__ + return vint8(_mm256_and_si256(a.m256, b.m256)); +# else + return make_vint8( + a.a & b.a, a.b & b.b, a.c & b.c, a.d & b.d, a.e & b.e, a.f & b.f, a.g & b.g, a.h & b.h); +# endif +} + +ccl_device_inline vint8 operator|(const vint8 a, const vint8 b) +{ +# ifdef __KERNEL_AVX__ + return vint8(_mm256_or_si256(a.m256, b.m256)); +# else + return make_vint8( + a.a | b.a, a.b | b.b, a.c | b.c, a.d | b.d, a.e | b.e, a.f | b.f, a.g | b.g, a.h | b.h); +# endif +} + +ccl_device_inline vint8 operator^(const vint8 a, const vint8 b) +{ +# ifdef __KERNEL_AVX__ + return vint8(_mm256_xor_si256(a.m256, b.m256)); +# else + return make_vint8( + a.a ^ b.a, a.b ^ b.b, a.c ^ b.c, a.d ^ b.d, a.e ^ b.e, a.f ^ b.f, a.g ^ b.g, a.h ^ b.h); +# endif +} + +ccl_device_inline vint8 operator&(const int32_t a, const vint8 b) +{ + return make_vint8(a) & b; +} + +ccl_device_inline vint8 operator&(const vint8 a, const int32_t b) +{ + return a & make_vint8(b); +} + +ccl_device_inline vint8 operator|(const int32_t a, const vint8 b) +{ + return make_vint8(a) | b; +} + +ccl_device_inline vint8 operator|(const vint8 a, const int32_t b) +{ + return a | make_vint8(b); +} + +ccl_device_inline vint8 operator^(const int32_t a, const vint8 b) +{ + return make_vint8(a) ^ b; +} + +ccl_device_inline vint8 operator^(const vint8 a, const int32_t b) +{ + return a ^ make_vint8(b); +} + +ccl_device_inline 
vint8 &operator&=(vint8 &a, const vint8 b) +{ + return a = a & b; +} +ccl_device_inline vint8 &operator&=(vint8 &a, const int32_t b) +{ + return a = a & b; +} + +ccl_device_inline vint8 &operator|=(vint8 &a, const vint8 b) +{ + return a = a | b; +} +ccl_device_inline vint8 &operator|=(vint8 &a, const int32_t b) +{ + return a = a | b; +} + +ccl_device_inline vint8 &operator^=(vint8 &a, const vint8 b) +{ + return a = a ^ b; +} +ccl_device_inline vint8 &operator^=(vint8 &a, const int32_t b) +{ + return a = a ^ b; +} + +ccl_device_inline vint8 &operator<<=(vint8 &a, const int32_t b) +{ + return a = a << b; +} +ccl_device_inline vint8 &operator>>=(vint8 &a, const int32_t b) +{ + return a = a >> b; +} + +# ifdef __KERNEL_AVX__ +ccl_device_forceinline const vint8 srl(const vint8 a, const int32_t b) +{ + return vint8(_mm256_srli_epi32(a.m256, b)); +} +# endif + +ccl_device_inline vint8 min(vint8 a, vint8 b) +{ +# if defined(__KERNEL_AVX__) && defined(__KERNEL_AVX41__) + return vint8(_mm256_min_epi32(a.m256, b.m256)); +# else + return make_vint8(min(a.a, b.a), + min(a.b, b.b), + min(a.c, b.c), + min(a.d, b.d), + min(a.e, b.e), + min(a.f, b.f), + min(a.g, b.g), + min(a.h, b.h)); +# endif +} + +ccl_device_inline vint8 max(vint8 a, vint8 b) +{ +# if defined(__KERNEL_AVX__) && defined(__KERNEL_AVX41__) + return vint8(_mm256_max_epi32(a.m256, b.m256)); +# else + return make_vint8(max(a.a, b.a), + max(a.b, b.b), + max(a.c, b.c), + max(a.d, b.d), + max(a.e, b.e), + max(a.f, b.f), + max(a.g, b.g), + max(a.h, b.h)); +# endif +} + +ccl_device_inline vint8 clamp(const vint8 a, const vint8 mn, const vint8 mx) +{ + return min(max(a, mn), mx); +} + +ccl_device_inline vint8 select(const vint8 mask, const vint8 a, const vint8 b) +{ +# ifdef __KERNEL_AVX__ + return vint8(_mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask)))); +# else + return make_vint8((mask.a) ? a.a : b.a, + (mask.b) ? a.b : b.b, + (mask.c) ? 
a.c : b.c, + (mask.d) ? a.d : b.d, + (mask.e) ? a.e : b.e, + (mask.f) ? a.f : b.f, + (mask.g) ? a.g : b.g, + (mask.h) ? a.h : b.h); +# endif +} + +ccl_device_inline vint8 load_vint8(const int *v) +{ +# ifdef __KERNEL_AVX__ + return vint8(_mm256_loadu_si256((__m256i *)v)); +# else + return make_vint8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); +# endif +} +#endif /* __KERNEL_GPU__ */ + +ccl_device_inline vfloat8 cast(const vint8 a) +{ +#ifdef __KERNEL_AVX__ + return vfloat8(_mm256_castsi256_ps(a)); +#else + return make_vfloat8(__int_as_float(a.a), + __int_as_float(a.b), + __int_as_float(a.c), + __int_as_float(a.d), + __int_as_float(a.e), + __int_as_float(a.f), + __int_as_float(a.g), + __int_as_float(a.h)); +#endif +} + +#ifdef __KERNEL_AVX__ +template<size_t i> ccl_device_forceinline const vint8 shuffle(const vint8 a) +{ + return vint8( + _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i)))); +} + +template<size_t i0, size_t i1> ccl_device_forceinline const vint8 shuffle(const vint8 a) +{ + return vint8(_mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0))); +} + +template<size_t i0, size_t i1> +ccl_device_forceinline const vint8 shuffle(const vint8 a, const vint8 b) +{ + return vint8(_mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0))); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> +ccl_device_forceinline const vint8 shuffle(const vint8 a) +{ + return vint8( + _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0)))); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> +ccl_device_forceinline const vint8 shuffle(const vint8 a, const vint8 b) +{ + return vint8(_mm256_castps_si256(_mm256_shuffle_ps( + _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)))); +} + +template<> __forceinline const vint8 shuffle<0, 0, 2, 2>(const vint8 b) +{ + return vint8(_mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b)))); +} +template<> __forceinline 
const vint8 shuffle<1, 1, 3, 3>(const vint8 b) +{ + return vint8(_mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b)))); +} +template<> __forceinline const vint8 shuffle<0, 1, 0, 1>(const vint8 b) +{ + return vint8(_mm256_castps_si256( + _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b)))))); +} +#endif + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_INT8_H__ */ diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h index aa28682f8c1..0727debf775 100644 --- a/intern/cycles/util/math_intersect.h +++ b/intern/cycles/util/math_intersect.h @@ -133,7 +133,9 @@ ccl_device_forceinline float ray_triangle_rcp(const float x) ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b) { #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return madd(ssef(a.x), ssef(b.x), madd(ssef(a.y), ssef(b.y), ssef(a.z) * ssef(b.z)))[0]; + return madd(make_float4(a.x), + make_float4(b.x), + madd(make_float4(a.y), make_float4(b.y), make_float4(a.z) * make_float4(b.z)))[0]; #else return a.x * b.x + a.y * b.y + a.z * b.z; #endif @@ -142,9 +144,10 @@ ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b) ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b) { #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return make_float3(msub(ssef(a.y), ssef(b.z), ssef(a.z) * ssef(b.y))[0], - msub(ssef(a.z), ssef(b.x), ssef(a.x) * ssef(b.z))[0], - msub(ssef(a.x), ssef(b.y), ssef(a.y) * ssef(b.x))[0]); + return make_float3( + msub(make_float4(a.y), make_float4(b.z), make_float4(a.z) * make_float4(b.y))[0], + msub(make_float4(a.z), make_float4(b.x), make_float4(a.x) * make_float4(b.z))[0], + msub(make_float4(a.x), make_float4(b.y), make_float4(a.y) * make_float4(b.x))[0]); #else return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); #endif diff --git a/intern/cycles/util/sseb.h b/intern/cycles/util/sseb.h deleted file mode 100644 index 
6f78299711e..00000000000 --- a/intern/cycles/util/sseb.h +++ /dev/null @@ -1,345 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2011-2013 Intel Corporation - * Modifications Copyright 2014-2022 Blender Foundation. */ - -#ifndef __UTIL_SSEB_H__ -#define __UTIL_SSEB_H__ - -CCL_NAMESPACE_BEGIN - -#ifdef __KERNEL_SSE2__ - -struct ssei; -struct ssef; - -/*! 4-wide SSE bool type. */ -struct sseb { - typedef sseb Mask; // mask type - typedef ssei Int; // int type - typedef ssef Float; // float type - - enum { size = 4 }; // number of SIMD elements - union { - __m128 m128; - int32_t v[4]; - }; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline sseb() - { - } - __forceinline sseb(const sseb &other) - { - m128 = other.m128; - } - __forceinline sseb &operator=(const sseb &other) - { - m128 = other.m128; - return *this; - } - - __forceinline sseb(const __m128 input) : m128(input) - { - } - __forceinline operator const __m128 &(void) const - { - return m128; - } - __forceinline operator const __m128i(void) const - { - return _mm_castps_si128(m128); - } - __forceinline operator const __m128d(void) const - { - return _mm_castps_pd(m128); - } - - __forceinline sseb(bool a) - : m128(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) - { - } - __forceinline sseb(bool a, bool b) - : m128(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) - { - } - __forceinline sseb(bool a, bool b, bool c, bool d) - : m128(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) - { - } - __forceinline sseb(int mask) - { - assert(mask >= 0 && mask < 16); - m128 = _mm_lookupmask_ps[mask]; - } - - //////////////////////////////////////////////////////////////////////////////// - /// Constants - 
//////////////////////////////////////////////////////////////////////////////// - - __forceinline sseb(FalseTy) : m128(_mm_setzero_ps()) - { - } - __forceinline sseb(TrueTy) - : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) - { - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline bool operator[](const size_t i) const - { - assert(i < 4); - return (_mm_movemask_ps(m128) >> i) & 1; - } - __forceinline int32_t &operator[](const size_t i) - { - assert(i < 4); - return v[i]; - } -}; - -//////////////////////////////////////////////////////////////////////////////// -/// Unary Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const sseb operator!(const sseb &a) -{ - return _mm_xor_ps(a, sseb(True)); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Binary Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const sseb operator&(const sseb &a, const sseb &b) -{ - return _mm_and_ps(a, b); -} -__forceinline const sseb operator|(const sseb &a, const sseb &b) -{ - return _mm_or_ps(a, b); -} -__forceinline const sseb operator^(const sseb &a, const sseb &b) -{ - return _mm_xor_ps(a, b); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Assignment Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const sseb operator&=(sseb &a, const sseb &b) -{ - return a = a & b; -} -__forceinline const sseb operator|=(sseb &a, const sseb &b) -{ - return a = a | b; -} -__forceinline const sseb operator^=(sseb &a, const sseb &b) -{ - return a = a ^ b; -} - -//////////////////////////////////////////////////////////////////////////////// -/// Comparison 
Operators + Select -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const sseb operator!=(const sseb &a, const sseb &b) -{ - return _mm_xor_ps(a, b); -} -__forceinline const sseb operator==(const sseb &a, const sseb &b) -{ - return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); -} - -__forceinline const sseb select(const sseb &m, const sseb &t, const sseb &f) -{ -# if defined(__KERNEL_SSE41__) - return _mm_blendv_ps(f, t, m); -# else - return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); -# endif -} - -//////////////////////////////////////////////////////////////////////////////// -/// Movement/Shifting/Shuffling Functions -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const sseb unpacklo(const sseb &a, const sseb &b) -{ - return _mm_unpacklo_ps(a, b); -} -__forceinline const sseb unpackhi(const sseb &a, const sseb &b) -{ - return _mm_unpackhi_ps(a, b); -} - -template<size_t i0, size_t i1, size_t i2, size_t i3> -__forceinline const sseb shuffle(const sseb &a) -{ -# ifdef __KERNEL_NEON__ - return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a); -# else - return _mm_castsi128_ps(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0))); -# endif -} - -# ifndef __KERNEL_NEON__ -template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a) -{ - return _mm_movelh_ps(a, a); -} - -template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a) -{ - return _mm_movehl_ps(a, a); -} -# endif - -template<size_t i0, size_t i1, size_t i2, size_t i3> -__forceinline const sseb shuffle(const sseb &a, const sseb &b) -{ -# ifdef __KERNEL_NEON__ - return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a, b); -# else - return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); -# endif -} - -# ifndef __KERNEL_NEON__ -template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a, const sseb &b) -{ - return _mm_movelh_ps(a, b); -} - -template<> __forceinline const sseb shuffle<2, 3, 2, 
3>(const sseb &a, const sseb &b) -{ - return _mm_movehl_ps(b, a); -} -# endif - -# if defined(__KERNEL_SSE3__) && !defined(__KERNEL_NEON__) -template<> __forceinline const sseb shuffle<0, 0, 2, 2>(const sseb &a) -{ - return _mm_moveldup_ps(a); -} -template<> __forceinline const sseb shuffle<1, 1, 3, 3>(const sseb &a) -{ - return _mm_movehdup_ps(a); -} -# endif - -# if defined(__KERNEL_SSE41__) -template<size_t dst, size_t src, size_t clr> -__forceinline const sseb insert(const sseb &a, const sseb &b) -{ -# ifdef __KERNEL_NEON__ - sseb res = a; - if (clr) - res[dst] = 0; - else - res[dst] = b[src]; - return res; -# else - return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); -# endif -} -template<size_t dst, size_t src> __forceinline const sseb insert(const sseb &a, const sseb &b) -{ - return insert<dst, src, 0>(a, b); -} -template<size_t dst> __forceinline const sseb insert(const sseb &a, const bool b) -{ - return insert<dst, 0>(a, sseb(b)); -} -# endif - -//////////////////////////////////////////////////////////////////////////////// -/// Reduction Operations -//////////////////////////////////////////////////////////////////////////////// - -# if defined(__KERNEL_SSE41__) -__forceinline uint32_t popcnt(const sseb &a) -{ -# if defined(__KERNEL_NEON__) - const int32x4_t mask = {1, 1, 1, 1}; - int32x4_t t = vandq_s32(vreinterpretq_s32_m128(a.m128), mask); - return vaddvq_s32(t); -# else - return _mm_popcnt_u32(_mm_movemask_ps(a)); -# endif -} -# else -__forceinline uint32_t popcnt(const sseb &a) -{ - return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]); -} -# endif - -__forceinline bool reduce_and(const sseb &a) -{ -# if defined(__KERNEL_NEON__) - return vaddvq_s32(vreinterpretq_s32_m128(a.m128)) == -4; -# else - return _mm_movemask_ps(a) == 0xf; -# endif -} -__forceinline bool reduce_or(const sseb &a) -{ -# if defined(__KERNEL_NEON__) - return vaddvq_s32(vreinterpretq_s32_m128(a.m128)) != 0x0; -# else - return _mm_movemask_ps(a) != 0x0; -# endif -} 
-__forceinline bool all(const sseb &b) -{ -# if defined(__KERNEL_NEON__) - return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) == -4; -# else - return _mm_movemask_ps(b) == 0xf; -# endif -} -__forceinline bool any(const sseb &b) -{ -# if defined(__KERNEL_NEON__) - return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) != 0x0; -# else - return _mm_movemask_ps(b) != 0x0; -# endif -} -__forceinline bool none(const sseb &b) -{ -# if defined(__KERNEL_NEON__) - return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) == 0x0; -# else - return _mm_movemask_ps(b) == 0x0; -# endif -} - -__forceinline uint32_t movemask(const sseb &a) -{ - return _mm_movemask_ps(a); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Debug Functions -//////////////////////////////////////////////////////////////////////////////// - -ccl_device_inline void print_sseb(const char *label, const sseb &a) -{ - printf("%s: %d %d %d %d\n", label, a[0], a[1], a[2], a[3]); -} - -#endif - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/util/ssef.h b/intern/cycles/util/ssef.h deleted file mode 100644 index 1e2bfa90354..00000000000 --- a/intern/cycles/util/ssef.h +++ /dev/null @@ -1,1090 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2011-2013 Intel Corporation - * Modifications Copyright 2014-2022 Blender Foundation. */ - -#ifndef __UTIL_SSEF_H__ -#define __UTIL_SSEF_H__ - -#include <math.h> - -#include "util/ssei.h" - -CCL_NAMESPACE_BEGIN - -#ifdef __KERNEL_SSE2__ - -struct sseb; -struct ssef; - -/*! 4-wide SSE float type. 
*/ -struct ssef { - typedef sseb Mask; // mask type - typedef ssei Int; // int type - typedef ssef Float; // float type - - enum { size = 4 }; // number of SIMD elements - union { - __m128 m128; - float f[4]; - int i[4]; - }; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline ssef() - { - } - __forceinline ssef(const ssef &other) - { - m128 = other.m128; - } - __forceinline ssef &operator=(const ssef &other) - { - m128 = other.m128; - return *this; - } - - __forceinline ssef(const __m128 a) : m128(a) - { - } - __forceinline operator const __m128 &() const - { - return m128; - } - __forceinline operator __m128 &() - { - return m128; - } - - __forceinline ssef(float a) : m128(_mm_set1_ps(a)) - { - } - __forceinline ssef(float a, float b, float c, float d) : m128(_mm_setr_ps(a, b, c, d)) - { - } - - __forceinline explicit ssef(const __m128i a) : m128(_mm_cvtepi32_ps(a)) - { - } - - //////////////////////////////////////////////////////////////////////////////// - /// Loads and Stores - //////////////////////////////////////////////////////////////////////////////// - -# if defined(__KERNEL_AVX__) - static __forceinline ssef broadcast(const void *const a) - { - return _mm_broadcast_ss((float *)a); - } -# else - static __forceinline ssef broadcast(const void *const a) - { - return _mm_set1_ps(*(float *)a); - } -# endif - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const float &operator[](const size_t i) const - { - assert(i < 4); - return f[i]; - } - __forceinline float &operator[](const size_t i) - { - assert(i < 4); - return f[i]; - } -}; - -//////////////////////////////////////////////////////////////////////////////// 
-/// Unary Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const ssef cast(const __m128i &a) -{ - return _mm_castsi128_ps(a); -} -__forceinline const ssef operator+(const ssef &a) -{ - return a; -} -__forceinline const ssef operator-(const ssef &a) -{ - return _mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); -} -__forceinline const ssef abs(const ssef &a) -{ - return _mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); -} -# if defined(__KERNEL_SSE41__) -__forceinline const ssef sign(const ssef &a) -{ - return _mm_blendv_ps(ssef(1.0f), -ssef(1.0f), _mm_cmplt_ps(a, ssef(0.0f))); -} -# endif -__forceinline const ssef signmsk(const ssef &a) -{ - return _mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); -} - -__forceinline const ssef rcp(const ssef &a) -{ - const ssef r = _mm_rcp_ps(a.m128); - return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); -} -__forceinline const ssef sqr(const ssef &a) -{ - return _mm_mul_ps(a, a); -} -__forceinline const ssef mm_sqrt(const ssef &a) -{ - return _mm_sqrt_ps(a.m128); -} -__forceinline const ssef rsqrt(const ssef &a) -{ - const ssef r = _mm_rsqrt_ps(a.m128); - return _mm_add_ps( - _mm_mul_ps(_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f), r), - _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set_ps(-0.5f, -0.5f, -0.5f, -0.5f)), r), - _mm_mul_ps(r, r))); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Binary Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const ssef operator+(const ssef &a, const ssef &b) -{ - return _mm_add_ps(a.m128, b.m128); -} -__forceinline const ssef operator+(const ssef &a, const float &b) -{ - return a + ssef(b); -} -__forceinline const ssef operator+(const float &a, const ssef &b) -{ - return ssef(a) + b; -} - -__forceinline const ssef operator-(const ssef &a, const ssef &b) -{ - return 
_mm_sub_ps(a.m128, b.m128); -} -__forceinline const ssef operator-(const ssef &a, const float &b) -{ - return a - ssef(b); -} -__forceinline const ssef operator-(const float &a, const ssef &b) -{ - return ssef(a) - b; -} - -__forceinline const ssef operator*(const ssef &a, const ssef &b) -{ - return _mm_mul_ps(a.m128, b.m128); -} -__forceinline const ssef operator*(const ssef &a, const float &b) -{ - return a * ssef(b); -} -__forceinline const ssef operator*(const float &a, const ssef &b) -{ - return ssef(a) * b; -} - -__forceinline const ssef operator/(const ssef &a, const ssef &b) -{ - return _mm_div_ps(a.m128, b.m128); -} -__forceinline const ssef operator/(const ssef &a, const float &b) -{ - return a / ssef(b); -} -__forceinline const ssef operator/(const float &a, const ssef &b) -{ - return ssef(a) / b; -} - -__forceinline const ssef operator^(const ssef &a, const ssef &b) -{ - return _mm_xor_ps(a.m128, b.m128); -} -__forceinline const ssef operator^(const ssef &a, const ssei &b) -{ - return _mm_xor_ps(a.m128, _mm_castsi128_ps(b.m128)); -} - -__forceinline const ssef operator&(const ssef &a, const ssef &b) -{ - return _mm_and_ps(a.m128, b.m128); -} -__forceinline const ssef operator&(const ssef &a, const ssei &b) -{ - return _mm_and_ps(a.m128, _mm_castsi128_ps(b.m128)); -} - -__forceinline const ssef operator|(const ssef &a, const ssef &b) -{ - return _mm_or_ps(a.m128, b.m128); -} -__forceinline const ssef operator|(const ssef &a, const ssei &b) -{ - return _mm_or_ps(a.m128, _mm_castsi128_ps(b.m128)); -} - -__forceinline const ssef andnot(const ssef &a, const ssef &b) -{ - return _mm_andnot_ps(a.m128, b.m128); -} - -__forceinline const ssef min(const ssef &a, const ssef &b) -{ - return _mm_min_ps(a.m128, b.m128); -} -__forceinline const ssef min(const ssef &a, const float &b) -{ - return _mm_min_ps(a.m128, ssef(b)); -} -__forceinline const ssef min(const float &a, const ssef &b) -{ - return _mm_min_ps(ssef(a), b.m128); -} - -__forceinline const ssef max(const 
ssef &a, const ssef &b) -{ - return _mm_max_ps(a.m128, b.m128); -} -__forceinline const ssef max(const ssef &a, const float &b) -{ - return _mm_max_ps(a.m128, ssef(b)); -} -__forceinline const ssef max(const float &a, const ssef &b) -{ - return _mm_max_ps(ssef(a), b.m128); -} - -# if defined(__KERNEL_SSE41__) -__forceinline ssef mini(const ssef &a, const ssef &b) -{ - const ssei ai = _mm_castps_si128(a); - const ssei bi = _mm_castps_si128(b); - const ssei ci = _mm_min_epi32(ai, bi); - return _mm_castsi128_ps(ci); -} -# endif - -# if defined(__KERNEL_SSE41__) -__forceinline ssef maxi(const ssef &a, const ssef &b) -{ - const ssei ai = _mm_castps_si128(a); - const ssei bi = _mm_castps_si128(b); - const ssei ci = _mm_max_epi32(ai, bi); - return _mm_castsi128_ps(ci); -} -# endif - -//////////////////////////////////////////////////////////////////////////////// -/// Ternary Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c) -{ -# if defined(__KERNEL_NEON__) - return vfmaq_f32(c, a, b); -# elif defined(__KERNEL_AVX2__) - return _mm_fmadd_ps(a, b, c); -# else - return a * b + c; -# endif -} -__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c) -{ -# if defined(__KERNEL_NEON__) - return vfmaq_f32(vnegq_f32(c), a, b); -# elif defined(__KERNEL_AVX2__) - return _mm_fmsub_ps(a, b, c); -# else - return a * b - c; -# endif -} -__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c) -{ -# if defined(__KERNEL_NEON__) - return vfmsq_f32(c, a, b); -# elif defined(__KERNEL_AVX2__) - return _mm_fnmadd_ps(a, b, c); -# else - return c - a * b; -# endif -} -__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c) -{ -# if defined(__KERNEL_NEON__) - return vfmsq_f32(vnegq_f32(c), a, b); -# elif defined(__KERNEL_AVX2__) - return _mm_fnmsub_ps(a, b, c); -# else - return -a * b - c; -# endif -} - 
-//////////////////////////////////////////////////////////////////////////////// -/// Assignment Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline ssef &operator+=(ssef &a, const ssef &b) -{ - return a = a + b; -} -__forceinline ssef &operator+=(ssef &a, const float &b) -{ - return a = a + b; -} - -__forceinline ssef &operator-=(ssef &a, const ssef &b) -{ - return a = a - b; -} -__forceinline ssef &operator-=(ssef &a, const float &b) -{ - return a = a - b; -} - -__forceinline ssef &operator*=(ssef &a, const ssef &b) -{ - return a = a * b; -} -__forceinline ssef &operator*=(ssef &a, const float &b) -{ - return a = a * b; -} - -__forceinline ssef &operator/=(ssef &a, const ssef &b) -{ - return a = a / b; -} -__forceinline ssef &operator/=(ssef &a, const float &b) -{ - return a = a / b; -} - -//////////////////////////////////////////////////////////////////////////////// -/// Comparison Operators + Select -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const sseb operator==(const ssef &a, const ssef &b) -{ - return _mm_cmpeq_ps(a.m128, b.m128); -} -__forceinline const sseb operator==(const ssef &a, const float &b) -{ - return a == ssef(b); -} -__forceinline const sseb operator==(const float &a, const ssef &b) -{ - return ssef(a) == b; -} - -__forceinline const sseb operator!=(const ssef &a, const ssef &b) -{ - return _mm_cmpneq_ps(a.m128, b.m128); -} -__forceinline const sseb operator!=(const ssef &a, const float &b) -{ - return a != ssef(b); -} -__forceinline const sseb operator!=(const float &a, const ssef &b) -{ - return ssef(a) != b; -} - -__forceinline const sseb operator<(const ssef &a, const ssef &b) -{ - return _mm_cmplt_ps(a.m128, b.m128); -} -__forceinline const sseb operator<(const ssef &a, const float &b) -{ - return a < ssef(b); -} -__forceinline const sseb operator<(const float &a, const ssef &b) -{ - return ssef(a) < b; -} - -__forceinline 
const sseb operator>=(const ssef &a, const ssef &b) -{ - return _mm_cmpnlt_ps(a.m128, b.m128); -} -__forceinline const sseb operator>=(const ssef &a, const float &b) -{ - return a >= ssef(b); -} -__forceinline const sseb operator>=(const float &a, const ssef &b) -{ - return ssef(a) >= b; -} - -__forceinline const sseb operator>(const ssef &a, const ssef &b) -{ - return _mm_cmpnle_ps(a.m128, b.m128); -} -__forceinline const sseb operator>(const ssef &a, const float &b) -{ - return a > ssef(b); -} -__forceinline const sseb operator>(const float &a, const ssef &b) -{ - return ssef(a) > b; -} - -__forceinline const sseb operator<=(const ssef &a, const ssef &b) -{ - return _mm_cmple_ps(a.m128, b.m128); -} -__forceinline const sseb operator<=(const ssef &a, const float &b) -{ - return a <= ssef(b); -} -__forceinline const sseb operator<=(const float &a, const ssef &b) -{ - return ssef(a) <= b; -} - -__forceinline const ssef select(const sseb &m, const ssef &t, const ssef &f) -{ -# ifdef __KERNEL_SSE41__ - return _mm_blendv_ps(f, t, m); -# else - return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); -# endif -} - -__forceinline const ssef select(const ssef &m, const ssef &t, const ssef &f) -{ -# ifdef __KERNEL_SSE41__ - return _mm_blendv_ps(f, t, m); -# else - return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); -# endif -} - -__forceinline const ssef select(const int mask, const ssef &t, const ssef &f) -{ -# if defined(__KERNEL_SSE41__) && \ - ((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER)) - return _mm_blend_ps(f, t, mask); -# else - return select(sseb(mask), t, f); -# endif -} - -//////////////////////////////////////////////////////////////////////////////// -/// Rounding Functions -//////////////////////////////////////////////////////////////////////////////// - -# if defined(__KERNEL_SSE41__) -__forceinline const ssef round_even(const ssef &a) -{ -# ifdef __KERNEL_NEON__ - return vrndnq_f32(a); -# else - return _mm_round_ps(a, 
_MM_FROUND_TO_NEAREST_INT); -# endif -} -__forceinline const ssef round_down(const ssef &a) -{ -# ifdef __KERNEL_NEON__ - return vrndmq_f32(a); -# else - return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF); -# endif -} -__forceinline const ssef round_up(const ssef &a) -{ -# ifdef __KERNEL_NEON__ - return vrndpq_f32(a); -# else - return _mm_round_ps(a, _MM_FROUND_TO_POS_INF); -# endif -} -__forceinline const ssef round_zero(const ssef &a) -{ -# ifdef __KERNEL_NEON__ - return vrndq_f32(a); -# else - return _mm_round_ps(a, _MM_FROUND_TO_ZERO); -# endif -} -__forceinline const ssef floor(const ssef &a) -{ -# ifdef __KERNEL_NEON__ - return vrndmq_f32(a); -# else - return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF); -# endif -} -__forceinline const ssef ceil(const ssef &a) -{ -# ifdef __KERNEL_NEON__ - return vrndpq_f32(a); -# else - return _mm_round_ps(a, _MM_FROUND_TO_POS_INF); -# endif -} -# else -/* Non-SSE4.1 fallback, needed for floorfrac. */ -__forceinline const ssef floor(const ssef &a) -{ - return _mm_set_ps(floorf(a.f[3]), floorf(a.f[2]), floorf(a.f[1]), floorf(a.f[0])); -} -# endif - -__forceinline ssei truncatei(const ssef &a) -{ - return _mm_cvttps_epi32(a.m128); -} - -__forceinline ssef floorfrac(const ssef &x, ssei *i) -{ - ssef f = floor(x); - *i = truncatei(f); - return x - f; -} - -//////////////////////////////////////////////////////////////////////////////// -/// Common Functions -//////////////////////////////////////////////////////////////////////////////// - -__forceinline ssef mix(const ssef &a, const ssef &b, const ssef &t) -{ - return madd(t, b, (ssef(1.0f) - t) * a); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Movement/Shifting/Shuffling Functions -//////////////////////////////////////////////////////////////////////////////// - -__forceinline ssef unpacklo(const ssef &a, const ssef &b) -{ - return _mm_unpacklo_ps(a.m128, b.m128); -} -__forceinline ssef unpackhi(const ssef &a, const ssef &b) -{ - return 
_mm_unpackhi_ps(a.m128, b.m128); -} - -template<size_t i0, size_t i1, size_t i2, size_t i3> -__forceinline const ssef shuffle(const ssef &b) -{ -# ifdef __KERNEL_NEON__ - return shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128); -# else - return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))); -# endif -} - -template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a) -{ - return _mm_movelh_ps(a, a); -} - -template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a) -{ - return _mm_movehl_ps(a, a); -} - -template<size_t i0, size_t i1, size_t i2, size_t i3> -__forceinline const ssef shuffle(const ssef &a, const ssef &b) -{ -# ifdef __KERNEL_NEON__ - return shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b); -# else - return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); -# endif -} - -template<size_t i0> __forceinline const ssef shuffle(const ssef &a, const ssef &b) -{ -# ifdef __KERNEL_NEON__ - return shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b); -# else - return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0)); -# endif -} - -# ifndef __KERNEL_NEON__ -template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a, const ssef &b) -{ - return _mm_movelh_ps(a, b); -} - -template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a, const ssef &b) -{ - return _mm_movehl_ps(b, a); -} -# endif - -# if defined(__KERNEL_SSSE3__) -__forceinline const ssef shuffle8(const ssef &a, const ssei &shuf) -{ - return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf)); -} -# endif - -# if defined(__KERNEL_SSE3__) -template<> __forceinline const ssef shuffle<0, 0, 2, 2>(const ssef &b) -{ - return _mm_moveldup_ps(b); -} -template<> __forceinline const ssef shuffle<1, 1, 3, 3>(const ssef &b) -{ - return _mm_movehdup_ps(b); -} -# endif - -template<size_t i0> __forceinline const ssef shuffle(const ssef &b) -{ - return shuffle<i0, i0, i0, i0>(b); -} - -# if defined(__KERNEL_AVX__) -__forceinline const 
ssef shuffle(const ssef &a, const ssei &shuf) -{ - return _mm_permutevar_ps(a, shuf); -} -# endif - -template<size_t i> __forceinline float extract(const ssef &a) -{ - return _mm_cvtss_f32(shuffle<i, i, i, i>(a)); -} -template<> __forceinline float extract<0>(const ssef &a) -{ - return _mm_cvtss_f32(a); -} - -# if defined(__KERNEL_SSE41__) -template<size_t dst, size_t src, size_t clr> -__forceinline const ssef insert(const ssef &a, const ssef &b) -{ -# ifdef __KERNEL_NEON__ - ssef res = a; - if (clr) - res[dst] = 0; - else - res[dst] = b[src]; - return res; -# else - return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); -# endif -} -template<size_t dst, size_t src> __forceinline const ssef insert(const ssef &a, const ssef &b) -{ - return insert<dst, src, 0>(a, b); -} -template<size_t dst> __forceinline const ssef insert(const ssef &a, const float b) -{ - return insert<dst, 0>(a, _mm_set_ss(b)); -} -# else -template<size_t dst> __forceinline const ssef insert(const ssef &a, const float b) -{ - ssef c = a; - c[dst] = b; - return c; -} -# endif - -//////////////////////////////////////////////////////////////////////////////// -/// Transpose -//////////////////////////////////////////////////////////////////////////////// - -__forceinline void transpose(const ssef &r0, - const ssef &r1, - const ssef &r2, - const ssef &r3, - ssef &c0, - ssef &c1, - ssef &c2, - ssef &c3) -{ - ssef l02 = unpacklo(r0, r2); - ssef h02 = unpackhi(r0, r2); - ssef l13 = unpacklo(r1, r3); - ssef h13 = unpackhi(r1, r3); - c0 = unpacklo(l02, l13); - c1 = unpackhi(l02, l13); - c2 = unpacklo(h02, h13); - c3 = unpackhi(h02, h13); -} - -__forceinline void transpose( - const ssef &r0, const ssef &r1, const ssef &r2, const ssef &r3, ssef &c0, ssef &c1, ssef &c2) -{ - ssef l02 = unpacklo(r0, r2); - ssef h02 = unpackhi(r0, r2); - ssef l13 = unpacklo(r1, r3); - ssef h13 = unpackhi(r1, r3); - c0 = unpacklo(l02, l13); - c1 = unpackhi(l02, l13); - c2 = unpacklo(h02, h13); -} - 
-//////////////////////////////////////////////////////////////////////////////// -/// Reductions -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const ssef vreduce_min(const ssef &v) -{ -# ifdef __KERNEL_NEON__ - return vdupq_n_f32(vminvq_f32(v)); -# else - ssef h = min(shuffle<1, 0, 3, 2>(v), v); - return min(shuffle<2, 3, 0, 1>(h), h); -# endif -} -__forceinline const ssef vreduce_max(const ssef &v) -{ -# ifdef __KERNEL_NEON__ - return vdupq_n_f32(vmaxvq_f32(v)); -# else - ssef h = max(shuffle<1, 0, 3, 2>(v), v); - return max(shuffle<2, 3, 0, 1>(h), h); -# endif -} -__forceinline const ssef vreduce_add(const ssef &v) -{ -# ifdef __KERNEL_NEON__ - return vdupq_n_f32(vaddvq_f32(v)); -# else - ssef h = shuffle<1, 0, 3, 2>(v) + v; - return shuffle<2, 3, 0, 1>(h) + h; -# endif -} - -__forceinline float reduce_min(const ssef &v) -{ -# ifdef __KERNEL_NEON__ - return vminvq_f32(v); -# else - return _mm_cvtss_f32(vreduce_min(v)); -# endif -} -__forceinline float reduce_max(const ssef &v) -{ -# ifdef __KERNEL_NEON__ - return vmaxvq_f32(v); -# else - return _mm_cvtss_f32(vreduce_max(v)); -# endif -} -__forceinline float reduce_add(const ssef &v) -{ -# ifdef __KERNEL_NEON__ - return vaddvq_f32(v); -# else - return _mm_cvtss_f32(vreduce_add(v)); -# endif -} - -__forceinline uint32_t select_min(const ssef &v) -{ - return __bsf(movemask(v == vreduce_min(v))); -} -__forceinline uint32_t select_max(const ssef &v) -{ - return __bsf(movemask(v == vreduce_max(v))); -} - -__forceinline uint32_t select_min(const sseb &valid, const ssef &v) -{ - const ssef a = select(valid, v, ssef(pos_inf)); - return __bsf(movemask(valid & (a == vreduce_min(a)))); -} -__forceinline uint32_t select_max(const sseb &valid, const ssef &v) -{ - const ssef a = select(valid, v, ssef(neg_inf)); - return __bsf(movemask(valid & (a == vreduce_max(a)))); -} - -__forceinline uint32_t movemask(const ssef &a) -{ - return _mm_movemask_ps(a); -} - 
-//////////////////////////////////////////////////////////////////////////////// -/// Memory load and store operations -//////////////////////////////////////////////////////////////////////////////// - -__forceinline ssef load4f(const float4 &a) -{ -# ifdef __KERNEL_WITH_SSE_ALIGN__ - return _mm_load_ps(&a.x); -# else - return _mm_loadu_ps(&a.x); -# endif -} - -__forceinline ssef load4f(const float3 &a) -{ -# ifdef __KERNEL_WITH_SSE_ALIGN__ - return _mm_load_ps(&a.x); -# else - return _mm_loadu_ps(&a.x); -# endif -} - -__forceinline ssef load4f(const void *const a) -{ - return _mm_load_ps((float *)a); -} - -__forceinline ssef load1f_first(const float a) -{ - return _mm_set_ss(a); -} - -__forceinline void store4f(void *ptr, const ssef &v) -{ - _mm_store_ps((float *)ptr, v); -} - -__forceinline ssef loadu4f(const void *const a) -{ - return _mm_loadu_ps((float *)a); -} - -__forceinline void storeu4f(void *ptr, const ssef &v) -{ - _mm_storeu_ps((float *)ptr, v); -} - -__forceinline void store4f(const sseb &mask, void *ptr, const ssef &f) -{ -# if defined(__KERNEL_AVX__) - _mm_maskstore_ps((float *)ptr, (__m128i)mask, f); -# else - *(ssef *)ptr = select(mask, f, *(ssef *)ptr); -# endif -} - -__forceinline ssef load4f_nt(void *ptr) -{ -# if defined(__KERNEL_SSE41__) - return _mm_castsi128_ps(_mm_stream_load_si128((__m128i *)ptr)); -# else - return _mm_load_ps((float *)ptr); -# endif -} - -__forceinline void store4f_nt(void *ptr, const ssef &v) -{ -# if defined(__KERNEL_SSE41__) - _mm_stream_ps((float *)ptr, v); -# else - _mm_store_ps((float *)ptr, v); -# endif -} - -//////////////////////////////////////////////////////////////////////////////// -/// Euclidean Space Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline float dot(const ssef &a, const ssef &b) -{ - return reduce_add(a * b); -} - -/* calculate shuffled cross product, useful when order of components does not matter */ -__forceinline ssef 
cross_zxy(const ssef &a, const ssef &b) -{ - const ssef a0 = a; - const ssef b0 = shuffle<1, 2, 0, 3>(b); - const ssef a1 = shuffle<1, 2, 0, 3>(a); - const ssef b1 = b; - return msub(a0, b0, a1 * b1); -} - -__forceinline ssef cross(const ssef &a, const ssef &b) -{ - return shuffle<1, 2, 0, 3>(cross_zxy(a, b)); -} - -ccl_device_inline const ssef dot3_splat(const ssef &a, const ssef &b) -{ -# ifdef __KERNEL_SSE41__ - return _mm_dp_ps(a.m128, b.m128, 0x7f); -# else - ssef t = a * b; - return ssef(((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2]); -# endif -} - -/* squared length taking only specified axes into account */ -template<size_t X, size_t Y, size_t Z, size_t W> ccl_device_inline float len_squared(const ssef &a) -{ -# ifndef __KERNEL_SSE41__ - float4 &t = (float4 &)a; - return (X ? t.x * t.x : 0.0f) + (Y ? t.y * t.y : 0.0f) + (Z ? t.z * t.z : 0.0f) + - (W ? t.w * t.w : 0.0f); -# else - return extract<0>( - ssef(_mm_dp_ps(a.m128, a.m128, (X << 4) | (Y << 5) | (Z << 6) | (W << 7) | 0xf))); -# endif -} - -ccl_device_inline float dot3(const ssef &a, const ssef &b) -{ -# ifdef __KERNEL_SSE41__ - return extract<0>(ssef(_mm_dp_ps(a.m128, b.m128, 0x7f))); -# else - ssef t = a * b; - return ((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2]; -# endif -} - -ccl_device_inline const ssef len3_squared_splat(const ssef &a) -{ - return dot3_splat(a, a); -} - -ccl_device_inline float len3_squared(const ssef &a) -{ - return dot3(a, a); -} - -ccl_device_inline float len3(const ssef &a) -{ - return extract<0>(mm_sqrt(dot3_splat(a, a))); -} - -/* SSE shuffle utility functions */ - -# ifdef __KERNEL_SSSE3__ - -/* faster version for SSSE3 */ -typedef ssei shuffle_swap_t; - -ccl_device_inline shuffle_swap_t shuffle_swap_identity() -{ - return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); -} - -ccl_device_inline shuffle_swap_t shuffle_swap_swap() -{ - return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); -} - -ccl_device_inline 
const ssef shuffle_swap(const ssef &a, const shuffle_swap_t &shuf) -{ - return cast(_mm_shuffle_epi8(cast(a), shuf)); -} - -# else - -/* somewhat slower version for SSE2 */ -typedef int shuffle_swap_t; - -ccl_device_inline shuffle_swap_t shuffle_swap_identity() -{ - return 0; -} - -ccl_device_inline shuffle_swap_t shuffle_swap_swap() -{ - return 1; -} - -ccl_device_inline const ssef shuffle_swap(const ssef &a, shuffle_swap_t shuf) -{ - /* shuffle value must be a constant, so we need to branch */ - if (shuf) - return shuffle<1, 0, 3, 2>(a); - else - return shuffle<3, 2, 1, 0>(a); -} - -# endif - -# if defined(__KERNEL_SSE41__) && !defined(__KERNEL_NEON__) - -ccl_device_inline void gen_idirsplat_swap(const ssef &pn, - const shuffle_swap_t &shuf_identity, - const shuffle_swap_t &shuf_swap, - const float3 &idir, - ssef idirsplat[3], - shuffle_swap_t shufflexyz[3]) -{ - const __m128 idirsplat_raw[] = {_mm_set_ps1(idir.x), _mm_set_ps1(idir.y), _mm_set_ps1(idir.z)}; - idirsplat[0] = _mm_xor_ps(idirsplat_raw[0], pn); - idirsplat[1] = _mm_xor_ps(idirsplat_raw[1], pn); - idirsplat[2] = _mm_xor_ps(idirsplat_raw[2], pn); - - const ssef signmask = cast(ssei(0x80000000)); - const ssef shuf_identity_f = cast(shuf_identity); - const ssef shuf_swap_f = cast(shuf_swap); - - shufflexyz[0] = _mm_castps_si128( - _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[0], signmask))); - shufflexyz[1] = _mm_castps_si128( - _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[1], signmask))); - shufflexyz[2] = _mm_castps_si128( - _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[2], signmask))); -} - -# else - -ccl_device_inline void gen_idirsplat_swap(const ssef &pn, - const shuffle_swap_t &shuf_identity, - const shuffle_swap_t &shuf_swap, - const float3 &idir, - ssef idirsplat[3], - shuffle_swap_t shufflexyz[3]) -{ - idirsplat[0] = ssef(idir.x) ^ pn; - idirsplat[1] = ssef(idir.y) ^ pn; - idirsplat[2] = ssef(idir.z) ^ pn; - - shufflexyz[0] 
= (idir.x >= 0) ? shuf_identity : shuf_swap; - shufflexyz[1] = (idir.y >= 0) ? shuf_identity : shuf_swap; - shufflexyz[2] = (idir.z >= 0) ? shuf_identity : shuf_swap; -} - -# endif - -ccl_device_inline const ssef uint32_to_float(const ssei &in) -{ - ssei a = _mm_srli_epi32(in, 16); - ssei b = _mm_and_si128(in, _mm_set1_epi32(0x0000ffff)); - ssei c = _mm_or_si128(a, _mm_set1_epi32(0x53000000)); - ssef d = _mm_cvtepi32_ps(b); - ssef e = _mm_sub_ps(_mm_castsi128_ps(c), _mm_castsi128_ps(_mm_set1_epi32(0x53000000))); - return _mm_add_ps(e, d); -} - -template<size_t S1, size_t S2, size_t S3, size_t S4> -ccl_device_inline const ssef set_sign_bit(const ssef &a) -{ - return cast(cast(a) ^ ssei(S1 << 31, S2 << 31, S3 << 31, S4 << 31)); -} - -//////////////////////////////////////////////////////////////////////////////// -/// Debug Functions -//////////////////////////////////////////////////////////////////////////////// - -ccl_device_inline void print_ssef(const char *label, const ssef &a) -{ - printf( - "%s: %.8f %.8f %.8f %.8f\n", label, (double)a[0], (double)a[1], (double)a[2], (double)a[3]); -} - -#endif - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/util/ssei.h b/intern/cycles/util/ssei.h deleted file mode 100644 index 5caf44c967f..00000000000 --- a/intern/cycles/util/ssei.h +++ /dev/null @@ -1,633 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2011-2013 Intel Corporation - * Modifications Copyright 2014-2022 Blender Foundation. */ - -#ifndef __UTIL_SSEI_H__ -#define __UTIL_SSEI_H__ - -CCL_NAMESPACE_BEGIN - -#ifdef __KERNEL_SSE2__ - -struct sseb; -struct ssef; - -/*! 4-wide SSE integer type. 
*/ -struct ssei { - typedef sseb Mask; // mask type - typedef ssei Int; // int type - typedef ssef Float; // float type - - enum { size = 4 }; // number of SIMD elements - union { - __m128i m128; - int32_t i[4]; - }; // data - - //////////////////////////////////////////////////////////////////////////////// - /// Constructors, Assignment & Cast Operators - //////////////////////////////////////////////////////////////////////////////// - - __forceinline ssei() - { - } - __forceinline ssei(const ssei &a) - { - m128 = a.m128; - } - __forceinline ssei &operator=(const ssei &a) - { - m128 = a.m128; - return *this; - } - - __forceinline ssei(const __m128i a) : m128(a) - { - } - __forceinline operator const __m128i &(void) const - { - return m128; - } - __forceinline operator __m128i &(void) - { - return m128; - } - - __forceinline ssei(const int a) : m128(_mm_set1_epi32(a)) - { - } - __forceinline ssei(int a, int b, int c, int d) : m128(_mm_setr_epi32(a, b, c, d)) - { - } - - __forceinline explicit ssei(const __m128 a) : m128(_mm_cvtps_epi32(a)) - { - } - - //////////////////////////////////////////////////////////////////////////////// - /// Array Access - //////////////////////////////////////////////////////////////////////////////// - - __forceinline const int32_t &operator[](const size_t index) const - { - assert(index < 4); - return i[index]; - } - __forceinline int32_t &operator[](const size_t index) - { - assert(index < 4); - return i[index]; - } -}; - -//////////////////////////////////////////////////////////////////////////////// -/// Unary Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const ssei cast(const __m128 &a) -{ - return _mm_castps_si128(a); -} -__forceinline const ssei operator+(const ssei &a) -{ - return a; -} -__forceinline const ssei operator-(const ssei &a) -{ - return _mm_sub_epi32(_mm_setzero_si128(), a.m128); -} -# if defined(__KERNEL_SSSE3__) -__forceinline const ssei abs(const 
ssei &a) -{ - return _mm_abs_epi32(a.m128); -} -# endif - -//////////////////////////////////////////////////////////////////////////////// -/// Binary Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const ssei operator+(const ssei &a, const ssei &b) -{ - return _mm_add_epi32(a.m128, b.m128); -} -__forceinline const ssei operator+(const ssei &a, const int32_t &b) -{ - return a + ssei(b); -} -__forceinline const ssei operator+(const int32_t &a, const ssei &b) -{ - return ssei(a) + b; -} - -__forceinline const ssei operator-(const ssei &a, const ssei &b) -{ - return _mm_sub_epi32(a.m128, b.m128); -} -__forceinline const ssei operator-(const ssei &a, const int32_t &b) -{ - return a - ssei(b); -} -__forceinline const ssei operator-(const int32_t &a, const ssei &b) -{ - return ssei(a) - b; -} - -# if defined(__KERNEL_SSE41__) -__forceinline const ssei operator*(const ssei &a, const ssei &b) -{ - return _mm_mullo_epi32(a.m128, b.m128); -} -__forceinline const ssei operator*(const ssei &a, const int32_t &b) -{ - return a * ssei(b); -} -__forceinline const ssei operator*(const int32_t &a, const ssei &b) -{ - return ssei(a) * b; -} -# endif - -__forceinline const ssei operator&(const ssei &a, const ssei &b) -{ - return _mm_and_si128(a.m128, b.m128); -} -__forceinline const ssei operator&(const ssei &a, const int32_t &b) -{ - return a & ssei(b); -} -__forceinline const ssei operator&(const int32_t &a, const ssei &b) -{ - return ssei(a) & b; -} - -__forceinline const ssei operator|(const ssei &a, const ssei &b) -{ - return _mm_or_si128(a.m128, b.m128); -} -__forceinline const ssei operator|(const ssei &a, const int32_t &b) -{ - return a | ssei(b); -} -__forceinline const ssei operator|(const int32_t &a, const ssei &b) -{ - return ssei(a) | b; -} - -__forceinline const ssei operator^(const ssei &a, const ssei &b) -{ - return _mm_xor_si128(a.m128, b.m128); -} -__forceinline const ssei operator^(const ssei &a, const 
int32_t &b) -{ - return a ^ ssei(b); -} -__forceinline const ssei operator^(const int32_t &a, const ssei &b) -{ - return ssei(a) ^ b; -} - -__forceinline const ssei operator<<(const ssei &a, const int32_t &n) -{ - return _mm_slli_epi32(a.m128, n); -} -__forceinline const ssei operator>>(const ssei &a, const int32_t &n) -{ - return _mm_srai_epi32(a.m128, n); -} - -__forceinline const ssei andnot(const ssei &a, const ssei &b) -{ - return _mm_andnot_si128(a.m128, b.m128); -} -__forceinline const ssei andnot(const sseb &a, const ssei &b) -{ - return _mm_andnot_si128(cast(a.m128), b.m128); -} -__forceinline const ssei andnot(const ssei &a, const sseb &b) -{ - return _mm_andnot_si128(a.m128, cast(b.m128)); -} - -__forceinline const ssei sra(const ssei &a, const int32_t &b) -{ - return _mm_srai_epi32(a.m128, b); -} -__forceinline const ssei srl(const ssei &a, const int32_t &b) -{ - return _mm_srli_epi32(a.m128, b); -} - -# if defined(__KERNEL_SSE41__) -__forceinline const ssei min(const ssei &a, const ssei &b) -{ - return _mm_min_epi32(a.m128, b.m128); -} -__forceinline const ssei min(const ssei &a, const int32_t &b) -{ - return min(a, ssei(b)); -} -__forceinline const ssei min(const int32_t &a, const ssei &b) -{ - return min(ssei(a), b); -} - -__forceinline const ssei max(const ssei &a, const ssei &b) -{ - return _mm_max_epi32(a.m128, b.m128); -} -__forceinline const ssei max(const ssei &a, const int32_t &b) -{ - return max(a, ssei(b)); -} -__forceinline const ssei max(const int32_t &a, const ssei &b) -{ - return max(ssei(a), b); -} -# endif - -//////////////////////////////////////////////////////////////////////////////// -/// Assignment Operators -//////////////////////////////////////////////////////////////////////////////// - -__forceinline ssei &operator+=(ssei &a, const ssei &b) -{ - return a = a + b; -} -__forceinline ssei &operator+=(ssei &a, const int32_t &b) -{ - return a = a + b; -} - -__forceinline ssei &operator-=(ssei &a, const ssei &b) -{ - return a = a 
- b; -} -__forceinline ssei &operator-=(ssei &a, const int32_t &b) -{ - return a = a - b; -} - -# if defined(__KERNEL_SSE41__) -__forceinline ssei &operator*=(ssei &a, const ssei &b) -{ - return a = a * b; -} -__forceinline ssei &operator*=(ssei &a, const int32_t &b) -{ - return a = a * b; -} -# endif - -__forceinline ssei &operator&=(ssei &a, const ssei &b) -{ - return a = a & b; -} -__forceinline ssei &operator&=(ssei &a, const int32_t &b) -{ - return a = a & b; -} - -__forceinline ssei &operator|=(ssei &a, const ssei &b) -{ - return a = a | b; -} -__forceinline ssei &operator|=(ssei &a, const int32_t &b) -{ - return a = a | b; -} - -__forceinline ssei &operator^=(ssei &a, const ssei &b) -{ - return a = a ^ b; -} -__forceinline ssei &operator^=(ssei &a, const int32_t &b) -{ - return a = a ^ b; -} - -__forceinline ssei &operator<<=(ssei &a, const int32_t &b) -{ - return a = a << b; -} -__forceinline ssei &operator>>=(ssei &a, const int32_t &b) -{ - return a = a >> b; -} - -//////////////////////////////////////////////////////////////////////////////// -/// Comparison Operators + Select -//////////////////////////////////////////////////////////////////////////////// - -__forceinline const sseb operator==(const ssei &a, const ssei &b) -{ - return _mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128)); -} -__forceinline const sseb operator==(const ssei &a, const int32_t &b) -{ - return a == ssei(b); -} -__forceinline const sseb operator==(const int32_t &a, const ssei &b) -{ - return ssei(a) == b; -} - -__forceinline const sseb operator!=(const ssei &a, const ssei &b) -{ - return !(a == b); -} -__forceinline const sseb operator!=(const ssei &a, const int32_t &b) -{ - return a != ssei(b); -} -__forceinline const sseb operator!=(const int32_t &a, const ssei &b) -{ - return ssei(a) != b; -} - -__forceinline const sseb operator<(const ssei &a, const ssei &b) -{ - return _mm_castsi128_ps(_mm_cmplt_epi32(a.m128, b.m128)); -} -__forceinline const sseb operator<(const ssei &a, 
const int32_t &b) -{ - return a < ssei(b); -} -__forceinline const sseb operator<(const int32_t &a, const ssei &b) -{ - return ssei(a) < b; -} - -__forceinline const sseb operator>=(const ssei &a, const ssei &b) -{ - return !(a < b); -} -__forceinline const sseb operator>=(const ssei &a, const int32_t &b) -{ - return a >= ssei(b); -} -__forceinline const sseb operator>=(const int32_t &a, const ssei &b) -{ - return ssei(a) >= b; -} - -__forceinline const sseb operator>(const ssei &a, const ssei &b) -{ - return _mm_castsi128_ps(_mm_cmpgt_epi32(a.m128, b.m128)); -} -__forceinline const sseb operator>(const ssei &a, const int32_t &b) -{ - return a > ssei(b); -} -__forceinline const sseb operator>(const int32_t &a, const ssei &b) -{ - return ssei(a) > b; -} - -__forceinline const sseb operator<=(const ssei &a, const ssei &b) -{ - return !(a > b); -} -__forceinline const sseb operator<=(const ssei &a, const int32_t &b) -{ - return a <= ssei(b); -} -__forceinline const sseb operator<=(const int32_t &a, const ssei &b) -{ - return ssei(a) <= b; -} - -__forceinline const ssei select(const sseb &m, const ssei &t, const ssei &f) -{ -# ifdef __KERNEL_SSE41__ - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); -# else - return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); -# endif -} - -__forceinline const ssei select(const int mask, const ssei &t, const ssei &f) -{ -# if defined(__KERNEL_SSE41__) && \ - ((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER)) - return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); -# else - return select(sseb(mask), t, f); -# endif -} - -//////////////////////////////////////////////////////////////////////////////// -// Movement/Shifting/Shuffling Functions -//////////////////////////////////////////////////////////////////////////////// - -__forceinline ssei unpacklo(const ssei &a, const ssei &b) -{ - return _mm_unpacklo_epi32(a, b); -} 
-__forceinline ssei unpackhi(const ssei &a, const ssei &b) -{ - return _mm_unpackhi_epi32(a, b); -} - -template<size_t i0, size_t i1, size_t i2, size_t i3> -__forceinline const ssei shuffle(const ssei &a) -{ -# ifdef __KERNEL_NEON__ - int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a)); - return vreinterpretq_m128i_s32(result); -# else - return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)); -# endif -} - -template<size_t i0, size_t i1, size_t i2, size_t i3> -__forceinline const ssei shuffle(const ssei &a, const ssei &b) -{ -# ifdef __KERNEL_NEON__ - int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a), - vreinterpretq_s32_m128i(b)); - return vreinterpretq_m128i_s32(result); -# else - return _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); -# endif -} - -template<size_t i0> __forceinline const ssei shuffle(const ssei &b) -{ - return shuffle<i0, i0, i0, i0>(b); -} - -# if defined(__KERNEL_SSE41__) -template<size_t src> __forceinline int extract(const ssei &b) -{ - return _mm_extract_epi32(b, src); -} -template<size_t dst> __forceinline const ssei insert(const ssei &a, const int32_t b) -{ - return _mm_insert_epi32(a, b, dst); -} -# else -template<size_t src> __forceinline int extract(const ssei &b) -{ - return b[src]; -} -template<size_t dst> __forceinline const ssei insert(const ssei &a, const int32_t b) -{ - ssei c = a; - c[dst] = b; - return c; -} -# endif - -//////////////////////////////////////////////////////////////////////////////// -/// Reductions -//////////////////////////////////////////////////////////////////////////////// - -# if defined(__KERNEL_SSE41__) -__forceinline const ssei vreduce_min(const ssei &v) -{ - ssei h = min(shuffle<1, 0, 3, 2>(v), v); - return min(shuffle<2, 3, 0, 1>(h), h); -} -__forceinline const ssei vreduce_max(const ssei &v) -{ - ssei h = max(shuffle<1, 0, 3, 2>(v), v); - return max(shuffle<2, 3, 0, 
1>(h), h); -} -__forceinline const ssei vreduce_add(const ssei &v) -{ - ssei h = shuffle<1, 0, 3, 2>(v) + v; - return shuffle<2, 3, 0, 1>(h) + h; -} - -__forceinline int reduce_min(const ssei &v) -{ -# ifdef __KERNEL_NEON__ - return vminvq_s32(vreinterpretq_s32_m128i(v)); -# else - return extract<0>(vreduce_min(v)); -# endif -} -__forceinline int reduce_max(const ssei &v) -{ -# ifdef __KERNEL_NEON__ - return vmaxvq_s32(vreinterpretq_s32_m128i(v)); -# else - return extract<0>(vreduce_max(v)); -# endif -} -__forceinline int reduce_add(const ssei &v) -{ -# ifdef __KERNEL_NEON__ - return vaddvq_s32(vreinterpretq_s32_m128i(v)); -# else - return extract<0>(vreduce_add(v)); -# endif -} - -__forceinline uint32_t select_min(const ssei &v) -{ - return __bsf(movemask(v == vreduce_min(v))); -} -__forceinline uint32_t select_max(const ssei &v) -{ - return __bsf(movemask(v == vreduce_max(v))); -} - -__forceinline uint32_t select_min(const sseb &valid, const ssei &v) -{ - const ssei a = select(valid, v, ssei((int)pos_inf)); - return __bsf(movemask(valid & (a == vreduce_min(a)))); -} -__forceinline uint32_t select_max(const sseb &valid, const ssei &v) -{ - const ssei a = select(valid, v, ssei((int)neg_inf)); - return __bsf(movemask(valid & (a == vreduce_max(a)))); -} - -# else - -__forceinline int ssei_min(int a, int b) -{ - return (a < b) ? a : b; -} -__forceinline int ssei_max(int a, int b) -{ - return (a > b) ? 
a : b; -} -__forceinline int reduce_min(const ssei &v) -{ - return ssei_min(ssei_min(v[0], v[1]), ssei_min(v[2], v[3])); -} -__forceinline int reduce_max(const ssei &v) -{ - return ssei_max(ssei_max(v[0], v[1]), ssei_max(v[2], v[3])); -} -__forceinline int reduce_add(const ssei &v) -{ - return v[0] + v[1] + v[2] + v[3]; -} - -# endif - -//////////////////////////////////////////////////////////////////////////////// -/// Memory load and store operations -//////////////////////////////////////////////////////////////////////////////// - -__forceinline ssei load4i(const void *const a) -{ - return _mm_load_si128((__m128i *)a); -} - -__forceinline void store4i(void *ptr, const ssei &v) -{ - _mm_store_si128((__m128i *)ptr, v); -} - -__forceinline void storeu4i(void *ptr, const ssei &v) -{ - _mm_storeu_si128((__m128i *)ptr, v); -} - -__forceinline void store4i(const sseb &mask, void *ptr, const ssei &i) -{ -# if defined(__KERNEL_AVX__) - _mm_maskstore_ps((float *)ptr, (__m128i)mask, _mm_castsi128_ps(i)); -# else - *(ssei *)ptr = select(mask, i, *(ssei *)ptr); -# endif -} - -__forceinline ssei load4i_nt(void *ptr) -{ -# if defined(__KERNEL_SSE41__) - return _mm_stream_load_si128((__m128i *)ptr); -# else - return _mm_load_si128((__m128i *)ptr); -# endif -} - -__forceinline void store4i_nt(void *ptr, const ssei &v) -{ -# if defined(__KERNEL_SSE41__) - _mm_stream_ps((float *)ptr, _mm_castsi128_ps(v)); -# else - _mm_store_si128((__m128i *)ptr, v); -# endif -} - -//////////////////////////////////////////////////////////////////////////////// -/// Debug Functions -//////////////////////////////////////////////////////////////////////////////// - -ccl_device_inline void print_ssei(const char *label, const ssei &a) -{ - printf("%s: %df %df %df %d\n", label, a[0], a[1], a[2], a[3]); -} - -#endif - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp index cb985c65dd8..84116262437 100644 --- 
a/intern/cycles/util/transform.cpp +++ b/intern/cycles/util/transform.cpp @@ -102,7 +102,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm) return projection_identity(); } - memcpy(&tfmR, R, sizeof(R)); + memcpy(&tfmR.x[0], R, sizeof(R)); return tfmR; } diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h index 24184dc7074..0c39901a63c 100644 --- a/intern/cycles/util/transform.h +++ b/intern/cycles/util/transform.h @@ -63,17 +63,16 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f { /* TODO(sergey): Disabled for now, causes crashes in certain cases. */ #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__) - ssef x, y, z, w, aa; - aa = a.m128; + const float4 aa(a.m128); - x = _mm_loadu_ps(&t->x.x); - y = _mm_loadu_ps(&t->y.x); - z = _mm_loadu_ps(&t->z.x); - w = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); + float4 x(_mm_loadu_ps(&t->x.x)); + float4 y(_mm_loadu_ps(&t->y.x)); + float4 z(_mm_loadu_ps(&t->z.x)); + float4 w(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f)); - _MM_TRANSPOSE4_PS(x, y, z, w); + _MM_TRANSPOSE4_PS(x.m128, y.m128, z.m128, w.m128); - ssef tmp = w; + float4 tmp = w; tmp = madd(shuffle<2>(aa), z, tmp); tmp = madd(shuffle<1>(aa), y, tmp); tmp = madd(shuffle<0>(aa), x, tmp); @@ -94,16 +93,16 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f ccl_device_inline float3 transform_direction(ccl_private const Transform *t, const float3 a) { #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__) - ssef x, y, z, w, aa; - aa = a.m128; - x = _mm_loadu_ps(&t->x.x); - y = _mm_loadu_ps(&t->y.x); - z = _mm_loadu_ps(&t->z.x); - w = _mm_setzero_ps(); + const float4 aa(a.m128); - _MM_TRANSPOSE4_PS(x, y, z, w); + float4 x(_mm_loadu_ps(&t->x.x)); + float4 y(_mm_loadu_ps(&t->y.x)); + float4 z(_mm_loadu_ps(&t->z.x)); + float4 w(_mm_setzero_ps()); - ssef tmp = shuffle<2>(aa) * z; + _MM_TRANSPOSE4_PS(x.m128, y.m128, z.m128, w.m128); + + float4 tmp = shuffle<2>(aa) * z; tmp = 
madd(shuffle<1>(aa), y, tmp); tmp = madd(shuffle<0>(aa), x, tmp); @@ -197,14 +196,7 @@ ccl_device_inline Transform make_transform_frame(float3 N) return make_transform(dx.x, dx.y, dx.z, 0.0f, dy.x, dy.y, dy.z, 0.0f, N.x, N.y, N.z, 0.0f); } -#ifndef __KERNEL_GPU__ - -ccl_device_inline Transform transform_zero() -{ - Transform zero = {zero_float4(), zero_float4(), zero_float4()}; - return zero; -} - +#if !defined(__KERNEL_METAL__) ccl_device_inline Transform operator*(const Transform a, const Transform b) { float4 c_x = make_float4(b.x.x, b.y.x, b.z.x, 0.0f); @@ -219,6 +211,15 @@ ccl_device_inline Transform operator*(const Transform a, const Transform b) return t; } +#endif + +#ifndef __KERNEL_GPU__ + +ccl_device_inline Transform transform_zero() +{ + Transform zero = {zero_float4(), zero_float4(), zero_float4()}; + return zero; +} ccl_device_inline void print_transform(const char *label, const Transform &t) { diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h index bb410a6daef..2faac576d82 100644 --- a/intern/cycles/util/transform_inverse.h +++ b/intern/cycles/util/transform_inverse.h @@ -9,26 +9,33 @@ CCL_NAMESPACE_BEGIN * Normally we don't use SSE41/AVX outside the kernel, but for this it's * important to match exactly for ray tracing precision. 
*/ -ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b) +ccl_device_forceinline float3 transform_inverse_cross(const float3 a_, const float3 b_) { #if defined(__AVX2__) && defined(__KERNEL_SSE2__) - const ssef sse_a = (const __m128 &)a; - const ssef sse_b = (const __m128 &)b; - const ssef r = shuffle<1, 2, 0, 3>( - ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b))); + const __m128 a = (const __m128 &)a_; + const __m128 b = (const __m128 &)b_; + const __m128 a_shuffle = _mm_castsi128_ps( + _mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(3, 0, 2, 1))); + const __m128 b_shuffle = _mm_castsi128_ps( + _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(3, 0, 2, 1))); + const __m128 r = _mm_castsi128_ps( + _mm_shuffle_epi32(_mm_castps_si128(_mm_fmsub_ps(a, b_shuffle, _mm_mul_ps(a_shuffle, b))), + _MM_SHUFFLE(3, 0, 2, 1))); return (const float3 &)r; #endif - return cross(a, b); + return cross(a_, b_); } -ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b) +ccl_device_forceinline float transform_inverse_dot(const float3 a_, const float3 b_) { -#ifdef __SSE4_1__ - return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F)); +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + const __m128 a = (const __m128 &)a_; + const __m128 b = (const __m128 &)b_; + return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); #endif - return dot(a, b); + return dot(a_, b_); } ccl_device_forceinline Transform transform_inverse_impl(const Transform tfm) diff --git a/intern/cycles/util/types.h b/intern/cycles/util/types.h index 1ab6f76f9bc..cf7f35c4116 100644 --- a/intern/cycles/util/types.h +++ b/intern/cycles/util/types.h @@ -97,6 +97,7 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a) #include "util/types_int2.h" #include "util/types_int3.h" #include "util/types_int4.h" +#include "util/types_int8.h" #include "util/types_uint2.h" #include 
"util/types_uint3.h" @@ -119,6 +120,7 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a) #include "util/types_int2_impl.h" #include "util/types_int3_impl.h" #include "util/types_int4_impl.h" +#include "util/types_int8_impl.h" #include "util/types_uint2_impl.h" #include "util/types_uint3_impl.h" @@ -129,16 +131,4 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a) #include "util/types_float4_impl.h" #include "util/types_float8_impl.h" -/* SSE types. */ -#ifndef __KERNEL_GPU__ -# include "util/sseb.h" -# include "util/ssef.h" -# include "util/ssei.h" -# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) -# include "util/avxb.h" -# include "util/avxf.h" -# include "util/avxi.h" -# endif -#endif - #endif /* __UTIL_TYPES_H__ */ diff --git a/intern/cycles/util/types_float8.h b/intern/cycles/util/types_float8.h index 29fd632f08e..121141ddfd9 100644 --- a/intern/cycles/util/types_float8.h +++ b/intern/cycles/util/types_float8.h @@ -11,15 +11,15 @@ CCL_NAMESPACE_BEGIN /* float8 is a reserved type in Metal that has not been implemented. For - * that reason this is named float8_t and not using native vector types. */ + * that reason this is named vfloat8 and not using native vector types. 
*/ #ifdef __KERNEL_GPU__ -struct float8_t +struct vfloat8 #else -struct ccl_try_align(32) float8_t +struct ccl_try_align(32) vfloat8 #endif { -#ifdef __KERNEL_AVX2__ +#ifdef __KERNEL_AVX__ union { __m256 m256; struct { @@ -27,18 +27,18 @@ struct ccl_try_align(32) float8_t }; }; - __forceinline float8_t(); - __forceinline float8_t(const float8_t &a); - __forceinline explicit float8_t(const __m256 &a); + __forceinline vfloat8(); + __forceinline vfloat8(const vfloat8 &a); + __forceinline explicit vfloat8(const __m256 &a); __forceinline operator const __m256 &() const; __forceinline operator __m256 &(); - __forceinline float8_t &operator=(const float8_t &a); + __forceinline vfloat8 &operator=(const vfloat8 &a); -#else /* __KERNEL_AVX2__ */ +#else /* __KERNEL_AVX__ */ float a, b, c, d, e, f, g, h; -#endif /* __KERNEL_AVX2__ */ +#endif /* __KERNEL_AVX__ */ #ifndef __KERNEL_GPU__ __forceinline float operator[](int i) const; @@ -46,8 +46,11 @@ struct ccl_try_align(32) float8_t #endif }; -ccl_device_inline float8_t make_float8_t(float f); -ccl_device_inline float8_t -make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h); +ccl_device_inline vfloat8 make_vfloat8(float f); +ccl_device_inline vfloat8 +make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h); +ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b); + +ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a); CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float8_impl.h b/intern/cycles/util/types_float8_impl.h index e8576cdaf70..9f42e0f663c 100644 --- a/intern/cycles/util/types_float8_impl.h +++ b/intern/cycles/util/types_float8_impl.h @@ -10,45 +10,45 @@ CCL_NAMESPACE_BEGIN -#ifdef __KERNEL_AVX2__ -__forceinline float8_t::float8_t() +#ifdef __KERNEL_AVX__ +__forceinline vfloat8::vfloat8() { } -__forceinline float8_t::float8_t(const float8_t &f) : m256(f.m256) +__forceinline vfloat8::vfloat8(const 
vfloat8 &f) : m256(f.m256) { } -__forceinline float8_t::float8_t(const __m256 &f) : m256(f) +__forceinline vfloat8::vfloat8(const __m256 &f) : m256(f) { } -__forceinline float8_t::operator const __m256 &() const +__forceinline vfloat8::operator const __m256 &() const { return m256; } -__forceinline float8_t::operator __m256 &() +__forceinline vfloat8::operator __m256 &() { return m256; } -__forceinline float8_t &float8_t::operator=(const float8_t &f) +__forceinline vfloat8 &vfloat8::operator=(const vfloat8 &f) { m256 = f.m256; return *this; } -#endif /* __KERNEL_AVX2__ */ +#endif /* __KERNEL_AVX__ */ #ifndef __KERNEL_GPU__ -__forceinline float float8_t::operator[](int i) const +__forceinline float vfloat8::operator[](int i) const { util_assert(i >= 0); util_assert(i < 8); return *(&a + i); } -__forceinline float &float8_t::operator[](int i) +__forceinline float &vfloat8::operator[](int i) { util_assert(i >= 0); util_assert(i < 8); @@ -56,25 +56,50 @@ __forceinline float &float8_t::operator[](int i) } #endif -ccl_device_inline float8_t make_float8_t(float f) +ccl_device_inline vfloat8 make_vfloat8(float f) { -#ifdef __KERNEL_AVX2__ - float8_t r(_mm256_set1_ps(f)); +#ifdef __KERNEL_AVX__ + vfloat8 r(_mm256_set1_ps(f)); #else - float8_t r = {f, f, f, f, f, f, f, f}; + vfloat8 r = {f, f, f, f, f, f, f, f}; #endif return r; } -ccl_device_inline float8_t -make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h) +ccl_device_inline vfloat8 +make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h) { -#ifdef __KERNEL_AVX2__ - float8_t r(_mm256_setr_ps(a, b, c, d, e, f, g, h)); +#ifdef __KERNEL_AVX__ + vfloat8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h)); #else - float8_t r = {a, b, c, d, e, f, g, h}; + vfloat8 r = {a, b, c, d, e, f, g, h}; #endif return r; } +ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b) +{ +#ifdef __KERNEL_AVX__ + return vfloat8(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 
1)); +#else + return make_vfloat8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w); +#endif +} + +ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a) +{ +#ifdef __KERNEL_PRINTF__ + printf("%s: %.8f %.8f %.8f %.8f %.8f %.8f %.8f %.8f\n", + label, + (double)a.a, + (double)a.b, + (double)a.c, + (double)a.d, + (double)a.e, + (double)a.f, + (double)a.g, + (double)a.h); +#endif +} + CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int8.h b/intern/cycles/util/types_int8.h new file mode 100644 index 00000000000..8643ebe96ad --- /dev/null +++ b/intern/cycles/util/types_int8.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#pragma once + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util/types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +struct vfloat8; + +#ifdef __KERNEL_GPU__ +struct vint8 +#else +struct ccl_try_align(32) vint8 +#endif +{ +#ifdef __KERNEL_AVX__ + union { + __m256i m256; + struct { + int a, b, c, d, e, f, g, h; + }; + }; + + __forceinline vint8(); + __forceinline vint8(const vint8 &a); + __forceinline explicit vint8(const __m256i &a); + + __forceinline operator const __m256i &() const; + __forceinline operator __m256i &(); + + __forceinline vint8 &operator=(const vint8 &a); +#else /* __KERNEL_AVX__ */ + int a, b, c, d, e, f, g, h; +#endif /* __KERNEL_AVX__ */ + +#ifndef __KERNEL_GPU__ + __forceinline int operator[](int i) const; + __forceinline int &operator[](int i); +#endif +}; + +ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h); +ccl_device_inline vint8 make_vint8(int i); +ccl_device_inline vint8 make_vint8(const vfloat8 f); +ccl_device_inline vint8 make_vint8(const int4 a, const int4 b); + +CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int8_impl.h b/intern/cycles/util/types_int8_impl.h new file mode 100644 index 00000000000..080bcaa6a2b --- /dev/null +++ 
b/intern/cycles/util/types_int8_impl.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#pragma once + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util/types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifdef __KERNEL_AVX__ +__forceinline vint8::vint8() +{ +} + +__forceinline vint8::vint8(const vint8 &a) : m256(a.m256) +{ +} + +__forceinline vint8::vint8(const __m256i &a) : m256(a) +{ +} + +__forceinline vint8::operator const __m256i &() const +{ + return m256; +} + +__forceinline vint8::operator __m256i &() +{ + return m256; +} + +__forceinline vint8 &vint8::operator=(const vint8 &a) +{ + m256 = a.m256; + return *this; +} +#endif /* __KERNEL_AVX__ */ + +#ifndef __KERNEL_GPU__ +__forceinline int vint8::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 8); + return *(&a + i); +} + +__forceinline int &vint8::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 8); + return *(&a + i); +} +#endif + +ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h) +{ +#ifdef __KERNEL_AVX__ + return vint8(_mm256_set_epi32(h, g, f, e, d, c, b, a)); +#else + return {a, b, c, d, e, f, g, h}; +#endif +} + +ccl_device_inline vint8 make_vint8(int i) +{ +#ifdef __KERNEL_AVX__ + return vint8(_mm256_set1_epi32(i)); +#else + return make_vint8(i, i, i, i, i, i, i, i); +#endif +} + +ccl_device_inline vint8 make_vint8(const vfloat8 f) +{ +#ifdef __KERNEL_AVX__ + return vint8(_mm256_cvtps_epi32(f.m256)); +#else + return make_vint8( + (int)f.a, (int)f.b, (int)f.c, (int)f.d, (int)f.e, (int)f.f, (int)f.g, (int)f.h); +#endif +} + +ccl_device_inline vint8 make_vint8(const int4 a, const int4 b) +{ +#ifdef __KERNEL_AVX__ + return vint8(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128), b.m128, 1)); +#else + return make_vint8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w); +#endif +} + +CCL_NAMESPACE_END |