Diffstat (limited to 'intern')
48 files changed, 958 insertions, 542 deletions
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index 149967ad331..f0540486656 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -138,11 +138,6 @@ endif()
 
 blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
 
-# avoid link failure with clang 3.4 debug
-if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS '3.4')
-  string(APPEND CMAKE_CXX_FLAGS_DEBUG " -gline-tables-only")
-endif()
-
 add_dependencies(bf_intern_cycles bf_rna)
 
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${ADDON_FILES}" ${CYCLES_INSTALL_PATH})
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 64613216be0..986d5d972c1 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -325,6 +325,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         default=1024,
     )
 
+    sample_offset: IntProperty(
+        name="Sample Offset",
+        description="Number of samples to skip when starting render",
+        min=0, max=(1 << 24),
+        default=0,
+    )
+
     time_limit: FloatProperty(
         name="Time Limit",
         description="Limit the render time (excluding synchronization time)."
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 0c9179b4ccf..7bac7e46117 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -290,6 +290,9 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
         col.active = not (cscene.use_adaptive_sampling and cscene.use_preview_adaptive_sampling)
         col.prop(cscene, "sampling_pattern", text="Pattern")
 
+        col = layout.column(align=True)
+        col.prop(cscene, "sample_offset")
+
         layout.separator()
 
         col = layout.column(align=True)
diff --git a/intern/cycles/blender/sync.cpp b/intern/cycles/blender/sync.cpp
index 92662e37bc2..59d684dcf05 100644
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -835,18 +835,25 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
   /* samples */
   int samples = get_int(cscene, "samples");
   int preview_samples = get_int(cscene, "preview_samples");
+  int sample_offset = get_int(cscene, "sample_offset");
 
   if (background) {
     params.samples = samples;
+    params.sample_offset = sample_offset;
   }
   else {
     params.samples = preview_samples;
-    if (params.samples == 0)
+    if (params.samples == 0) {
       params.samples = INT_MAX;
+    }
+
+    params.sample_offset = 0;
   }
 
+  /* Clamp sample offset. */
+  params.sample_offset = clamp(params.sample_offset, 0, Integrator::MAX_SAMPLES);
+
   /* Clamp samples. 
*/ - params.samples = min(params.samples, Integrator::MAX_SAMPLES); + params.samples = clamp(params.samples, 0, Integrator::MAX_SAMPLES - params.sample_offset); /* Viewport Performance */ params.pixel_size = b_engine.get_preview_pixel_size(b_scene); diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp index f11b49ef65f..5aabed8702a 100644 --- a/intern/cycles/device/cpu/device.cpp +++ b/intern/cycles/device/cpu/device.cpp @@ -38,7 +38,6 @@ void device_cpu_info(vector<DeviceInfo> &devices) info.id = "CPU"; info.num = 0; info.has_osl = true; - info.has_half_images = true; info.has_nanovdb = true; info.has_profiling = true; if (openimagedenoise_supported()) { diff --git a/intern/cycles/device/cuda/device.cpp b/intern/cycles/device/cuda/device.cpp index af2bdc6e29c..0d9e6c72466 100644 --- a/intern/cycles/device/cuda/device.cpp +++ b/intern/cycles/device/cuda/device.cpp @@ -144,7 +144,6 @@ void device_cuda_info(vector<DeviceInfo> &devices) info.description = string(name); info.num = num; - info.has_half_images = (major >= 3); info.has_nanovdb = true; info.denoisers = 0; diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp index 2bb0592bcc5..95629c57908 100644 --- a/intern/cycles/device/cuda/device_impl.cpp +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -931,7 +931,6 @@ void CUDADevice::tex_alloc(device_texture &mem) { CUDAContextScope scope(this); - /* General variables for both architectures */ string bind_name = mem.name; size_t dsize = datatype_size(mem.data_type); size_t size = mem.memory_size(); @@ -1094,7 +1093,6 @@ void CUDADevice::tex_alloc(device_texture &mem) if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { - /* Kepler+, bindless textures. */ CUDA_RESOURCE_DESC resDesc; memset(&resDesc, 0, sizeof(resDesc)); diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 63d0a49d3eb..bfbcdb20d5e 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -286,7 +286,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, info.description = "Multi Device"; info.num = 0; - info.has_half_images = true; info.has_nanovdb = true; info.has_osl = true; info.has_profiling = true; @@ -333,7 +332,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, } /* Accumulate device info. */ - info.has_half_images &= device.has_half_images; info.has_nanovdb &= device.has_nanovdb; info.has_osl &= device.has_osl; info.has_profiling &= device.has_profiling; diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 65188459c2c..a7d47f23d54 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -73,7 +73,6 @@ class DeviceInfo { int num; bool display_device; /* GPU is used as a display device. */ bool has_nanovdb; /* Support NanoVDB volumes. */ - bool has_half_images; /* Support half-float textures. */ bool has_osl; /* Support Open Shading Language. */ bool has_profiling; /* Supports runtime collection of profiling info. */ bool has_peer_memory; /* GPU has P2P access to memory of another GPU. 
*/ @@ -90,7 +89,6 @@ class DeviceInfo { num = 0; cpu_threads = 0; display_device = false; - has_half_images = false; has_nanovdb = false; has_osl = false; has_profiling = false; diff --git a/intern/cycles/device/hip/device.cpp b/intern/cycles/device/hip/device.cpp index 29304e50247..25e932ef080 100644 --- a/intern/cycles/device/hip/device.cpp +++ b/intern/cycles/device/hip/device.cpp @@ -141,7 +141,6 @@ void device_hip_info(vector<DeviceInfo> &devices) info.description = string(name); info.num = num; - info.has_half_images = true; info.has_nanovdb = true; info.denoisers = 0; diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp index d7f68934b46..2368925aca5 100644 --- a/intern/cycles/device/hip/device_impl.cpp +++ b/intern/cycles/device/hip/device_impl.cpp @@ -376,10 +376,9 @@ string HIPDevice::compile_kernel(const uint kernel_features, bool HIPDevice::load_kernels(const uint kernel_features) { - /* TODO(sergey): Support kernels re-load for CUDA devices adaptive compile. + /* TODO(sergey): Support kernels re-load for HIP devices adaptive compile. * - * Currently re-loading kernel will invalidate memory pointers, - * causing problems in cuCtxSynchronize. + * Currently re-loading kernels will invalidate memory pointers. */ if (hipModule) { if (use_adaptive_compilation()) { @@ -900,7 +899,6 @@ void HIPDevice::tex_alloc(device_texture &mem) { HIPContextScope scope(this); - /* General variables for both architectures */ string bind_name = mem.name; size_t dsize = datatype_size(mem.data_type); size_t size = mem.memory_size(); @@ -1065,7 +1063,6 @@ void HIPDevice::tex_alloc(device_texture &mem) if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { - /* Kepler+, bindless textures. */ hipResourceDesc resDesc; memset(&resDesc, 0, sizeof(resDesc)); diff --git a/intern/cycles/device/hip/graphics_interop.h b/intern/cycles/device/hip/graphics_interop.h index 8314405e670..71c6893edbd 100644 --- a/intern/cycles/device/hip/graphics_interop.h +++ b/intern/cycles/device/hip/graphics_interop.h @@ -48,7 +48,7 @@ class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop { HIPDeviceQueue *queue_ = nullptr; HIPDevice *device_ = nullptr; - /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */ + /* OpenGL PBO which is currently registered as the destination for the HIP buffer. */ uint opengl_pbo_id_ = 0; /* Buffer area in pixels of the corresponding PBO. 
*/ int64_t buffer_area_ = 0; diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp index daf270d6686..dcbdf07ee67 100644 --- a/intern/cycles/integrator/path_trace.cpp +++ b/intern/cycles/integrator/path_trace.cpp @@ -380,7 +380,10 @@ void PathTrace::path_trace(RenderWork &render_work) PathTraceWork *path_trace_work = path_trace_works_[i].get(); PathTraceWork::RenderStatistics statistics; - path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples); + path_trace_work->render_samples(statistics, + render_work.path_trace.start_sample, + num_samples, + render_work.path_trace.sample_offset); const double work_time = time_dt() - work_start_time; work_balance_infos_[i].time_spent += work_time; @@ -849,7 +852,8 @@ void PathTrace::progress_update_if_needed(const RenderWork &render_work) const int2 tile_size = get_render_tile_size(); const int num_samples_added = tile_size.x * tile_size.y * render_work.path_trace.num_samples; const int current_sample = render_work.path_trace.start_sample + - render_work.path_trace.num_samples; + render_work.path_trace.num_samples - + render_work.path_trace.sample_offset; progress_->add_samples(num_samples_added, current_sample); } diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h index 0dc7cd2f896..2ebfc913580 100644 --- a/intern/cycles/integrator/path_trace_work.h +++ b/intern/cycles/integrator/path_trace_work.h @@ -75,7 +75,10 @@ class PathTraceWork { /* Render given number of samples as a synchronous blocking call. * The samples are added to the render buffer associated with this work. */ - virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0; + virtual void render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num, + int sample_offset) = 0; /* Copy render result from this work to the corresponding place of the GPU display. 
* diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp index 36ce2be9f6d..530e60d6750 100644 --- a/intern/cycles/integrator/path_trace_work_cpu.cpp +++ b/intern/cycles/integrator/path_trace_work_cpu.cpp @@ -71,7 +71,8 @@ void PathTraceWorkCPU::init_execution() void PathTraceWorkCPU::render_samples(RenderStatistics &statistics, int start_sample, - int samples_num) + int samples_num, + int sample_offset) { const int64_t image_width = effective_buffer_params_.width; const int64_t image_height = effective_buffer_params_.height; @@ -97,6 +98,7 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics, work_tile.w = 1; work_tile.h = 1; work_tile.start_sample = start_sample; + work_tile.sample_offset = sample_offset; work_tile.num_samples = 1; work_tile.offset = effective_buffer_params_.offset; work_tile.stride = effective_buffer_params_.stride; diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h index 6e734690811..63ab686588c 100644 --- a/intern/cycles/integrator/path_trace_work_cpu.h +++ b/intern/cycles/integrator/path_trace_work_cpu.h @@ -48,7 +48,8 @@ class PathTraceWorkCPU : public PathTraceWork { virtual void render_samples(RenderStatistics &statistics, int start_sample, - int samples_num) override; + int samples_num, + int sample_offset) override; virtual void copy_to_display(PathTraceDisplay *display, PassMode pass_mode, diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index b9784f68f56..956aa6a8c90 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -250,7 +250,8 @@ void PathTraceWorkGPU::init_execution() void PathTraceWorkGPU::render_samples(RenderStatistics &statistics, int start_sample, - int samples_num) + int samples_num, + int sample_offset) { /* Limit number of states for the tile and rely on a greedy scheduling of tiles. 
This allows to * add more work (because tiles are smaller, so there is higher chance that more paths will @@ -261,6 +262,7 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics, work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num, + sample_offset, device_scene_->data.integrator.scrambling_distance); enqueue_reset(); diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h index c5e291e72db..5aa497c26e7 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.h +++ b/intern/cycles/integrator/path_trace_work_gpu.h @@ -46,7 +46,8 @@ class PathTraceWorkGPU : public PathTraceWork { virtual void render_samples(RenderStatistics &statistics, int start_sample, - int samples_num) override; + int samples_num, + int sample_offset) override; virtual void copy_to_display(PathTraceDisplay *display, PassMode pass_mode, diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp index f776d01ef67..538d751e8b1 100644 --- a/intern/cycles/integrator/render_scheduler.cpp +++ b/intern/cycles/integrator/render_scheduler.cpp @@ -88,6 +88,16 @@ int RenderScheduler::get_num_samples() const return num_samples_; } +void RenderScheduler::set_sample_offset(int sample_offset) +{ + sample_offset_ = sample_offset; +} + +int RenderScheduler::get_sample_offset() const +{ + return sample_offset_; +} + void RenderScheduler::set_time_limit(double time_limit) { time_limit_ = time_limit; @@ -110,13 +120,15 @@ int RenderScheduler::get_num_rendered_samples() const return state_.num_rendered_samples; } -void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples) +void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples, int sample_offset) { buffer_params_ = buffer_params; update_start_resolution_divider(); set_num_samples(num_samples); + set_start_sample(sample_offset); + set_sample_offset(sample_offset); /* In background mode never do lower resolution render preview, as it is not really supported * by the software. */ @@ -171,7 +183,7 @@ void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples) void RenderScheduler::reset_for_next_tile() { - reset(buffer_params_, num_samples_); + reset(buffer_params_, num_samples_, sample_offset_); } bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work) @@ -317,6 +329,7 @@ RenderWork RenderScheduler::get_render_work() render_work.path_trace.start_sample = get_start_sample_to_path_trace(); render_work.path_trace.num_samples = get_num_samples_to_path_trace(); + render_work.path_trace.sample_offset = get_sample_offset(); render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample()); diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h index d7b7413ae31..28f563c46e3 100644 --- a/intern/cycles/integrator/render_scheduler.h +++ b/intern/cycles/integrator/render_scheduler.h @@ -39,6 +39,7 @@ class RenderWork { struct { int start_sample = 0; int num_samples = 0; + int sample_offset = 0; } path_trace; struct { @@ -125,6 +126,9 @@ class RenderScheduler { void set_num_samples(int num_samples); int get_num_samples() const; + void set_sample_offset(int sample_offset); + int get_sample_offset() const; + /* Time limit for the path tracing tasks, in minutes. * Zero disables the limit. 
*/ void set_time_limit(double time_limit); @@ -150,7 +154,7 @@ class RenderScheduler { /* Reset scheduler, indicating that rendering will happen from scratch. * Resets current rendered state, as well as scheduling information. */ - void reset(const BufferParams &buffer_params, int num_samples); + void reset(const BufferParams &buffer_params, int num_samples, int sample_offset); /* Reset scheduler upon switching to a next tile. * Will keep the same number of samples and full-frame render parameters, but will reset progress @@ -419,6 +423,8 @@ class RenderScheduler { int start_sample_ = 0; int num_samples_ = 0; + int sample_offset_ = 0; + /* Limit in seconds for how long path tracing is allowed to happen. * Zero means no limit is applied. */ double time_limit_ = 0.0; diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp index 2d1ac07db7f..d60f7149bf4 100644 --- a/intern/cycles/integrator/work_tile_scheduler.cpp +++ b/intern/cycles/integrator/work_tile_scheduler.cpp @@ -36,6 +36,7 @@ void WorkTileScheduler::set_max_num_path_states(int max_num_path_states) void WorkTileScheduler::reset(const BufferParams &buffer_params, int sample_start, int samples_num, + int sample_offset, float scrambling_distance) { /* Image buffer parameters. */ @@ -51,6 +52,7 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params, /* Samples parameters. */ sample_start_ = sample_start; samples_num_ = samples_num; + sample_offset_ = sample_offset; /* Initialize new scheduling. */ reset_scheduler_state(); @@ -111,6 +113,7 @@ bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_ work_tile.h = tile_size_.height; work_tile.start_sample = sample_start_ + start_sample; work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample); + work_tile.sample_offset = sample_offset_; work_tile.offset = offset_; work_tile.stride = stride_; diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h index d9fa7e84431..2d6395799f7 100644 --- a/intern/cycles/integrator/work_tile_scheduler.h +++ b/intern/cycles/integrator/work_tile_scheduler.h @@ -41,6 +41,7 @@ class WorkTileScheduler { void reset(const BufferParams &buffer_params, int sample_start, int samples_num, + int sample_offset, float scrambling_distance); /* Get work for a device. @@ -79,6 +80,7 @@ class WorkTileScheduler { * (splitting into a smaller work tiles). */ int sample_start_ = 0; int samples_num_ = 0; + int sample_offset_ = 0; /* Tile size which be scheduled for rendering. 
*/ TileSize tile_size_; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 29ff69df864..39cb886b16e 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -39,6 +39,10 @@ set(SRC_KERNEL_DEVICE_HIP device/hip/kernel.cpp ) +set(SRC_KERNEL_DEVICE_METAL + device/metal/kernel.metal +) + set(SRC_KERNEL_DEVICE_OPTIX device/optix/kernel.cu device/optix/kernel_shader_raytrace.cu @@ -79,6 +83,13 @@ set(SRC_KERNEL_DEVICE_OPTIX_HEADERS device/optix/globals.h ) +set(SRC_KERNEL_DEVICE_METAL_HEADERS + device/metal/compat.h + device/metal/context_begin.h + device/metal/context_end.h + device/metal/globals.h +) + set(SRC_KERNEL_CLOSURE_HEADERS closure/alloc.h closure/bsdf.h @@ -723,12 +734,14 @@ cycles_add_library(cycles_kernel "${LIB}" ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_OPTIX} + ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_HEADERS} ${SRC_KERNEL_DEVICE_CPU_HEADERS} ${SRC_KERNEL_DEVICE_GPU_HEADERS} ${SRC_KERNEL_DEVICE_CUDA_HEADERS} ${SRC_KERNEL_DEVICE_HIP_HEADERS} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS} + ${SRC_KERNEL_DEVICE_METAL_HEADERS} ) source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS}) @@ -740,6 +753,7 @@ source_group("device\\cuda" FILES ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_ source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS}) source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS}) source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS}) +source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS}) source_group("film" FILES ${SRC_KERNEL_FILM_HEADERS}) source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS}) source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS}) @@ -772,6 +786,8 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP}" ${CYCLES_ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/film) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator) diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h index 1ee82e6eb7c..2feebad074f 100644 --- a/intern/cycles/kernel/device/cuda/compat.h +++ b/intern/cycles/kernel/device/cuda/compat.h @@ -75,6 +75,7 @@ typedef unsigned long long uint64_t; #define ccl_gpu_block_idx_x (blockIdx.x) #define ccl_gpu_grid_dim_x (gridDim.x) #define ccl_gpu_warp_size (warpSize) +#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) #define 
ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h index 46196dcdb51..003881d7912 100644 --- a/intern/cycles/kernel/device/cuda/config.h +++ b/intern/cycles/kernel/device/cuda/config.h @@ -92,12 +92,29 @@ /* Compute number of threads per block and minimum blocks per multiprocessor * given the maximum number of registers per thread. */ - #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ extern "C" __global__ void __launch_bounds__(block_num_threads, \ GPU_MULTIPRESSOR_MAX_REGISTERS / \ (block_num_threads * thread_num_registers)) +#define ccl_gpu_kernel_threads(block_num_threads) \ + extern "C" __global__ void __launch_bounds__(block_num_threads) + +#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__) + +#define ccl_gpu_kernel_call(x) x + +/* Define a function object where "func" is the lambda body, and additional parameters are used to + * specify captured state */ +#define ccl_gpu_kernel_lambda(func, ...) \ + struct KernelLambda { \ + __VA_ARGS__; \ + __device__ int operator()(const int state) \ + { \ + return (func); \ + } \ + } ccl_gpu_kernel_lambda_pass + /* sanity checks */ #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h index 95a37c693ae..0900a45c83d 100644 --- a/intern/cycles/kernel/device/gpu/image.h +++ b/intern/cycles/kernel/device/gpu/image.h @@ -65,7 +65,9 @@ ccl_device float cubic_h1(float a) /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */ template<typename T> -ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y) +ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info, + float x, + float y) { ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; @@ -94,7 +96,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, f /* Fast tricubic texture lookup using 8 trilinear lookups. 
*/ template<typename T> ccl_device_noinline T -kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z) +kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z) { ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; @@ -169,7 +171,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl template<typename T> ccl_device_noinline T kernel_tex_image_interp_nanovdb( - const TextureInfo &info, float x, float y, float z, uint interpolation) + ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation) { using namespace nanovdb; @@ -191,7 +193,7 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb( ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y) { - const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id); /* float4, byte4, ushort4 and half4 */ const int texture_type = info.data_type; @@ -226,7 +228,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, float3 P, InterpolationType interp) { - const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id); if (info.use_transform_3d) { P = transform_point(&info.transform_3d, P); diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 56fcc38b907..dd0c6dd6893 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -21,6 +21,10 @@ #include "kernel/device/gpu/parallel_sorted_index.h" #include "kernel/device/gpu/work_stealing.h" +#ifdef __KERNEL_METAL__ +# include "kernel/device/metal/context_begin.h" +#endif + #include "kernel/integrator/state.h" #include "kernel/integrator/state_flow.h" #include "kernel/integrator/state_util.h" @@ -40,6 +44,11 @@ #include "kernel/bake/bake.h" #include "kernel/film/adaptive_sampling.h" + +#ifdef __KERNEL_METAL__ +# include "kernel/device/metal/context_end.h" +#endif + #include "kernel/film/read.h" /* -------------------------------------------------------------------- @@ -47,7 +56,7 @@ */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_reset(int num_states) + ccl_gpu_kernel_signature(integrator_reset, int num_states) { const int state = ccl_gpu_global_id_x(); @@ -58,10 +67,11 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles, - const int num_tiles, - float *render_buffer, - const int max_tile_work_size) + ccl_gpu_kernel_signature(integrator_init_from_camera, + ccl_global KernelWorkTile *tiles, + const int num_tiles, + ccl_global float *render_buffer, + const int max_tile_work_size) { const int work_index = ccl_gpu_global_id_x(); @@ -72,7 +82,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int tile_index = work_index / max_tile_work_size; const int tile_work_index = work_index - tile_index * max_tile_work_size; - const KernelWorkTile *tile = &tiles[tile_index]; + ccl_global const KernelWorkTile *tile = &tiles[tile_index]; if (tile_work_index >= tile->work_size) { return; @@ -83,14 +93,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) uint x, y, sample; get_work_pixel(tile, tile_work_index, &x, &y, &sample); - 
integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample); + ccl_gpu_kernel_call( + integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample)); } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles, - const int num_tiles, - float *render_buffer, - const int max_tile_work_size) + ccl_gpu_kernel_signature(integrator_init_from_bake, + ccl_global KernelWorkTile *tiles, + const int num_tiles, + ccl_global float *render_buffer, + const int max_tile_work_size) { const int work_index = ccl_gpu_global_id_x(); @@ -101,7 +113,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int tile_index = work_index / max_tile_work_size; const int tile_work_index = work_index - tile_index * max_tile_work_size; - const KernelWorkTile *tile = &tiles[tile_index]; + ccl_global const KernelWorkTile *tile = &tiles[tile_index]; if (tile_work_index >= tile->work_size) { return; @@ -112,230 +124,264 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) uint x, y, sample; get_work_pixel(tile, tile_work_index, &x, &y, &sample); - integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample); + ccl_gpu_kernel_call( + integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample)); } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_closest(const int *path_index_array, - ccl_global float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_closest, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_closest(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_shadow, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_shadow(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_subsurface, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; - integrator_intersect_subsurface(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_volume_stack, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_volume_stack(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_background(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_background, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_background(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_light(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_light, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_light(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_shadow(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_shadow, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_shadow(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_surface(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_surface, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; - integrator_shade_surface(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_surface_raytrace, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_surface_raytrace(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_volume(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_volume, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_volume(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer)); } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_queued_paths_array(int num_states, - int *indices, - int *num_indices, - int kernel) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_queued_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int kernel_index) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index, + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [kernel](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_queued_shadow_paths_array(int num_states, - int *indices, - int *num_indices, - int kernel) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int kernel_index) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index, + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [kernel](const int state) { - return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + 
ccl_gpu_kernel_signature(integrator_active_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0); + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) != 0); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_terminated_paths_array(int num_states, - int *indices, - int *num_indices, - int indices_offset) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_terminated_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int indices_offset) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0); + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices + indices_offset, num_indices, [](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) == 0); - }); + num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_terminated_shadow_paths_array(int num_states, - int *indices, - int *num_indices, - int indices_offset) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int indices_offset) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices + indices_offset, num_indices, [](const int state) { - return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); - }); -} - -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_sorted_paths_array(int num_states, - int num_states_limit, - int *indices, - int *num_indices, - int *key_counter, - int *key_prefix_sum, - int kernel) -{ - gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, - num_states_limit, - indices, - num_indices, - key_counter, - key_prefix_sum, - [kernel](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ? - INTEGRATOR_STATE(state, path, shader_sort_key) : - GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; - }); -} - -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_paths_array(int num_states, - int *indices, - int *num_indices, - int num_active_paths) -{ + num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass); +} + +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_sorted_paths_array, + int num_states, + int num_states_limit, + ccl_global int *indices, + ccl_global int *num_indices, + ccl_global int *key_counter, + ccl_global int *key_prefix_sum, + int kernel_index) +{ + ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ? 
+ INTEGRATOR_STATE(state, path, shader_sort_key) : + GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY, + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; + + const uint state_index = ccl_gpu_global_id_x(); + gpu_parallel_sorted_index_array(state_index, + num_states, + num_states_limit, + indices, + num_indices, + key_counter, + key_prefix_sum, + ccl_gpu_kernel_lambda_pass); +} + +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int num_active_paths) +{ + ccl_gpu_kernel_lambda((state >= num_active_paths) && + (INTEGRATOR_STATE(state, path, queued_kernel) != 0), + int num_active_paths); + ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths; + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [num_active_paths](const int state) { - return (state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_states(const int *active_terminated_states, - const int active_states_offset, - const int terminated_states_offset, - const int work_size) +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_states, + ccl_global const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); @@ -343,28 +389,32 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B const int from_state = active_terminated_states[active_states_offset + global_index]; const int to_state = active_terminated_states[terminated_states_offset + global_index]; - integrator_state_move(NULL, to_state, from_state); + ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state)); } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_shadow_paths_array(int num_states, - int *indices, - int *num_indices, - int num_active_paths) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int num_active_paths) { + ccl_gpu_kernel_lambda((state >= num_active_paths) && + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0), + int num_active_paths); + ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths; + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [num_active_paths](const int state) { - return (state >= num_active_paths) && - (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states, - const int active_states_offset, - const int terminated_states_offset, - const int work_size) +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_shadow_states, + ccl_global 
const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); @@ -372,15 +422,14 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B const int from_state = active_terminated_states[active_states_offset + global_index]; const int to_state = active_terminated_states[terminated_states_offset + global_index]; - integrator_shadow_state_move(NULL, to_state, from_state); + ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state)); } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) - kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values) +ccl_gpu_kernel_threads(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature( + prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values) { - gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>( - counter, prefix_sum, num_values); + gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values); } /* -------------------------------------------------------------------- @@ -388,16 +437,17 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLO */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer, - int sx, - int sy, - int sw, - int sh, - float threshold, - bool reset, - int offset, - int stride, - uint *num_active_pixels) + ccl_gpu_kernel_signature(adaptive_sampling_convergence_check, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + float threshold, + bool reset, + int offset, + int stride, + ccl_global uint *num_active_pixels) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / sw; @@ -406,37 +456,51 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) bool converged = true; if (x < sw && y < sh) { - converged = kernel_adaptive_sampling_convergence_check( - nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride); + converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check( + nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride)); } /* NOTE: All threads specified in the mask must execute the intrinsic. 
*/ - const uint num_active_pixels_mask = ccl_gpu_ballot(!converged); + const auto num_active_pixels_mask = ccl_gpu_ballot(!converged); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { - atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask)); + atomic_fetch_and_add_uint32(num_active_pixels, ccl_gpu_popc(num_active_pixels_mask)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_filter_x( - float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) + ccl_gpu_kernel_signature(adaptive_sampling_filter_x, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + int offset, + int stride) { const int y = ccl_gpu_global_id_x(); if (y < sh) { - kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride); + ccl_gpu_kernel_call( + kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_filter_y( - float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) + ccl_gpu_kernel_signature(adaptive_sampling_filter_y, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + int offset, + int stride) { const int x = ccl_gpu_global_id_x(); if (x < sw) { - kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride); + ccl_gpu_kernel_call( + kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride)); } } @@ -445,12 +509,14 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels) + ccl_gpu_kernel_signature(cryptomatte_postprocess, + ccl_global float *render_buffer, + int num_pixels) { const int pixel_index = ccl_gpu_global_id_x(); if (pixel_index < num_pixels) { - kernel_cryptomatte_post(nullptr, render_buffer, pixel_index); + ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index)); } } @@ -458,36 +524,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) * Film. */ -/* Common implementation for float destination. */ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert, - float *pixels, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int dst_offset, - int dst_stride, - const Processor &processor) -{ - const int render_pixel_index = ccl_gpu_global_id_x(); - if (render_pixel_index >= num_pixels) { - return; - } - - const int x = render_pixel_index % width; - const int y = render_pixel_index / width; - - ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride + - y * stride * kfilm_convert->pass_stride; - - ccl_global float *pixel = pixels + - (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride; - - processor(kfilm_convert, buffer, pixel); -} - ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba, const int rgba_offset, const int rgba_stride, @@ -508,177 +544,95 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb #endif } -/* Common implementation for half4 destination and 4-channel input pass. 
*/ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - const int render_pixel_index = ccl_gpu_global_id_x(); - if (render_pixel_index >= num_pixels) { - return; - } - - const int x = render_pixel_index % width; - const int y = render_pixel_index / width; - - ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride + - y * stride * kfilm_convert->pass_stride; - - float pixel[4]; - processor(kfilm_convert, buffer, pixel); - - film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); - - const half4 half_pixel = float4_to_half4_display( - make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); - kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); -} - -/* Common implementation for half4 destination and 3-channel input pass. */ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - kernel_gpu_film_convert_half_rgba_common_rgba( - kfilm_convert, - rgba, - render_buffer, - num_pixels, - width, - offset, - stride, - rgba_offset, - rgba_stride, - [&processor](const KernelFilmConvert *kfilm_convert, - ccl_global const float *buffer, - float *pixel_rgba) { - processor(kfilm_convert, buffer, pixel_rgba); - pixel_rgba[3] = 1.0f; - }); -} - -/* Common implementation for half4 destination and single channel input pass. 
*/ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - kernel_gpu_film_convert_half_rgba_common_rgba( - kfilm_convert, - rgba, - render_buffer, - num_pixels, - width, - offset, - stride, - rgba_offset, - rgba_stride, - [&processor](const KernelFilmConvert *kfilm_convert, - ccl_global const float *buffer, - float *pixel_rgba) { - float value; - processor(kfilm_convert, buffer, &value); - - pixel_rgba[0] = value; - pixel_rgba[1] = value; - pixel_rgba[2] = value; - pixel_rgba[3] = 1.0f; - }); -} - -#define KERNEL_FILM_CONVERT_PROC(name) \ - ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name - -#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \ - KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \ - (const KernelFilmConvert kfilm_convert, \ - float *pixels, \ - float *render_buffer, \ - int num_pixels, \ - int width, \ - int offset, \ - int stride, \ - int rgba_offset, \ - int rgba_stride) \ +#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \ + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ + ccl_gpu_kernel_signature(film_convert_##variant, \ + const KernelFilmConvert kfilm_convert, \ + ccl_global float *pixels, \ + ccl_global float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ { \ - kernel_gpu_film_convert_common(&kfilm_convert, \ - pixels, \ - render_buffer, \ - num_pixels, \ - width, \ - offset, \ - stride, \ - rgba_offset, \ - rgba_stride, \ - film_get_pass_pixel_##variant); \ + const int render_pixel_index = ccl_gpu_global_id_x(); \ + if (render_pixel_index >= num_pixels) { \ + return; \ + } \ +\ + const int x = render_pixel_index % width; \ + const int y = render_pixel_index / width; \ +\ + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \ + y * stride * kfilm_convert.pass_stride; \ +\ + ccl_global float *pixel = pixels + \ + (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \ +\ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \ } \ - KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \ - (const KernelFilmConvert kfilm_convert, \ - uchar4 *rgba, \ - float *render_buffer, \ - int num_pixels, \ - int width, \ - int offset, \ - int stride, \ - int rgba_offset, \ - int rgba_stride) \ +\ + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ + ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \ + const KernelFilmConvert kfilm_convert, \ + ccl_global uchar4 *rgba, \ + ccl_global float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ { \ - kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \ - rgba, \ - render_buffer, \ - num_pixels, \ - width, \ - offset, \ - stride, \ - rgba_offset, \ - rgba_stride, \ - film_get_pass_pixel_##variant); \ - } - -KERNEL_FILM_CONVERT_DEFINE(depth, value) -KERNEL_FILM_CONVERT_DEFINE(mist, value) -KERNEL_FILM_CONVERT_DEFINE(sample_count, value) -KERNEL_FILM_CONVERT_DEFINE(float, value) - -KERNEL_FILM_CONVERT_DEFINE(light_path, rgb) -KERNEL_FILM_CONVERT_DEFINE(float3, rgb) - -KERNEL_FILM_CONVERT_DEFINE(motion, rgba) 
-KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba) -KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba) -KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba) -KERNEL_FILM_CONVERT_DEFINE(combined, rgba) -KERNEL_FILM_CONVERT_DEFINE(float4, rgba) - -#undef KERNEL_FILM_CONVERT_DEFINE -#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE -#undef KERNEL_FILM_CONVERT_PROC + const int render_pixel_index = ccl_gpu_global_id_x(); \ + if (render_pixel_index >= num_pixels) { \ + return; \ + } \ +\ + const int x = render_pixel_index % width; \ + const int y = render_pixel_index / width; \ +\ + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \ + y * stride * kfilm_convert.pass_stride; \ +\ + float pixel[4]; \ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \ +\ + if (input_channel_count == 1) { \ + pixel[1] = pixel[2] = pixel[0]; \ + } \ + if (input_channel_count <= 3) { \ + pixel[3] = 1.0f; \ + } \ +\ + film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \ +\ + const half4 half_pixel = float4_to_half4_display( \ + make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \ + kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \ + } + +/* 1 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(depth, 1) +KERNEL_FILM_CONVERT_VARIANT(mist, 1) +KERNEL_FILM_CONVERT_VARIANT(sample_count, 1) +KERNEL_FILM_CONVERT_VARIANT(float, 1) + +/* 3 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(light_path, 3) +KERNEL_FILM_CONVERT_VARIANT(float3, 3) + +/* 4 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(motion, 4) +KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4) +KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4) +KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4) +KERNEL_FILM_CONVERT_VARIANT(combined, 4) +KERNEL_FILM_CONVERT_VARIANT(float4, 4) + +#undef KERNEL_FILM_CONVERT_VARIANT /* -------------------------------------------------------------------- * Shader evaluation. 
@@ -687,42 +641,46 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba) /* Displacement */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input, - float *output, - const int offset, - const int work_size) + ccl_gpu_kernel_signature(shader_eval_displace, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) { int i = ccl_gpu_global_id_x(); if (i < work_size) { - kernel_displace_evaluate(NULL, input, output, offset + i); + ccl_gpu_kernel_call(kernel_displace_evaluate(NULL, input, output, offset + i)); } } /* Background */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_shader_eval_background(KernelShaderEvalInput *input, - float *output, - const int offset, - const int work_size) + ccl_gpu_kernel_signature(shader_eval_background, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) { int i = ccl_gpu_global_id_x(); if (i < work_size) { - kernel_background_evaluate(NULL, input, output, offset + i); + ccl_gpu_kernel_call(kernel_background_evaluate(NULL, input, output, offset + i)); } } /* Curve Shadow Transparency */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_shader_eval_curve_shadow_transparency(KernelShaderEvalInput *input, - float *output, - const int offset, - const int work_size) + ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) { int i = ccl_gpu_global_id_x(); if (i < work_size) { - kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i); + ccl_gpu_kernel_call( + kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i)); } } @@ -731,15 +689,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_color_preprocess(float *render_buffer, - int full_x, - int full_y, - int width, - int height, - int offset, - int stride, - int pass_stride, - int pass_denoised) + ccl_gpu_kernel_signature(filter_color_preprocess, + ccl_global float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int pass_denoised) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / width; @@ -750,31 +709,32 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; - float *buffer = render_buffer + render_pixel_index * pass_stride; + ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride; - float *color_out = buffer + pass_denoised; + ccl_global float *color_out = buffer + pass_denoised; color_out[0] = clamp(color_out[0], 0.0f, 10000.0f); color_out[1] = clamp(color_out[1], 0.0f, 10000.0f); color_out[2] = clamp(color_out[2], 0.0f, 10000.0f); } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_guiding_preprocess(float *guiding_buffer, - int guiding_pass_stride, - int guiding_pass_albedo, - int guiding_pass_normal, - const float *render_buffer, - int render_offset, - int render_stride, - int render_pass_stride, - int render_pass_sample_count, - int render_pass_denoising_albedo, - int render_pass_denoising_normal, - int full_x, - int 
full_y, - int width, - int height, - int num_samples) + ccl_gpu_kernel_signature(filter_guiding_preprocess, + ccl_global float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int guiding_pass_normal, + ccl_global const float *render_buffer, + int render_offset, + int render_stride, + int render_pass_stride, + int render_pass_sample_count, + int render_pass_denoising_albedo, + int render_pass_denoising_normal, + int full_x, + int full_y, + int width, + int height, + int num_samples) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / width; @@ -785,10 +745,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t guiding_pixel_index = x + y * width; - float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; + ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride; - const float *buffer = render_buffer + render_pixel_index * render_pass_stride; + ccl_global const float *buffer = render_buffer + render_pixel_index * render_pass_stride; float pixel_scale; if (render_pass_sample_count == PASS_UNUSED) { @@ -802,8 +762,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (guiding_pass_albedo != PASS_UNUSED) { kernel_assert(render_pass_denoising_albedo != PASS_UNUSED); - const float *aledo_in = buffer + render_pass_denoising_albedo; - float *albedo_out = guiding_pixel + guiding_pass_albedo; + ccl_global const float *aledo_in = buffer + render_pass_denoising_albedo; + ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo; albedo_out[0] = aledo_in[0] * pixel_scale; albedo_out[1] = aledo_in[1] * pixel_scale; @@ -814,8 +774,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (guiding_pass_normal != PASS_UNUSED) { kernel_assert(render_pass_denoising_normal != PASS_UNUSED); - const float *normal_in = buffer + render_pass_denoising_normal; - float *normal_out = guiding_pixel + guiding_pass_normal; + ccl_global const float *normal_in = buffer + render_pass_denoising_normal; + ccl_global float *normal_out = guiding_pixel + guiding_pass_normal; normal_out[0] = normal_in[0] * pixel_scale; normal_out[1] = normal_in[1] * pixel_scale; @@ -824,11 +784,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer, - int guiding_pass_stride, - int guiding_pass_albedo, - int width, - int height) + ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo, + ccl_global float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int width, + int height) { kernel_assert(guiding_pass_albedo != PASS_UNUSED); @@ -841,9 +802,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t guiding_pixel_index = x + y * width; - float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; + ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; - float *albedo_out = guiding_pixel + guiding_pass_albedo; + ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo; albedo_out[0] = 0.5f; albedo_out[1] = 0.5f; @@ -851,20 +812,21 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, 
GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_color_postprocess(float *render_buffer, - int full_x, - int full_y, - int width, - int height, - int offset, - int stride, - int pass_stride, - int num_samples, - int pass_noisy, - int pass_denoised, - int pass_sample_count, - int num_components, - bool use_compositing) + ccl_gpu_kernel_signature(filter_color_postprocess, + ccl_global float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int num_samples, + int pass_noisy, + int pass_denoised, + int pass_sample_count, + int num_components, + bool use_compositing) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / width; @@ -875,7 +837,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; - float *buffer = render_buffer + render_pixel_index * pass_stride; + ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride; float pixel_scale; if (pass_sample_count == PASS_UNUSED) { @@ -885,7 +847,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) pixel_scale = __float_as_uint(buffer[pass_sample_count]); } - float *denoised_pixel = buffer + pass_denoised; + ccl_global float *denoised_pixel = buffer + pass_denoised; denoised_pixel[0] *= pixel_scale; denoised_pixel[1] *= pixel_scale; @@ -898,7 +860,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) /* Currently compositing passes are either 3-component (derived by dividing light passes) * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it * simplifies logic and avoids extra memory allocation. */ - const float *noisy_pixel = buffer + pass_noisy; + ccl_global const float *noisy_pixel = buffer + pass_noisy; denoised_pixel[3] = noisy_pixel[3]; } else { @@ -914,21 +876,22 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states, - uint *num_possible_splits) + ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits, + int num_states, + ccl_global uint *num_possible_splits) { const int state = ccl_gpu_global_id_x(); bool can_split = false; if (state < num_states) { - can_split = kernel_shadow_catcher_path_can_split(nullptr, state); + can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(nullptr, state)); } /* NOTE: All threads specified in the mask must execute the intrinsic. 
*/ - const uint can_split_mask = ccl_gpu_ballot(can_split); + const auto can_split_mask = ccl_gpu_ballot(can_split); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { - atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask)); + atomic_fetch_and_add_uint32(num_possible_splits, ccl_gpu_popc(can_split_mask)); } } diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index d7416beb783..f667ede2712 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 #endif +#ifdef __KERNEL_METAL__ +struct ActiveIndexContext { + ActiveIndexContext(int _thread_index, + int _global_index, + int _threadgroup_size, + int _simdgroup_size, + int _simd_lane_index, + int _simd_group_index, + int _num_simd_groups, + threadgroup int *_simdgroup_offset) + : thread_index(_thread_index), + global_index(_global_index), + blocksize(_threadgroup_size), + ccl_gpu_warp_size(_simdgroup_size), + thread_warp(_simd_lane_index), + warp_index(_simd_group_index), + num_warps(_num_simd_groups), + warp_offset(_simdgroup_offset) + { + } + + const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index, + num_warps; + threadgroup int *warp_offset; + + template<uint blocksizeDummy, typename IsActiveOp> + void active_index_array(const uint num_states, + ccl_global int *indices, + ccl_global int *num_indices, + IsActiveOp is_active_op) + { + const uint state_index = global_index; +#else template<uint blocksize, typename IsActiveOp> __device__ void gpu_parallel_active_index_array(const uint num_states, - int *indices, - int *num_indices, + ccl_global int *indices, + ccl_global int *num_indices, IsActiveOp is_active_op) { extern ccl_gpu_shared int warp_offset[]; @@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states, const uint warp_index = thread_index / ccl_gpu_warp_size; const uint num_warps = blocksize / ccl_gpu_warp_size; - /* Test if state corresponding to this thread is active. */ const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index; - const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; +#endif - /* For each thread within a warp compute how many other active states precede it. */ - const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp); - const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask); + /* Test if state corresponding to this thread is active. */ + const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; - /* Last thread in warp stores number of active states for each warp. */ - if (thread_warp == ccl_gpu_warp_size - 1) { - warp_offset[warp_index] = thread_offset + is_active; - } + /* For each thread within a warp compute how many other active states precede it. */ + const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & + ccl_gpu_thread_mask(thread_warp)); - ccl_gpu_syncthreads(); - - /* Last thread in block converts per-warp sizes to offsets, increments global size of - * index array and gets offset to write to. */ - if (thread_index == blocksize - 1) { - /* TODO: parallelize this. 
*/ - int offset = 0; - for (int i = 0; i < num_warps; i++) { - int num_active = warp_offset[i]; - warp_offset[i] = offset; - offset += num_active; + /* Last thread in warp stores number of active states for each warp. */ + if (thread_warp == ccl_gpu_warp_size - 1) { + warp_offset[warp_index] = thread_offset + is_active; } - const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; - warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); - } + ccl_gpu_syncthreads(); + + /* Last thread in block converts per-warp sizes to offsets, increments global size of + * index array and gets offset to write to. */ + if (thread_index == blocksize - 1) { + /* TODO: parallelize this. */ + int offset = 0; + for (int i = 0; i < num_warps; i++) { + int num_active = warp_offset[i]; + warp_offset[i] = offset; + offset += num_active; + } + + const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; + warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); + } - ccl_gpu_syncthreads(); + ccl_gpu_syncthreads(); - /* Write to index array. */ - if (is_active) { - const uint block_offset = warp_offset[num_warps]; - indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + /* Write to index array. */ + if (is_active) { + const uint block_offset = warp_offset[num_warps]; + indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + } } -} + +#ifdef __KERNEL_METAL__ +}; /* end class ActiveIndexContext */ + +/* inject the required thread params into a struct, and redirect to its templated member function + */ +# define gpu_parallel_active_index_array \ + ActiveIndexContext(metal_local_id, \ + metal_global_id, \ + metal_local_size, \ + simdgroup_size, \ + simd_lane_index, \ + simd_group_index, \ + num_simd_groups, \ + simdgroup_offset) \ + .active_index_array +#endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h index 6de3a022569..4bd002c27e4 100644 --- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -33,10 +33,12 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 #endif -template<uint blocksize> -__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values) +__device__ void gpu_parallel_prefix_sum(const int global_id, + ccl_global int *counter, + ccl_global int *prefix_sum, + const int num_values) { - if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) { + if (global_id != 0) { return; } diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index c06d7be444f..c092e2a21ee 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -33,16 +33,16 @@ CCL_NAMESPACE_BEGIN #endif #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) -template<uint blocksize, typename GetKeyOp> -__device__ void gpu_parallel_sorted_index_array(const uint num_states, +template<typename GetKeyOp> +__device__ void gpu_parallel_sorted_index_array(const uint state_index, + const uint num_states, const int num_states_limit, - int *indices, - int *num_indices, - int *key_counter, - int *key_prefix_sum, + ccl_global int *indices, + ccl_global int *num_indices, + ccl_global int *key_counter, + ccl_global int *key_prefix_sum, GetKeyOp 
get_key_op) { - const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x; const int key = (state_index < num_states) ? get_key_op(state_index) : GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h index 282c3eca641..fb07602539b 100644 --- a/intern/cycles/kernel/device/hip/compat.h +++ b/intern/cycles/kernel/device/hip/compat.h @@ -74,6 +74,7 @@ typedef unsigned long long uint64_t; #define ccl_gpu_block_idx_x (blockIdx.x) #define ccl_gpu_grid_dim_x (gridDim.x) #define ccl_gpu_warp_size (warpSize) +#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h index 2fde0d46015..7ec744d8ad2 100644 --- a/intern/cycles/kernel/device/hip/config.h +++ b/intern/cycles/kernel/device/hip/config.h @@ -35,12 +35,29 @@ /* Compute number of threads per block and minimum blocks per multiprocessor * given the maximum number of registers per thread. */ - #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ extern "C" __global__ void __launch_bounds__(block_num_threads, \ GPU_MULTIPRESSOR_MAX_REGISTERS / \ (block_num_threads * thread_num_registers)) +#define ccl_gpu_kernel_threads(block_num_threads) \ + extern "C" __global__ void __launch_bounds__(block_num_threads) + +#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__) + +#define ccl_gpu_kernel_call(x) x + +/* Define a function object where "func" is the lambda body, and additional parameters are used to + * specify captured state */ +#define ccl_gpu_kernel_lambda(func, ...) 
\ + struct KernelLambda { \ + __VA_ARGS__; \ + __device__ int operator()(const int state) \ + { \ + return (func); \ + } \ + } ccl_gpu_kernel_lambda_pass + /* sanity checks */ #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index 77cea30914c..2fa9b7fed44 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -58,6 +58,96 @@ using namespace metal; #define kernel_assert(cond) +#define ccl_gpu_global_id_x() metal_global_id +#define ccl_gpu_warp_size simdgroup_size +#define ccl_gpu_thread_idx_x simd_group_index +#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1) + +#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate))) +#define ccl_gpu_popc(x) popcount(x) + +// clang-format off + +/* kernel.h adapters */ + +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) +#define ccl_gpu_kernel_threads(block_num_threads) + +/* convert a comma-separated list into a semicolon-separated list (so that we can generate a struct based on kernel entrypoint parameters) */ +#define FN0() +#define FN1(p1) p1; +#define FN2(p1, p2) p1; p2; +#define FN3(p1, p2, p3) p1; p2; p3; +#define FN4(p1, p2, p3, p4) p1; p2; p3; p4; +#define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5; +#define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6; +#define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7; +#define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8; +#define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9; +#define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; +#define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; +#define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; +#define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; +#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; +#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; +#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; +#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, ...) p16 +#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0) + +/* generate a struct containing the entrypoint parameters and a "run" method which can access them implicitly via this-> */ +#define ccl_gpu_kernel_signature(name, ...) 
\ +struct kernel_gpu_##name \ +{ \ + PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \ + void run(thread MetalKernelContext& context, \ + threadgroup int *simdgroup_offset, \ + const uint metal_global_id, \ + const ushort metal_local_id, \ + const ushort metal_local_size, \ + uint simdgroup_size, \ + uint simd_lane_index, \ + uint simd_group_index, \ + uint num_simd_groups) ccl_global const; \ +}; \ +kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \ + constant KernelParamsMetal &ccl_restrict _launch_params_metal, \ + constant MetalAncillaries *_metal_ancillaries, \ + threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \ + const uint metal_global_id [[thread_position_in_grid]], \ + const ushort metal_local_id [[thread_position_in_threadgroup]], \ + const ushort metal_local_size [[threads_per_threadgroup]], \ + uint simdgroup_size [[threads_per_simdgroup]], \ + uint simd_lane_index [[thread_index_in_simdgroup]], \ + uint simd_group_index [[simdgroup_index_in_threadgroup]], \ + uint num_simd_groups [[simdgroups_per_threadgroup]]) { \ + MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \ + INIT_DEBUG_BUFFER \ + params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \ +} \ +void kernel_gpu_##name::run(thread MetalKernelContext& context, \ + threadgroup int *simdgroup_offset, \ + const uint metal_global_id, \ + const ushort metal_local_id, \ + const ushort metal_local_size, \ + uint simdgroup_size, \ + uint simd_lane_index, \ + uint simd_group_index, \ + uint num_simd_groups) ccl_global const + +#define ccl_gpu_kernel_call(x) context.x + +/* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state */ +#define ccl_gpu_kernel_lambda(func, ...) 
\ + struct KernelLambda \ + { \ + KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \ + ccl_private MetalKernelContext &context; \ + __VA_ARGS__; \ + int operator()(const int state) const { return (func); } \ + } ccl_gpu_kernel_lambda_pass(context) + +// clang-format on + /* make_type definitions with Metal style element initializers */ #ifdef make_float2 # undef make_float2 @@ -124,3 +214,38 @@ using namespace metal; #define logf(x) trigmode::log(float(x)) #define NULL 0 + +/* texture bindings and sampler setup */ + +struct Texture2DParamsMetal { + texture2d<float, access::sample> tex; +}; +struct Texture3DParamsMetal { + texture3d<float, access::sample> tex; +}; + +struct MetalAncillaries { + device Texture2DParamsMetal *textures_2d; + device Texture3DParamsMetal *textures_3d; +}; + +enum SamplerType { + SamplerFilterNearest_AddressRepeat, + SamplerFilterNearest_AddressClampEdge, + SamplerFilterNearest_AddressClampZero, + + SamplerFilterLinear_AddressRepeat, + SamplerFilterLinear_AddressClampEdge, + SamplerFilterLinear_AddressClampZero, + + SamplerCount +}; + +constant constexpr array<sampler, SamplerCount> metal_samplers = { + sampler(address::repeat, filter::nearest), + sampler(address::clamp_to_edge, filter::nearest), + sampler(address::clamp_to_zero, filter::nearest), + sampler(address::repeat, filter::linear), + sampler(address::clamp_to_edge, filter::linear), + sampler(address::clamp_to_zero, filter::linear), +}; diff --git a/intern/cycles/kernel/device/metal/context_begin.h b/intern/cycles/kernel/device/metal/context_begin.h new file mode 100644 index 00000000000..3b5fcdd1f7f --- /dev/null +++ b/intern/cycles/kernel/device/metal/context_begin.h @@ -0,0 +1,79 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// clang-format off + +/* Open the Metal kernel context class + * Necessary to access resource bindings */ +class MetalKernelContext { + public: + constant KernelParamsMetal &launch_params_metal; + constant MetalAncillaries *metal_ancillaries; + + MetalKernelContext(constant KernelParamsMetal &_launch_params_metal, constant MetalAncillaries * _metal_ancillaries) + : launch_params_metal(_launch_params_metal), metal_ancillaries(_metal_ancillaries) + {} + + /* texture fetch adapter functions */ + typedef uint64_t ccl_gpu_tex_object; + + template<typename T> + inline __attribute__((__always_inline__)) + T ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const { + kernel_assert(0); + return 0; + } + template<typename T> + inline __attribute__((__always_inline__)) + T ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const { + kernel_assert(0); + return 0; + } + + // texture2d + template<> + inline __attribute__((__always_inline__)) + float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)); + } + template<> + inline __attribute__((__always_inline__)) + float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x; + } + + // texture3d + template<> + inline __attribute__((__always_inline__)) + float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)); + } + template<> + inline __attribute__((__always_inline__)) + float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x; + } +# include "kernel/device/gpu/image.h" + + // clang-format on
\ No newline at end of file diff --git a/intern/cycles/kernel/device/metal/context_end.h b/intern/cycles/kernel/device/metal/context_end.h new file mode 100644 index 00000000000..811abdec150 --- /dev/null +++ b/intern/cycles/kernel/device/metal/context_end.h @@ -0,0 +1,23 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +} +; /* end of MetalKernelContext class definition */ + +/* Silently redirect into the MetalKernelContext instance */ +/* NOTE: These macros will need maintaining as entrypoints change */ + +#undef kernel_integrator_state +#define kernel_integrator_state context.launch_params_metal.__integrator_state diff --git a/intern/cycles/kernel/device/metal/globals.h b/intern/cycles/kernel/device/metal/globals.h new file mode 100644 index 00000000000..b4963518b63 --- /dev/null +++ b/intern/cycles/kernel/device/metal/globals.h @@ -0,0 +1,51 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#include "kernel/types.h" +#include "kernel/util/profiling.h" + +#include "kernel/integrator/state.h" + +CCL_NAMESPACE_BEGIN + +typedef struct KernelParamsMetal { + +#define KERNEL_TEX(type, name) ccl_constant type *name; +#include "kernel/textures.h" +#undef KERNEL_TEX + + const IntegratorStateGPU __integrator_state; + const KernelData data; + +} KernelParamsMetal; + +typedef struct KernelGlobalsGPU { + int unused[1]; +} KernelGlobalsGPU; + +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; + +#define kernel_data launch_params_metal.data +#define kernel_integrator_state launch_params_metal.__integrator_state + +/* data lookup defines */ + +#define kernel_tex_fetch(tex, index) launch_params_metal.tex[index] +#define kernel_tex_array(tex) launch_params_metal.tex + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal new file mode 100644 index 00000000000..feca20ff475 --- /dev/null +++ b/intern/cycles/kernel/device/metal/kernel.metal @@ -0,0 +1,25 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Metal kernel entry points */ + +// clang-format off + +#include "kernel/device/metal/compat.h" +#include "kernel/device/metal/globals.h" +#include "kernel/device/gpu/kernel.h" + +// clang-format on
\ No newline at end of file diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h index 835e4621d47..482b921a1a8 100644 --- a/intern/cycles/kernel/device/optix/compat.h +++ b/intern/cycles/kernel/device/optix/compat.h @@ -76,6 +76,7 @@ typedef unsigned long long uint64_t; #define ccl_gpu_block_idx_x (blockIdx.x) #define ccl_gpu_grid_dim_x (gridDim.x) #define ccl_gpu_warp_size (warpSize) +#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) diff --git a/intern/cycles/kernel/film/accumulate.h b/intern/cycles/kernel/film/accumulate.h index 33f913a6746..8b5c0f2a681 100644 --- a/intern/cycles/kernel/film/accumulate.h +++ b/intern/cycles/kernel/film/accumulate.h @@ -141,7 +141,8 @@ ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer( ccl_device_inline int kernel_accum_sample(KernelGlobals kg, ConstIntegratorState state, ccl_global float *ccl_restrict render_buffer, - int sample) + int sample, + int sample_offset) { if (kernel_data.film.pass_sample_count == PASS_UNUSED) { return sample; @@ -149,7 +150,8 @@ ccl_device_inline int kernel_accum_sample(KernelGlobals kg, ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer); - return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1); + return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1) + + sample_offset; } ccl_device void kernel_accum_adaptive_buffer(KernelGlobals kg, diff --git a/intern/cycles/kernel/integrator/init_from_bake.h b/intern/cycles/kernel/integrator/init_from_bake.h index 4e30563e21b..df1c7fd07e7 100644 --- a/intern/cycles/kernel/integrator/init_from_bake.h +++ b/intern/cycles/kernel/integrator/init_from_bake.h @@ -65,7 +65,8 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg, } /* Always count the sample, even if the camera sample will reject the ray. */ - const int sample = kernel_accum_sample(kg, state, render_buffer, scheduled_sample); + const int sample = kernel_accum_sample( + kg, state, render_buffer, scheduled_sample, tile->sample_offset); /* Setup render buffers. */ const int index = INTEGRATOR_STATE(state, path, render_pixel_index); diff --git a/intern/cycles/kernel/integrator/init_from_camera.h b/intern/cycles/kernel/integrator/init_from_camera.h index f0ba77bd9a6..59dd1a9fa75 100644 --- a/intern/cycles/kernel/integrator/init_from_camera.h +++ b/intern/cycles/kernel/integrator/init_from_camera.h @@ -89,7 +89,8 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg, * This logic allows to both count actual number of samples per pixel, and to add samples to this * pixel after it was converged and samples were added somewhere else (in which case the * `scheduled_sample` will be different from actual number of samples in this pixel). */ - const int sample = kernel_accum_sample(kg, state, render_buffer, scheduled_sample); + const int sample = kernel_accum_sample( + kg, state, render_buffer, scheduled_sample, tile->sample_offset); /* Initialize random number seed for path. 
*/ const uint rng_hash = path_rng_hash_init(kg, sample, x, y); diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index 2827139d511..4e93e82e971 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -1410,6 +1410,7 @@ typedef struct KernelWorkTile { uint start_sample; uint num_samples; + uint sample_offset; int offset; uint stride; diff --git a/intern/cycles/scene/image.cpp b/intern/cycles/scene/image.cpp index 80091e01b8c..8bb2d87fd1e 100644 --- a/intern/cycles/scene/image.cpp +++ b/intern/cycles/scene/image.cpp @@ -303,7 +303,6 @@ ImageManager::ImageManager(const DeviceInfo &info) animation_frame = 0; /* Set image limits */ - features.has_half_float = info.has_half_images; features.has_nanovdb = info.has_nanovdb; } @@ -357,8 +356,6 @@ void ImageManager::load_image_metadata(Image *img) metadata.detect_colorspace(); - assert(features.has_half_float || - (metadata.type != IMAGE_DATA_TYPE_HALF4 && metadata.type != IMAGE_DATA_TYPE_HALF)); assert(features.has_nanovdb || (metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT || metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3)); diff --git a/intern/cycles/scene/image.h b/intern/cycles/scene/image.h index 6447b028ebf..7cf09dd6d8f 100644 --- a/intern/cycles/scene/image.h +++ b/intern/cycles/scene/image.h @@ -100,7 +100,6 @@ class ImageMetaData { /* Information about supported features that Image loaders can use. */ class ImageDeviceFeatures { public: - bool has_half_float; bool has_nanovdb; }; diff --git a/intern/cycles/scene/image_oiio.cpp b/intern/cycles/scene/image_oiio.cpp index feafae035a1..4cea7fbfb01 100644 --- a/intern/cycles/scene/image_oiio.cpp +++ b/intern/cycles/scene/image_oiio.cpp @@ -30,7 +30,8 @@ OIIOImageLoader::~OIIOImageLoader() { } -bool OIIOImageLoader::load_metadata(const ImageDeviceFeatures &features, ImageMetaData &metadata) +bool OIIOImageLoader::load_metadata(const ImageDeviceFeatures & /*features*/, + ImageMetaData &metadata) { /* Perform preliminary checks, with meaningful logging. */ if (!path_exists(filepath.string())) { @@ -76,7 +77,7 @@ bool OIIOImageLoader::load_metadata(const ImageDeviceFeatures &features, ImageMe } /* check if it's half float */ - if (spec.format == TypeDesc::HALF && features.has_half_float) { + if (spec.format == TypeDesc::HALF) { is_half = true; } diff --git a/intern/cycles/session/session.cpp b/intern/cycles/session/session.cpp index b228939689c..299d731d9f9 100644 --- a/intern/cycles/session/session.cpp +++ b/intern/cycles/session/session.cpp @@ -262,6 +262,7 @@ RenderWork Session::run_update_for_next_iteration() } render_scheduler_.set_num_samples(params.samples); + render_scheduler_.set_start_sample(params.sample_offset); render_scheduler_.set_time_limit(params.time_limit); while (have_tiles) { @@ -397,7 +398,7 @@ void Session::do_delayed_reset() /* Tile and work scheduling. */ tile_manager_.reset_scheduling(buffer_params_, get_effective_tile_size()); - render_scheduler_.reset(buffer_params_, params.samples); + render_scheduler_.reset(buffer_params_, params.samples, params.sample_offset); /* Passes. 
*/ /* When multiple tiles are used SAMPLE_COUNT pass is used to keep track of possible partial diff --git a/intern/cycles/session/session.h b/intern/cycles/session/session.h index 1ec0c6e9bb1..3f73593f008 100644 --- a/intern/cycles/session/session.h +++ b/intern/cycles/session/session.h @@ -54,6 +54,7 @@ class SessionParams { bool experimental; int samples; + int sample_offset; int pixel_size; int threads; @@ -75,6 +76,7 @@ class SessionParams { experimental = false; samples = 1024; + sample_offset = 0; pixel_size = 1; threads = 0; time_limit = 0.0; diff --git a/intern/locale/boost_locale_wrapper.cpp b/intern/locale/boost_locale_wrapper.cpp index ede9377b38f..444b51b5e04 100644 --- a/intern/locale/boost_locale_wrapper.cpp +++ b/intern/locale/boost_locale_wrapper.cpp @@ -26,8 +26,8 @@ static std::string messages_path; static std::string default_domain; static std::string locale_str; -/* Note: We cannot use short stuff like boost::locale::gettext, because those return - * std::basic_string objects, which c_ptr()-returned char* is no more valid +/* NOTE: We cannot use short stuff like `boost::locale::gettext`, because those return + * `std::basic_string` objects, which c_ptr()-returned char* is no more valid * once deleted (which happens as soons they are out of scope of this func). */ typedef boost::locale::message_format<char> char_message_facet; static std::locale locale_global; @@ -63,7 +63,7 @@ static void bl_locale_global_cache() void bl_locale_init(const char *_messages_path, const char *_default_domain) { - // Avoid using ICU backend, we do not need its power and it's rather heavy! + /* Avoid using ICU backend, we do not need its power and it's rather heavy! */ boost::locale::localization_backend_manager lman = boost::locale::localization_backend_manager::global(); #if defined(_WIN32) @@ -81,7 +81,7 @@ void bl_locale_set(const char *locale) { boost::locale::generator gen; std::locale _locale; - // Specify location of dictionaries. + /* Specify location of dictionaries. */ gen.add_messages_path(messages_path); gen.add_messages_domain(default_domain); // gen.set_default_messages_domain(default_domain); @@ -99,12 +99,12 @@ void bl_locale_set(const char *locale) #endif } std::locale::global(_locale); - // Note: boost always uses "C" LC_NUMERIC by default! + /* NOTE: boost always uses "C" LC_NUMERIC by default! */ bl_locale_global_cache(); - // Generate the locale string - // (useful to know which locale we are actually using in case of "default" one). + /* Generate the locale string + * (useful to know which locale we are actually using in case of "default" one). */ #define LOCALE_INFO std::use_facet<boost::locale::info>(_locale) locale_str = LOCALE_INFO.language(); @@ -117,10 +117,9 @@ void bl_locale_set(const char *locale) #undef LOCALE_INFO } - // Extra catch on `std::runtime_error` is needed for macOS/Clang as it seems that exceptions - // like `boost::locale::conv::conversion_error` (which inherit from `std::runtime_error`) are - // not caught by their ancestor `std::exception`. See - // https://developer.blender.org/T88877#1177108 . + /* Extra catch on `std::runtime_error` is needed for macOS/Clang as it seems that exceptions + * like `boost::locale::conv::conversion_error` (which inherit from `std::runtime_error`) are + * not caught by their ancestor `std::exception`. See T88877#1177108 */ catch (std::runtime_error const &e) { std::cout << "bl_locale_set(" << locale << "): " << e.what() << " \n"; } |
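The new KERNEL_FILM_CONVERT_VARIANT kernels dispatch one thread per pixel and recover the 2D coordinates from the flat index, then address the render buffer through pass_stride and the row stride, and the destination through pixel_stride. Below is a minimal CPU-side sketch of that addressing, assuming a simplified FilmConvertParams stand-in for KernelFilmConvert; the per-variant film_get_pass_pixel_*() call is replaced by a plain copy.

    #include <vector>

    /* Simplified stand-in for KernelFilmConvert: only the strides used for addressing. */
    struct FilmConvertParams {
      int pass_stride;  /* floats per pixel in the render buffer */
      int pixel_stride; /* floats per pixel in the output buffer */
    };

    /* CPU reference of the per-thread index math in the film_convert_* kernels. */
    static void film_convert_reference(const FilmConvertParams &p,
                                       std::vector<float> &pixels,
                                       const std::vector<float> &render_buffer,
                                       int num_pixels,
                                       int width,
                                       int offset,
                                       int stride,
                                       int rgba_offset)
    {
      for (int render_pixel_index = 0; render_pixel_index < num_pixels; render_pixel_index++) {
        /* Same x/y recovery as the 1D GPU dispatch. */
        const int x = render_pixel_index % width;
        const int y = render_pixel_index / width;

        const float *buffer = render_buffer.data() + offset + x * p.pass_stride +
                              y * stride * p.pass_stride;
        float *pixel = pixels.data() + (render_pixel_index + rgba_offset) * p.pixel_stride;

        /* The real kernels call film_get_pass_pixel_<variant>() here. */
        pixel[0] = buffer[0];
      }
    }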
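The *_half_rgba variants fetch the pass into a local float[4], broadcast single-channel passes to RGB, force alpha to one for anything with at most three input channels, and only then convert to half4 for display. A small CPU reference of that channel normalization (the overlay and half-conversion steps are omitted):

    #include <array>

    /* Mirrors the channel expansion in film_convert_*_half_rgba:
     * 1-channel passes are broadcast to RGB, passes with <= 3 channels get alpha 1. */
    static std::array<float, 4> expand_to_rgba(const float *pass_pixel, int input_channel_count)
    {
      std::array<float, 4> pixel = {0.0f, 0.0f, 0.0f, 0.0f};
      for (int i = 0; i < input_channel_count && i < 4; i++) {
        pixel[i] = pass_pixel[i];
      }
      if (input_channel_count == 1) {
        pixel[1] = pixel[2] = pixel[0];
      }
      if (input_channel_count <= 3) {
        pixel[3] = 1.0f;
      }
      return pixel;
    }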
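In parallel_active_index.h, gpu_parallel_active_index_array compacts active state indices with a warp ballot, a popcount over the preceding lanes (via ccl_gpu_thread_mask), per-warp totals in shared memory, a serial exclusive scan over warp totals by the last thread in the block, and a single atomic per block. The following sequential CPU sketch reproduces the same counting scheme, assuming a 32-lane warp and a single block:

    #include <algorithm>
    #include <vector>

    /* Sequential reference of the ballot/popcount compaction: returns the number of
     * active states and writes their indices in the same order the GPU version would. */
    static int active_index_reference(const std::vector<int> &is_active, std::vector<int> &indices)
    {
      const int warp_size = 32;
      const int n = int(is_active.size());
      const int num_warps = (n + warp_size - 1) / warp_size;

      /* Per-warp totals, as written to shared memory by the last lane of each warp. */
      std::vector<int> warp_offset(num_warps, 0);
      for (int w = 0; w < num_warps; w++) {
        for (int i = w * warp_size; i < std::min(n, (w + 1) * warp_size); i++) {
          warp_offset[w] += is_active[i];
        }
      }

      /* Serial exclusive scan, as done by the last thread in the block. */
      int offset = 0;
      for (int w = 0; w < num_warps; w++) {
        const int num_active = warp_offset[w];
        warp_offset[w] = offset;
        offset += num_active;
      }
      const int block_num_active = offset; /* what the atomic add reserves globally */

      /* Each active lane writes at warp_offset[warp] plus the count of preceding active
       * lanes, i.e. popc(ballot(is_active) & thread_mask(lane)). */
      indices.assign(block_num_active, -1);
      for (int w = 0; w < num_warps; w++) {
        int thread_offset = 0;
        for (int i = w * warp_size; i < std::min(n, (w + 1) * warp_size); i++) {
          if (is_active[i]) {
            indices[warp_offset[w] + thread_offset] = i;
            thread_offset++;
          }
        }
      }
      return block_num_active;
    }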
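The new ccl_gpu_thread_mask(thread_warp) macro builds the mask of lanes below the calling lane, so that popcount(ballot & mask) counts preceding active lanes. The CUDA/HIP/OptiX form shifts a 32-bit all-ones constant, while the Metal form uses a 64-bit (1 << lane) - 1; the small check below confirms the two agree for a 32-wide warp (lane 0 is skipped only to keep the host-side shift well defined here, since its mask is empty either way):

    #include <cassert>
    #include <cstdint>

    int main()
    {
      const uint32_t warp_size = 32;

      for (uint32_t thread_warp = 1; thread_warp < warp_size; thread_warp++) {
        const uint32_t mask_shift = uint32_t(0xFFFFFFFFu >> (warp_size - thread_warp)); /* CUDA/HIP/OptiX form */
        const uint64_t mask_low_bits = (1ull << thread_warp) - 1;                       /* Metal form */
        assert(mask_shift == uint32_t(mask_low_bits));
      }
      return 0;
    }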
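ccl_gpu_kernel_lambda(func, ...) declares a local function object: the extra macro arguments become member fields (the captured state) and func becomes the body of operator()(int state); the Metal variant additionally stores a reference to the MetalKernelContext. A hand-expanded C++ equivalent, using a hypothetical predicate over per-state flags purely as an illustration (the names here are not from the diff):

    #include <vector>

    /* Hand-expanded equivalent of a ccl_gpu_kernel_lambda() whose captured state is a
     * flags array and a kernel index: captures become fields, the expression becomes
     * the body of operator(). */
    struct KernelLambdaExample {
      const int *state_flags;
      int kernel_index;

      int operator()(const int state) const
      {
        return state_flags[state] == kernel_index;
      }
    };

    /* Such a predicate object is what gets passed as the IsActiveOp/GetKeyOp argument
     * of the parallel index helpers. */
    static int count_matching(const std::vector<int> &flags, int kernel_index)
    {
      KernelLambdaExample is_active = {flags.data(), kernel_index};
      int count = 0;
      for (int state = 0; state < int(flags.size()); state++) {
        count += is_active(state);
      }
      return count;
    }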
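On Metal, ccl_gpu_kernel_signature turns each entry point into a struct: the parameters become fields, the kernel body becomes a run() method, and a thin [[kernel]] trampoline constructs a MetalKernelContext and forwards the thread position. A host-compilable sketch of the same shape, with a loop standing in for the grid (the names and the trivial body are illustrative, not generated code):

    /* Illustrative stand-in for one generated kernel_gpu_<name> struct. */
    struct kernel_gpu_example {
      float *output; /* entry-point parameters become fields */
      int work_size;

      void run(const int global_id) const
      {
        if (global_id < work_size) {
          output[global_id] = float(global_id);
        }
      }
    };

    /* Stand-in for the kernel_metal_<name> trampoline: Metal receives the parameter
     * struct by pointer and forwards thread indices; here a plain loop plays the grid. */
    static void launch_example(const kernel_gpu_example &params, const int num_threads)
    {
      for (int global_id = 0; global_id < num_threads; global_id++) {
        params.run(global_id);
      }
    }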
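The FNn/GET_LAST_ARG/PARAMS_MAKER macros in metal/compat.h are the usual variadic argument-counting trick: the argument list is padded with FN16..FN0 so that the helper matching the real argument count is selected, rewriting the comma-separated parameter list into semicolon-terminated field declarations. A trimmed version capped at three parameters, just to show the mechanism:

    /* Reduced comma-to-semicolon expansion (the real one goes up to FN16). */
    #define FN0()
    #define FN1(p1) p1;
    #define FN2(p1, p2) p1; p2;
    #define FN3(p1, p2, p3) p1; p2; p3;
    #define GET_LAST_ARG(p0, p1, p2, p3, ...) p3
    #define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN3, FN2, FN1, FN0)

    /* PARAMS_MAKER(int foo, float bar) selects FN2, so the struct body expands to
     * "int foo; float bar;", turning kernel parameters into struct fields. */
    struct ExampleParams {
      PARAMS_MAKER(int foo, float bar)(int foo, float bar)
    };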
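In metal/context_begin.h the ccl_gpu_tex_object handle is a 64-bit value: the texture-fetch adapters take the texture slot from the low 32 bits (tid) and the sampler index into metal_samplers from the high 32 bits (sid). A tiny sketch of that packing convention; the make_tex_handle() helper is hypothetical, since only the unpacking side appears in the diff:

    #include <cassert>
    #include <cstdint>

    /* Hypothetical packing side of the 64-bit Metal texture handle:
     * low 32 bits = texture slot, high 32 bits = sampler index. */
    static uint64_t make_tex_handle(uint32_t texture_slot, uint32_t sampler_index)
    {
      return uint64_t(texture_slot) | (uint64_t(sampler_index) << 32);
    }

    int main()
    {
      const uint64_t tex = make_tex_handle(/*texture_slot*/ 7, /*sampler_index*/ 3);

      /* Unpacking as done in ccl_gpu_tex_object_read_2D/3D. */
      const uint32_t tid = uint32_t(tex);
      const uint32_t sid = uint32_t(tex >> 32);

      assert(tid == 7 && sid == 3);
      return 0;
    }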
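The sample_offset plumbing runs from SessionParams through render_scheduler_.set_start_sample() and the KernelWorkTile down to kernel_accum_sample(): the per-pixel sample counter in the render buffer still counts from zero, but the returned sample index is shifted by the tile's sample_offset (and, as before, the scheduled sample is returned unchanged when there is no sample-count pass). A CPU sketch of that accumulation step:

    #include <atomic>

    /* CPU sketch of the adjusted kernel_accum_sample(): fetch_add() returns the
     * previous count, matching atomic_fetch_and_add_uint32, and the offset shifts
     * the sample index seen by the rest of the path initialization. */
    static int accum_sample_reference(std::atomic<unsigned> &pass_sample_count, const int sample_offset)
    {
      return int(pass_sample_count.fetch_add(1)) + sample_offset;
    }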