18 files changed, 178 insertions, 97 deletions
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 18aa5dda70d..a4788c437a1 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -78,6 +78,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
       integrator_shader_raytrace_sort_counter_(
           device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+      integrator_shader_sort_prefix_sum_(
+          device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
       integrator_next_shadow_path_index_(
           device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
       integrator_next_shadow_catcher_path_index_(
@@ -200,6 +202,9 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
   integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
   integrator_shader_raytrace_sort_counter_.zero_to_device();
 
+  integrator_shader_sort_prefix_sum_.alloc(max_shaders);
+  integrator_shader_sort_prefix_sum_.zero_to_device();
+
   integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
       (int *)integrator_shader_sort_counter_.device_pointer;
   integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
@@ -374,9 +379,12 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
   /* For kernels that add shadow paths, check if there is enough space available.
    * If not, schedule shadow kernels first to clear out the shadow paths. */
+  int num_paths_limit = INT_MAX;
+
   if (kernel_creates_shadow_paths(kernel)) {
-    if (max_num_paths_ - integrator_next_shadow_path_index_.data()[0] <
-        queue_counter->num_queued[kernel]) {
+    const int available_shadow_paths = max_num_paths_ -
+                                       integrator_next_shadow_path_index_.data()[0];
+    if (available_shadow_paths < queue_counter->num_queued[kernel]) {
       if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
         enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
         return true;
@@ -386,10 +394,14 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
         return true;
       }
     }
+    else if (kernel_creates_ao_paths(kernel)) {
+      /* AO kernel creates two shadow paths, so limit number of states to schedule. */
+      num_paths_limit = available_shadow_paths / 2;
+    }
   }
 
   /* Schedule kernel with maximum number of queued items. */
-  enqueue_path_iteration(kernel);
+  enqueue_path_iteration(kernel, num_paths_limit);
 
   /* Update next shadow path index for kernels that can add shadow paths. */
   if (kernel_creates_shadow_paths(kernel)) {
@@ -399,7 +411,7 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
   return true;
 }
 
-void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
+void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit)
 {
   void *d_path_index = (void *)NULL;
 
@@ -414,7 +426,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
       work_size = num_queued;
       d_path_index = (void *)queued_paths_.device_pointer;
 
-      compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel);
+      compute_sorted_queued_paths(
+          DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
     }
     else if (num_queued < work_size) {
       work_size = num_queued;
@@ -430,6 +443,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
     }
   }
 
+  work_size = min(work_size, num_paths_limit);
+
   DCHECK_LE(work_size, max_num_paths_);
 
   switch (kernel) {
@@ -464,17 +479,20 @@
   }
 }
 
-void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
+                                                   DeviceKernel queued_kernel,
+                                                   const int num_paths_limit)
 {
   int d_queued_kernel = queued_kernel;
   void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
-  assert(d_counter != nullptr);
+  void *d_prefix_sum = (void *)integrator_shader_sort_prefix_sum_.device_pointer;
+  assert(d_counter != nullptr && d_prefix_sum != nullptr);
 
   /* Compute prefix sum of number of active paths with each shader. */
   {
     const int work_size = 1;
     int max_shaders = device_scene_->data.max_shaders;
-    void *args[] = {&d_counter, &max_shaders};
+    void *args[] = {&d_counter, &d_prefix_sum, &max_shaders};
     queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
   }
 
@@ -483,29 +501,24 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKe
 
   /* Launch kernel to fill the active paths arrays. */
   {
     /* TODO: this could be smaller for terminated paths based on amount of work we want
-     * to schedule. */
+     * to schedule, and also based on num_paths_limit.
+     *
+     * Also, when the number of paths is limited it may be better to prefer paths from the
+     * end of the array since compaction would need to do less work. */
    const int work_size = kernel_max_active_path_index(queued_kernel);
 
    void *d_queued_paths = (void *)queued_paths_.device_pointer;
    void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
    void *args[] = {const_cast<int *>(&work_size),
+                   const_cast<int *>(&num_paths_limit),
                    &d_queued_paths,
                    &d_num_queued_paths,
                    &d_counter,
+                   &d_prefix_sum,
                    &d_queued_kernel};
 
    queue_->enqueue(kernel, work_size, args);
   }
-
-  if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
-    queue_->zero_to_device(integrator_shader_sort_counter_);
-  }
-  else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
-    queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
-  }
-  else {
-    assert(0);
-  }
 }
 
 void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
@@ -1026,6 +1039,13 @@ bool PathTraceWorkGPU::kernel_creates_shadow_paths(DeviceKernel kernel)
           kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
 }
 
+bool PathTraceWorkGPU::kernel_creates_ao_paths(DeviceKernel kernel)
+{
+  return (device_scene_->data.film.pass_ao != PASS_UNUSED) &&
+         (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
+}
+
 bool PathTraceWorkGPU::kernel_is_shadow_path(DeviceKernel kernel)
 {
   return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
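The scheduling change above caps how many states get launched when the AO pass is enabled: each surface-shade state can now branch off both a light shadow path and an AO shadow path, so only half of the remaining shadow-path slots can safely be handed out in one iteration. A minimal standalone sketch of that budget (the helper and parameter names are hypothetical, not part of the patch):

    #include <climits>

    /* Hypothetical helper: an AO-capable shade state may spawn one light
     * shadow path and one AO path, so at most half the free slots are usable. */
    static int ao_num_paths_limit(const int max_num_paths,
                                  const int next_shadow_path_index,
                                  const bool creates_ao_paths)
    {
      const int available_shadow_paths = max_num_paths - next_shadow_path_index;
      return creates_ao_paths ? available_shadow_paths / 2 : INT_MAX;
    }

enqueue_path_iteration() then clamps its launch size with work_size = min(work_size, num_paths_limit), as in the hunk above.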
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index dd2c1c197ae..e1f6c09d334 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -79,10 +79,12 @@ class PathTraceWorkGPU : public PathTraceWork {
                               const int num_predicted_splits);
 
   bool enqueue_path_iteration();
-  void enqueue_path_iteration(DeviceKernel kernel);
+  void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);
 
   void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
-  void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+  void compute_sorted_queued_paths(DeviceKernel kernel,
+                                   DeviceKernel queued_kernel,
+                                   const int num_paths_limit);
 
   void compact_states(const int num_active_paths);
 
@@ -116,6 +118,7 @@ class PathTraceWorkGPU : public PathTraceWork {
   /* Kernel properties. */
   bool kernel_uses_sorting(DeviceKernel kernel);
   bool kernel_creates_shadow_paths(DeviceKernel kernel);
+  bool kernel_creates_ao_paths(DeviceKernel kernel);
   bool kernel_is_shadow_path(DeviceKernel kernel);
   int kernel_max_active_path_index(DeviceKernel kernel);
 
@@ -136,6 +139,7 @@ class PathTraceWorkGPU : public PathTraceWork {
   /* Shader sorting. */
   device_vector<int> integrator_shader_sort_counter_;
   device_vector<int> integrator_shader_raytrace_sort_counter_;
+  device_vector<int> integrator_shader_sort_prefix_sum_;
 
   /* Path split. */
   device_vector<int> integrator_next_shadow_path_index_;
   device_vector<int> integrator_next_shadow_catcher_path_index_;
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index b6df74e835a..fcb398f7e6d 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -282,11 +282,22 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
 }
 
 extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_sorted_paths_array(
-        int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel)
+    kernel_gpu_integrator_sorted_paths_array(int num_states,
+                                             int num_states_limit,
+                                             int *indices,
+                                             int *num_indices,
+                                             int *key_counter,
+                                             int *key_prefix_sum,
+                                             int kernel)
 {
   gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) {
+      num_states,
+      num_states_limit,
+      indices,
+      num_indices,
+      key_counter,
+      key_prefix_sum,
+      [kernel](const int state) {
         return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ?
                    INTEGRATOR_STATE(state, path, shader_sort_key) :
                    GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
@@ -322,9 +333,10 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
 }
 
 extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_prefix_sum(int *values, int num_values)
+    kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
 {
-  gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values);
+  gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(
+      counter, prefix_sum, num_values);
 }
 
 /* --------------------------------------------------------------------
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
index a1349e82efb..aabe6e2e27a 100644
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -33,7 +33,8 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
 #endif
 
-template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
+template<uint blocksize>
+__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values)
 {
   if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
     return;
@@ -41,8 +42,9 @@ template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, co
 
   int offset = 0;
   for (int i = 0; i < num_values; i++) {
-    const int new_offset = offset + values[i];
-    values[i] = offset;
+    const int new_offset = offset + counter[i];
+    prefix_sum[i] = offset;
+    counter[i] = 0;
     offset = new_offset;
   }
 }
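The reworked prefix-sum kernel writes the exclusive prefix sum into a dedicated buffer and clears the counters in the same pass, which is what lets compute_sorted_queued_paths() drop the separate zero_to_device() calls removed above. A CPU-side sketch of the assumed semantics:

    /* CPU model of the single-thread GPU kernel: exclusive prefix sum into a
     * separate buffer, resetting the counters for the next scheduling round. */
    static void prefix_sum_and_reset(int *counter, int *prefix_sum, const int num_values)
    {
      int offset = 0;
      for (int i = 0; i < num_values; i++) {
        const int new_offset = offset + counter[i];
        prefix_sum[i] = offset; /* First queue slot for sort key i. */
        counter[i] = 0;         /* Cleared so deferred states can be re-counted. */
        offset = new_offset;
      }
    }

For per-shader counts {3, 1, 2} this produces bucket starts {0, 3, 4} and leaves all counters at zero.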
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
index 9bca1fad22f..7570c5a6bbd 100644
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -35,8 +35,10 @@ CCL_NAMESPACE_BEGIN
 
 template<uint blocksize, typename GetKeyOp>
 __device__ void gpu_parallel_sorted_index_array(const uint num_states,
+                                                const int num_states_limit,
                                                 int *indices,
                                                 int *num_indices,
+                                                int *key_counter,
                                                 int *key_prefix_sum,
                                                 GetKeyOp get_key_op)
 {
@@ -46,7 +48,15 @@ __device__ void gpu_parallel_sorted_index_array(const uint num_states,
 
   if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
     const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
-    indices[index] = state_index;
+    if (index < num_states_limit) {
+      /* Assign state index. */
+      indices[index] = state_index;
+    }
+    else {
+      /* Can't process this state now, increase the counter again so that
+       * it will be handled in another iteration. */
+      atomic_fetch_and_add_uint32(&key_counter[key], 1);
+    }
   }
 }
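With the new num_states_limit parameter, a state whose atomically reserved slot falls past the limit is not dropped: its key count is re-incremented so the next sorted-queue build picks it up. An illustrative CPU model of that overflow rule using std::atomic (names are mine, not the kernel's):

    #include <atomic>

    /* A slot past the limit is re-counted instead of written, deferring the
     * state to a later scheduling iteration. */
    static void sorted_index_insert(const int state_index,
                                    const int key,
                                    const int num_states_limit,
                                    int *indices,
                                    std::atomic<int> *key_prefix_sum,
                                    std::atomic<int> *key_counter)
    {
      const int index = key_prefix_sum[key].fetch_add(1);
      if (index < num_states_limit) {
        indices[index] = state_index;
      }
      else {
        key_counter[key].fetch_add(1);
      }
    }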
diff --git a/intern/cycles/kernel/integrator/integrator_init_from_bake.h b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
index 9bc115150ff..de916be24e7 100644
--- a/intern/cycles/kernel/integrator/integrator_init_from_bake.h
+++ b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
@@ -185,7 +185,7 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
     /* Setup next kernel to execute. */
     const int shader_index = shader & SHADER_MASK;
     const int shader_flags = kernel_tex_fetch(__shaders, shader_index).flags;
-    if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+    if (shader_flags & SD_HAS_RAYTRACE) {
       INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index);
     }
     else {
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_closest.h b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
index ef8dcb50115..c1315d48694 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_closest.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
@@ -111,8 +111,7 @@ ccl_device_forceinline void integrator_intersect_shader_next_kernel(
    * Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for
    * the matte path. */
-  const bool use_raytrace_kernel = ((shader_flags & SD_HAS_RAYTRACE) ||
-                                    (kernel_data.film.pass_ao != PASS_UNUSED));
+  const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);
 
   if (use_raytrace_kernel) {
     INTEGRATOR_PATH_NEXT_SORTED(
diff --git a/intern/cycles/kernel/integrator/integrator_megakernel.h b/intern/cycles/kernel/integrator/integrator_megakernel.h
index 6e3220aa3b7..21a483a792b 100644
--- a/intern/cycles/kernel/integrator/integrator_megakernel.h
+++ b/intern/cycles/kernel/integrator/integrator_megakernel.h
@@ -34,16 +34,12 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
                                       ccl_global float *ccl_restrict render_buffer)
 {
   /* Each kernel indicates the next kernel to execute, so here we simply
-   * have to check what that kernel is and execute it.
-   *
-   * TODO: investigate if we can use device side enqueue for GPUs to avoid
-   * having to compile this big kernel. */
+   * have to check what that kernel is and execute it. */
   while (true) {
+    /* Handle any shadow paths before we potentially create more shadow paths. */
     const uint32_t shadow_queued_kernel = INTEGRATOR_STATE(
         &state->shadow, shadow_path, queued_kernel);
-
     if (shadow_queued_kernel) {
-      /* First handle any shadow paths before we potentially create more shadow paths. */
       switch (shadow_queued_kernel) {
         case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
           integrator_intersect_shadow(kg, &state->shadow);
@@ -55,10 +51,30 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
           kernel_assert(0);
           break;
       }
+      continue;
+    }
+
+    /* Handle any AO paths before we potentially create more AO paths. */
+    const uint32_t ao_queued_kernel = INTEGRATOR_STATE(&state->ao, shadow_path, queued_kernel);
+    if (ao_queued_kernel) {
+      switch (ao_queued_kernel) {
+        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+          integrator_intersect_shadow(kg, &state->ao);
+          break;
+        case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+          integrator_shade_shadow(kg, &state->ao, render_buffer);
+          break;
+        default:
+          kernel_assert(0);
+          break;
+      }
+      continue;
     }
-    else if (INTEGRATOR_STATE(state, path, queued_kernel)) {
-      /* Then handle regular path kernels. */
-      switch (INTEGRATOR_STATE(state, path, queued_kernel)) {
+
+    /* Then handle regular path kernels. */
+    const uint32_t queued_kernel = INTEGRATOR_STATE(state, path, queued_kernel);
+    if (queued_kernel) {
+      switch (queued_kernel) {
         case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
           integrator_intersect_closest(kg, state);
           break;
@@ -87,10 +103,10 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
           kernel_assert(0);
           break;
       }
+      continue;
     }
-    else {
-      break;
-    }
+
+    break;
   }
 }
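The restructured megakernel drains each dependent queue before the main path is allowed to spawn more work: shadow paths first, then AO paths, then the regular path kernel, with each handled kernel restarting the loop. A hypothetical standalone skeleton of that dispatch order:

    #include <functional>

    /* Each handler runs one queued kernel and returns false when its queue
     * is empty; names and signatures are illustrative only. */
    static void megakernel_dispatch(const std::function<bool()> &run_shadow,
                                    const std::function<bool()> &run_ao,
                                    const std::function<bool()> &run_main)
    {
      while (true) {
        if (run_shadow()) continue; /* Drain shadow paths first. */
        if (run_ao()) continue;     /* Then AO paths. */
        if (run_main()) continue;   /* Only then advance the main path. */
        break;                      /* All queues empty: the path is done. */
      }
    }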
diff --git a/intern/cycles/kernel/integrator/integrator_shade_background.h b/intern/cycles/kernel/integrator/integrator_shade_background.h
index d98e53e6bbf..287c54d7243 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_background.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_background.h
@@ -198,7 +198,7 @@ ccl_device void integrator_shade_background(KernelGlobals kg,
     const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim, isect_type);
     const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
 
-    if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+    if (shader_flags & SD_HAS_RAYTRACE) {
       INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
                                   DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
                                   shader);
diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h
index 0108ba1373c..2fb0dcc2097 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_surface.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h
@@ -168,7 +168,8 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
   const bool is_light = light_sample_is_light(&ls);
 
   /* Branch off shadow kernel. */
-  INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+  INTEGRATOR_SHADOW_PATH_INIT(
+      shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow);
 
   /* Copy volume stack and enter/exit volume. */
   integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
@@ -324,26 +325,14 @@ ccl_device_forceinline bool integrate_surface_volume_only_bounce(IntegratorState
 }
 #endif
 
-#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+#if defined(__AO__)
 ccl_device_forceinline void integrate_surface_ao_pass(
     KernelGlobals kg,
-    ConstIntegratorState state,
-    ccl_private const ShaderData *ccl_restrict sd,
-    ccl_private const RNGState *ccl_restrict rng_state,
-    ccl_global float *ccl_restrict render_buffer)
-{
-# ifdef __KERNEL_OPTIX__
-  optixDirectCall<void>(2, kg, state, sd, rng_state, render_buffer);
-}
-
-extern "C" __device__ void __direct_callable__ao_pass(
-    KernelGlobals kg,
-    ConstIntegratorState state,
+    IntegratorState state,
     ccl_private const ShaderData *ccl_restrict sd,
     ccl_private const RNGState *ccl_restrict rng_state,
     ccl_global float *ccl_restrict render_buffer)
 {
-# endif /* __KERNEL_OPTIX__ */
   float bsdf_u, bsdf_v;
   path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
@@ -352,24 +341,48 @@ extern "C" __device__ void __direct_callable__ao_pass(
   float ao_pdf;
   sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-  if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
-    Ray ray ccl_optional_struct_init;
-    ray.P = ray_offset(sd->P, sd->Ng);
-    ray.D = ao_D;
-    ray.t = kernel_data.integrator.ao_bounces_distance;
-    ray.time = sd->time;
-    ray.dP = differential_zero_compact();
-    ray.dD = differential_zero_compact();
-
-    Intersection isect ccl_optional_struct_init;
-    if (!scene_intersect(kg, &ray, PATH_RAY_SHADOW_OPAQUE, &isect)) {
-      ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
-      const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-      kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, throughput);
-    }
+  if (!(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f)) {
+    return;
   }
+
+  Ray ray ccl_optional_struct_init;
+  ray.P = ray_offset(sd->P, sd->Ng);
+  ray.D = ao_D;
+  ray.t = kernel_data.integrator.ao_bounces_distance;
+  ray.time = sd->time;
+  ray.dP = differential_zero_compact();
+  ray.dD = differential_zero_compact();
+
+  /* Branch off shadow kernel. */
+  INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, ao);
+
+  /* Copy volume stack and enter/exit volume. */
+  integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
+
+  /* Write shadow ray and associated state to global memory. */
+  integrator_state_write_shadow_ray(kg, shadow_state, &ray);
+
+  /* Copy state from main path to shadow path. */
+  const uint16_t bounce = INTEGRATOR_STATE(state, path, bounce);
+  const uint16_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
+  uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag) | PATH_RAY_SHADOW_FOR_AO;
+  const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * shader_bsdf_alpha(kg, sd);
+
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
+      state, path, render_pixel_index);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_offset) =
+      INTEGRATOR_STATE(state, path, rng_offset) -
+      PRNG_BOUNCE_NUM * INTEGRATOR_STATE(state, path, transparent_bounce);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_hash) = INTEGRATOR_STATE(
+      state, path, rng_hash);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, sample) = INTEGRATOR_STATE(
+      state, path, sample);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, flag) = shadow_flag;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, bounce) = bounce;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transparent_bounce) = transparent_bounce;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, throughput) = throughput;
 }
-#endif /* defined(__AO__) && defined(__SHADER_RAYTRACE__) */
+#endif /* defined(__AO__) */
 
 template<uint node_feature_mask>
 ccl_device bool integrate_surface(KernelGlobals kg,
@@ -474,14 +487,12 @@ ccl_device bool integrate_surface(KernelGlobals kg,
     PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT);
     integrate_surface_direct_light(kg, state, &sd, &rng_state);
 
-#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+#if defined(__AO__)
     /* Ambient occlusion pass. */
-    if (node_feature_mask & KERNEL_FEATURE_NODE_RAYTRACE) {
-      if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
-          (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_CAMERA)) {
-        PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
-        integrate_surface_ao_pass(kg, state, &sd, &rng_state, render_buffer);
-      }
+    if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
+        (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_CAMERA)) {
+      PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
+      integrate_surface_ao_pass(kg, state, &sd, &rng_state, render_buffer);
     }
 #endif
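When branching off the AO shadow path above, the main path's RNG offset is rewound by PRNG_BOUNCE_NUM for every transparent bounce, which appears to align the shadow path's sample dimensions with the surface bounce that spawned it, independent of how many transparent hits were passed through. A small sketch of that arithmetic (the constant's value here is a stand-in, not the real one):

    #include <cstdint>

    /* Stand-in for the real PRNG_BOUNCE_NUM constant; value is illustrative. */
    static const uint32_t kPrngBounceNum = 3;

    /* The main path advances rng_offset by kPrngBounceNum per bounce,
     * including transparent ones; rewinding undoes the transparent steps. */
    static uint32_t ao_shadow_rng_offset(const uint32_t rng_offset,
                                         const uint32_t transparent_bounce)
    {
      return rng_offset - kPrngBounceNum * transparent_bounce;
    }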
diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h
index 13a5e7bda05..1dd701237a8 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_volume.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h
@@ -776,7 +776,8 @@ ccl_device_forceinline void integrate_volume_direct_light(
   const bool is_light = light_sample_is_light(ls);
 
   /* Branch off shadow kernel. */
-  INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+  INTEGRATOR_SHADOW_PATH_INIT(
+      shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow);
 
   /* Write shadow ray and associated state to global memory. */
   integrator_state_write_shadow_ray(kg, shadow_state, &ray);
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h
index 4f21ab35d1f..84efcb00349 100644
--- a/intern/cycles/kernel/integrator/integrator_state.h
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -92,6 +92,7 @@ typedef struct IntegratorStateCPU {
 #undef KERNEL_STRUCT_VOLUME_STACK_SIZE
 
   IntegratorShadowStateCPU shadow;
+  IntegratorShadowStateCPU ao;
 } IntegratorStateCPU;
 
 /* Path Queue
diff --git a/intern/cycles/kernel/integrator/integrator_state_flow.h b/intern/cycles/kernel/integrator/integrator_state_flow.h
index df8fb5e0e46..1569bf68e24 100644
--- a/intern/cycles/kernel/integrator/integrator_state_flow.h
+++ b/intern/cycles/kernel/integrator/integrator_state_flow.h
@@ -63,7 +63,7 @@ CCL_NAMESPACE_BEGIN
         &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
     INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
 
-#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel) \
+#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
     IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32( \
         &kernel_integrator_state.next_shadow_path_index[0], 1); \
     atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
@@ -129,8 +129,8 @@ CCL_NAMESPACE_BEGIN
     (void)current_kernel; \
   }
 
-#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel) \
-    IntegratorShadowState shadow_state = &state->shadow; \
+#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
+    IntegratorShadowState shadow_state = &state->shadow_type; \
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
 
 #  define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
   { \
diff --git a/intern/cycles/kernel/integrator/integrator_subsurface.h b/intern/cycles/kernel/integrator/integrator_subsurface.h
index e9517a82453..e3bf9db80f7 100644
--- a/intern/cycles/kernel/integrator/integrator_subsurface.h
+++ b/intern/cycles/kernel/integrator/integrator_subsurface.h
@@ -182,7 +182,7 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat
   const int shader = intersection_get_shader(kg, &ss_isect.hits[0]);
   const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
 
-  if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+  if (shader_flags & SD_HAS_RAYTRACE) {
     INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
                                 DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
                                 shader);
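The extra shadow_type macro argument only matters on the CPU code path, where it is pasted into the member access to pick either the existing shadow state or the new ao state within IntegratorStateCPU. A simplified sketch of the token pasting (types and names reduced for illustration; the real ones live in integrator_state.h):

    /* Reduced stand-ins for IntegratorShadowStateCPU / IntegratorStateCPU. */
    struct ShadowStateSketch {
      int queued_kernel;
    };
    struct StateSketch {
      ShadowStateSketch shadow;
      ShadowStateSketch ao;
    };

    /* shadow_type is spliced into the member access, selecting state->shadow
     * or state->ao at expansion time. */
    #define SHADOW_PATH_INIT_SKETCH(shadow_state, state, next_kernel, shadow_type) \
      ShadowStateSketch *shadow_state = &(state)->shadow_type; \
      shadow_state->queued_kernel = (next_kernel);

    static void example(StateSketch *state)
    {
      SHADOW_PATH_INIT_SKETCH(ao_state, state, 1, ao); /* Expands to &state->ao. */
      (void)ao_state;
    }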
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 848aaa18aae..54492bef974 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -408,6 +408,13 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,
   const uint32_t path_flag = INTEGRATOR_STATE(state, shadow_path, flag);
   const int sample = INTEGRATOR_STATE(state, shadow_path, sample);
 
+  /* Ambient occlusion. */
+  if (path_flag & PATH_RAY_SHADOW_FOR_AO) {
+    kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, contribution);
+    return;
+  }
+
+  /* Direct light shadow. */
   kernel_accum_combined_pass(kg, path_flag, sample, contribution, buffer);
 
 #ifdef __PASSES__
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index fa8de14916e..428eb3498f7 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -28,6 +28,7 @@ ccl_device_inline void path_state_init_queues(IntegratorState state)
   INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
 #ifdef __KERNEL_CPU__
   INTEGRATOR_STATE_WRITE(&state->shadow, shadow_path, queued_kernel) = 0;
+  INTEGRATOR_STATE_WRITE(&state->ao, shadow_path, queued_kernel) = 0;
 #endif
 }
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index fa8453b99cb..4bdd8185ca6 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -285,21 +285,22 @@ enum PathRayFlag {
   PATH_RAY_VOLUME_PASS = (1U << 26U),
   PATH_RAY_ANY_PASS = (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS | PATH_RAY_VOLUME_PASS),
 
-  /* Shadow ray is for a light or surface. */
+  /* Shadow ray is for a light or surface, or AO. */
   PATH_RAY_SHADOW_FOR_LIGHT = (1U << 27U),
+  PATH_RAY_SHADOW_FOR_AO = (1U << 28U),
 
   /* A shadow catcher object was hit and the path was split into two. */
-  PATH_RAY_SHADOW_CATCHER_HIT = (1U << 28U),
+  PATH_RAY_SHADOW_CATCHER_HIT = (1U << 29U),
 
   /* A shadow catcher object was hit and this path traces only shadow catchers, writing them into
    * their dedicated pass for later division.
    *
    * NOTE: Is not covered with `PATH_RAY_ANY_PASS` because shadow catcher does special handling
    * which is separate from the light passes. */
-  PATH_RAY_SHADOW_CATCHER_PASS = (1U << 29U),
+  PATH_RAY_SHADOW_CATCHER_PASS = (1U << 30U),
 
   /* Path is evaluating background for an approximate shadow catcher with non-transparent film. */
-  PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 30U),
+  PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 31U),
 };
 
 /* Configure ray visibility bits for rays and objects respectively,
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 48f87ea3bf7..381f794545a 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -677,10 +677,6 @@ uint Film::get_kernel_features(const Scene *scene) const
         kernel_features |= KERNEL_FEATURE_SHADOW_PASS;
       }
     }
-
-    if (pass_type == PASS_AO) {
-      kernel_features |= KERNEL_FEATURE_NODE_RAYTRACE;
-    }
   }
 
   return kernel_features;