git.blender.org/blender.git
 intern/cycles/integrator/path_trace_work_gpu.cpp               | 58
 intern/cycles/integrator/path_trace_work_gpu.h                 |  8
 intern/cycles/kernel/device/gpu/kernel.h                       | 22
 intern/cycles/kernel/device/gpu/parallel_prefix_sum.h          |  8
 intern/cycles/kernel/device/gpu/parallel_sorted_index.h        | 12
 intern/cycles/kernel/integrator/integrator_init_from_bake.h    |  2
 intern/cycles/kernel/integrator/integrator_intersect_closest.h |  3
 intern/cycles/kernel/integrator/integrator_megakernel.h        | 40
 intern/cycles/kernel/integrator/integrator_shade_background.h  |  2
 intern/cycles/kernel/integrator/integrator_shade_surface.h     | 87
 intern/cycles/kernel/integrator/integrator_shade_volume.h      |  3
 intern/cycles/kernel/integrator/integrator_state.h             |  1
 intern/cycles/kernel/integrator/integrator_state_flow.h        |  6
 intern/cycles/kernel/integrator/integrator_subsurface.h        |  2
 intern/cycles/kernel/kernel_accumulate.h                       |  7
 intern/cycles/kernel/kernel_path_state.h                       |  1
 intern/cycles/kernel/kernel_types.h                            |  9
 intern/cycles/render/film.cpp                                  |  4
 18 files changed, 178 insertions(+), 97 deletions(-)
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 18aa5dda70d..a4788c437a1 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -78,6 +78,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
integrator_shader_raytrace_sort_counter_(
device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+ integrator_shader_sort_prefix_sum_(
+ device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
integrator_next_shadow_path_index_(
device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
integrator_next_shadow_catcher_path_index_(
@@ -200,6 +202,9 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
integrator_shader_raytrace_sort_counter_.zero_to_device();
+ integrator_shader_sort_prefix_sum_.alloc(max_shaders);
+ integrator_shader_sort_prefix_sum_.zero_to_device();
+
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
@@ -374,9 +379,12 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
/* For kernels that add shadow paths, check if there is enough space available.
* If not, schedule shadow kernels first to clear out the shadow paths. */
+ int num_paths_limit = INT_MAX;
+
if (kernel_creates_shadow_paths(kernel)) {
- if (max_num_paths_ - integrator_next_shadow_path_index_.data()[0] <
- queue_counter->num_queued[kernel]) {
+ const int available_shadow_paths = max_num_paths_ -
+ integrator_next_shadow_path_index_.data()[0];
+ if (available_shadow_paths < queue_counter->num_queued[kernel]) {
if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
return true;
@@ -386,10 +394,14 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
return true;
}
}
+ else if (kernel_creates_ao_paths(kernel)) {
+ /* AO kernel creates two shadow paths, so limit number of states to schedule. */
+ num_paths_limit = available_shadow_paths / 2;
+ }
}
/* Schedule kernel with maximum number of queued items. */
- enqueue_path_iteration(kernel);
+ enqueue_path_iteration(kernel, num_paths_limit);
/* Update next shadow path index for kernels that can add shadow paths. */
if (kernel_creates_shadow_paths(kernel)) {
@@ -399,7 +411,7 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
return true;
}
-void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
+void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit)
{
void *d_path_index = (void *)NULL;
@@ -414,7 +426,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
work_size = num_queued;
d_path_index = (void *)queued_paths_.device_pointer;
- compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel);
+ compute_sorted_queued_paths(
+ DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
}
else if (num_queued < work_size) {
work_size = num_queued;
@@ -430,6 +443,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
}
}
+ work_size = min(work_size, num_paths_limit);
+
DCHECK_LE(work_size, max_num_paths_);
switch (kernel) {
@@ -464,17 +479,20 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
}
}
-void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
+ DeviceKernel queued_kernel,
+ const int num_paths_limit)
{
int d_queued_kernel = queued_kernel;
void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
- assert(d_counter != nullptr);
+ void *d_prefix_sum = (void *)integrator_shader_sort_prefix_sum_.device_pointer;
+ assert(d_counter != nullptr && d_prefix_sum != nullptr);
/* Compute prefix sum of number of active paths with each shader. */
{
const int work_size = 1;
int max_shaders = device_scene_->data.max_shaders;
- void *args[] = {&d_counter, &max_shaders};
+ void *args[] = {&d_counter, &d_prefix_sum, &max_shaders};
queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
}
@@ -483,29 +501,24 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKe
/* Launch kernel to fill the active paths arrays. */
{
/* TODO: this could be smaller for terminated paths based on amount of work we want
- * to schedule. */
+ * to schedule, and also based on num_paths_limit.
+ *
+ * Also, when the number of paths is limited it may be better to prefer paths from the
+ * end of the array since compaction would need to do less work. */
const int work_size = kernel_max_active_path_index(queued_kernel);
void *d_queued_paths = (void *)queued_paths_.device_pointer;
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
void *args[] = {const_cast<int *>(&work_size),
+ const_cast<int *>(&num_paths_limit),
&d_queued_paths,
&d_num_queued_paths,
&d_counter,
+ &d_prefix_sum,
&d_queued_kernel};
queue_->enqueue(kernel, work_size, args);
}
-
- if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
- queue_->zero_to_device(integrator_shader_sort_counter_);
- }
- else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
- queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
- }
- else {
- assert(0);
- }
}
void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
@@ -1026,6 +1039,13 @@ bool PathTraceWorkGPU::kernel_creates_shadow_paths(DeviceKernel kernel)
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
}
+bool PathTraceWorkGPU::kernel_creates_ao_paths(DeviceKernel kernel)
+{
+ return (device_scene_->data.film.pass_ao != PASS_UNUSED) &&
+ (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
+}
+
bool PathTraceWorkGPU::kernel_is_shadow_path(DeviceKernel kernel)
{
return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
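A note on the scheduling logic in this file: kernels that can branch off shadow paths must not enqueue more states than there are free shadow-path slots, and when the AO pass is active a single surface-shade state may spawn two shadow paths (one for direct light, one for AO), hence the division by two. A minimal host-side sketch of that decision, with hypothetical parameter names standing in for the member variables used above:

#include <climits>

/* Sketch: how many states may safely be scheduled for a kernel that can
 * branch off shadow paths. Returns 0 to signal "flush shadow kernels first". */
int compute_num_paths_limit(const int max_num_paths,
                            const int next_shadow_path_index,
                            const int num_queued,
                            const bool creates_ao_paths)
{
  const int available_shadow_paths = max_num_paths - next_shadow_path_index;
  if (available_shadow_paths < num_queued) {
    return 0; /* Not enough room; run intersect/shade shadow kernels first. */
  }
  if (creates_ao_paths) {
    /* Each state may add both a light shadow path and an AO shadow path. */
    return available_shadow_paths / 2;
  }
  return INT_MAX; /* No limit needed. */
}
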
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index dd2c1c197ae..e1f6c09d334 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -79,10 +79,12 @@ class PathTraceWorkGPU : public PathTraceWork {
const int num_predicted_splits);
bool enqueue_path_iteration();
- void enqueue_path_iteration(DeviceKernel kernel);
+ void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);
void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
- void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+ void compute_sorted_queued_paths(DeviceKernel kernel,
+ DeviceKernel queued_kernel,
+ const int num_paths_limit);
void compact_states(const int num_active_paths);
@@ -116,6 +118,7 @@ class PathTraceWorkGPU : public PathTraceWork {
/* Kernel properties. */
bool kernel_uses_sorting(DeviceKernel kernel);
bool kernel_creates_shadow_paths(DeviceKernel kernel);
+ bool kernel_creates_ao_paths(DeviceKernel kernel);
bool kernel_is_shadow_path(DeviceKernel kernel);
int kernel_max_active_path_index(DeviceKernel kernel);
@@ -136,6 +139,7 @@ class PathTraceWorkGPU : public PathTraceWork {
/* Shader sorting. */
device_vector<int> integrator_shader_sort_counter_;
device_vector<int> integrator_shader_raytrace_sort_counter_;
+ device_vector<int> integrator_shader_sort_prefix_sum_;
/* Path split. */
device_vector<int> integrator_next_shadow_path_index_;
device_vector<int> integrator_next_shadow_catcher_path_index_;
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index b6df74e835a..fcb398f7e6d 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -282,11 +282,22 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
}
extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_sorted_paths_array(
- int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel)
+ kernel_gpu_integrator_sorted_paths_array(int num_states,
+ int num_states_limit,
+ int *indices,
+ int *num_indices,
+ int *key_counter,
+ int *key_prefix_sum,
+ int kernel)
{
gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) {
+ num_states,
+ num_states_limit,
+ indices,
+ num_indices,
+ key_counter,
+ key_prefix_sum,
+ [kernel](const int state) {
return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ?
INTEGRATOR_STATE(state, path, shader_sort_key) :
GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
@@ -322,9 +333,10 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
}
extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
- kernel_gpu_prefix_sum(int *values, int num_values)
+ kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
{
- gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values);
+ gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(
+ counter, prefix_sum, num_values);
}
/* --------------------------------------------------------------------
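The sorted-paths wrapper above supplies a key-extraction lambda: a state participates in the sort only if it is queued for the kernel being scheduled, while every other state returns the inactive sentinel and is skipped. A toy standalone illustration of that filtering (ToyState and the sentinel value are stand-ins, not the Cycles definitions):

struct ToyState {
  int queued_kernel;   /* Which kernel this path wants to run next. */
  int shader_sort_key; /* Bucket used to group coherent shader work. */
};

constexpr int TOY_INACTIVE_KEY = -1; /* Stands in for GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY. */

int sort_key_for(const ToyState &s, const int kernel)
{
  return (s.queued_kernel == kernel) ? s.shader_sort_key : TOY_INACTIVE_KEY;
}
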
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
index a1349e82efb..aabe6e2e27a 100644
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -33,7 +33,8 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
#endif
-template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
+template<uint blocksize>
+__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values)
{
if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
return;
@@ -41,8 +42,9 @@ template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, co
int offset = 0;
for (int i = 0; i < num_values; i++) {
- const int new_offset = offset + values[i];
- values[i] = offset;
+ const int new_offset = offset + counter[i];
+ prefix_sum[i] = offset;
+ counter[i] = 0;
offset = new_offset;
}
}
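The scan above stays serial on purpose: a single thread walks the per-shader buckets, which is cheap because the array length is max_shaders, not the number of paths. Zeroing counter[i] during the scan is what lets the sort kernel immediately reuse the buckets to count states deferred past the limit. A host-side sketch of the same exclusive prefix sum:

#include <vector>

/* Exclusive prefix sum over per-shader counters, resetting each counter to
 * zero as it goes (mirrors gpu_parallel_prefix_sum above). */
void prefix_sum_and_reset(std::vector<int> &counter, std::vector<int> &prefix_sum)
{
  int offset = 0;
  for (size_t i = 0; i < counter.size(); i++) {
    const int new_offset = offset + counter[i];
    prefix_sum[i] = offset; /* First output slot for bucket i. */
    counter[i] = 0;         /* Ready to re-count deferred states. */
    offset = new_offset;
  }
}

/* Example: counters {3, 0, 2} produce offsets {0, 3, 3}, so bucket 0 writes
 * indices 0..2 and bucket 2 writes indices 3..4. */
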
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
index 9bca1fad22f..7570c5a6bbd 100644
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -35,8 +35,10 @@ CCL_NAMESPACE_BEGIN
template<uint blocksize, typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+ const int num_states_limit,
int *indices,
int *num_indices,
+ int *key_counter,
int *key_prefix_sum,
GetKeyOp get_key_op)
{
@@ -46,7 +48,15 @@ __device__ void gpu_parallel_sorted_index_array(const uint num_states,
if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
- indices[index] = state_index;
+ if (index < num_states_limit) {
+ /* Assign state index. */
+ indices[index] = state_index;
+ }
+ else {
+ /* Can't process this state now, increase the counter again so that
+ * it will be handled in another iteration. */
+ atomic_fetch_and_add_uint32(&key_counter[key], 1);
+ }
}
}
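A host-side analog of the fill above, using std::atomic where the kernel uses device atomics: each active state claims the next slot in its key's bucket via fetch-add on the prefix sum, and a claim that lands past num_states_limit is dropped for this pass while the key's counter is bumped back up so a later iteration schedules the state. Names mirror the kernel parameters; the serial loop stands in for one GPU thread per state:

#include <atomic>
#include <vector>

void sorted_index_fill(const int num_states_limit,
                       std::vector<int> &indices,
                       std::vector<std::atomic<int>> &key_counter,
                       std::vector<std::atomic<int>> &key_prefix_sum,
                       const std::vector<int> &keys /* -1 means inactive */)
{
  for (int state_index = 0; state_index < (int)keys.size(); state_index++) {
    const int key = keys[state_index];
    if (key < 0) {
      continue; /* State not queued for this kernel. */
    }
    const int index = key_prefix_sum[key].fetch_add(1);
    if (index < num_states_limit) {
      indices[index] = state_index; /* Slot claimed for this pass. */
    }
    else {
      /* Over the limit: re-count so another iteration handles it. */
      key_counter[key].fetch_add(1);
    }
  }
}
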
diff --git a/intern/cycles/kernel/integrator/integrator_init_from_bake.h b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
index 9bc115150ff..de916be24e7 100644
--- a/intern/cycles/kernel/integrator/integrator_init_from_bake.h
+++ b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
@@ -185,7 +185,7 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
/* Setup next kernel to execute. */
const int shader_index = shader & SHADER_MASK;
const int shader_flags = kernel_tex_fetch(__shaders, shader_index).flags;
- if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ if (shader_flags & SD_HAS_RAYTRACE) {
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index);
}
else {
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_closest.h b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
index ef8dcb50115..c1315d48694 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_closest.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
@@ -111,8 +111,7 @@ ccl_device_forceinline void integrator_intersect_shader_next_kernel(
* Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for
* the matte path. */
- const bool use_raytrace_kernel = ((shader_flags & SD_HAS_RAYTRACE) ||
- (kernel_data.film.pass_ao != PASS_UNUSED));
+ const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);
if (use_raytrace_kernel) {
INTEGRATOR_PATH_NEXT_SORTED(
diff --git a/intern/cycles/kernel/integrator/integrator_megakernel.h b/intern/cycles/kernel/integrator/integrator_megakernel.h
index 6e3220aa3b7..21a483a792b 100644
--- a/intern/cycles/kernel/integrator/integrator_megakernel.h
+++ b/intern/cycles/kernel/integrator/integrator_megakernel.h
@@ -34,16 +34,12 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
ccl_global float *ccl_restrict render_buffer)
{
/* Each kernel indicates the next kernel to execute, so here we simply
- * have to check what that kernel is and execute it.
- *
- * TODO: investigate if we can use device side enqueue for GPUs to avoid
- * having to compile this big kernel. */
+ * have to check what that kernel is and execute it. */
while (true) {
+ /* Handle any shadow paths before we potentially create more shadow paths. */
const uint32_t shadow_queued_kernel = INTEGRATOR_STATE(
&state->shadow, shadow_path, queued_kernel);
-
if (shadow_queued_kernel) {
- /* First handle any shadow paths before we potentially create more shadow paths. */
switch (shadow_queued_kernel) {
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
integrator_intersect_shadow(kg, &state->shadow);
@@ -55,10 +51,30 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
kernel_assert(0);
break;
}
+ continue;
+ }
+
+ /* Handle any AO paths before we potentially create more AO paths. */
+ const uint32_t ao_queued_kernel = INTEGRATOR_STATE(&state->ao, shadow_path, queued_kernel);
+ if (ao_queued_kernel) {
+ switch (ao_queued_kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ integrator_intersect_shadow(kg, &state->ao);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ integrator_shade_shadow(kg, &state->ao, render_buffer);
+ break;
+ default:
+ kernel_assert(0);
+ break;
+ }
+ continue;
}
- else if (INTEGRATOR_STATE(state, path, queued_kernel)) {
- /* Then handle regular path kernels. */
- switch (INTEGRATOR_STATE(state, path, queued_kernel)) {
+
+ /* Then handle regular path kernels. */
+ const uint32_t queued_kernel = INTEGRATOR_STATE(state, path, queued_kernel);
+ if (queued_kernel) {
+ switch (queued_kernel) {
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
integrator_intersect_closest(kg, state);
break;
@@ -87,10 +103,10 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
kernel_assert(0);
break;
}
+ continue;
}
- else {
- break;
- }
+
+ break;
}
}
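The restructured loop reduces to one pattern: poll the queues in priority order (shadow, then AO, then the main path), run whatever is queued, and restart the scan; the megakernel exits only when all three queues are empty. In outline, with stand-in functions replacing the INTEGRATOR_STATE reads and the per-kernel dispatch switches:

#include <cstdint>

uint32_t shadow_queued_kernel(); /* Hypothetical stand-ins for the queued- */
uint32_t ao_queued_kernel();     /* kernel queries and kernel dispatch in  */
uint32_t main_queued_kernel();   /* the real megakernel above.             */
void run_kernel(uint32_t kernel);

void megakernel_outline()
{
  while (true) {
    if (const uint32_t k = shadow_queued_kernel()) {
      run_kernel(k);
      continue; /* Drain shadow work before creating more of it. */
    }
    if (const uint32_t k = ao_queued_kernel()) {
      run_kernel(k);
      continue; /* Likewise for AO shadow paths. */
    }
    if (const uint32_t k = main_queued_kernel()) {
      run_kernel(k);
      continue;
    }
    break; /* All queues empty: the path has terminated. */
  }
}
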
diff --git a/intern/cycles/kernel/integrator/integrator_shade_background.h b/intern/cycles/kernel/integrator/integrator_shade_background.h
index d98e53e6bbf..287c54d7243 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_background.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_background.h
@@ -198,7 +198,7 @@ ccl_device void integrator_shade_background(KernelGlobals kg,
const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim, isect_type);
const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
- if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ if (shader_flags & SD_HAS_RAYTRACE) {
INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
shader);
diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h
index 0108ba1373c..2fb0dcc2097 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_surface.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h
@@ -168,7 +168,8 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
const bool is_light = light_sample_is_light(&ls);
/* Branch off shadow kernel. */
- INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ INTEGRATOR_SHADOW_PATH_INIT(
+ shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow);
/* Copy volume stack and enter/exit volume. */
integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
@@ -324,26 +325,14 @@ ccl_device_forceinline bool integrate_surface_volume_only_bounce(IntegratorState
}
#endif
-#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+#if defined(__AO__)
ccl_device_forceinline void integrate_surface_ao_pass(
KernelGlobals kg,
- ConstIntegratorState state,
- ccl_private const ShaderData *ccl_restrict sd,
- ccl_private const RNGState *ccl_restrict rng_state,
- ccl_global float *ccl_restrict render_buffer)
-{
-# ifdef __KERNEL_OPTIX__
- optixDirectCall<void>(2, kg, state, sd, rng_state, render_buffer);
-}
-
-extern "C" __device__ void __direct_callable__ao_pass(
- KernelGlobals kg,
- ConstIntegratorState state,
+ IntegratorState state,
ccl_private const ShaderData *ccl_restrict sd,
ccl_private const RNGState *ccl_restrict rng_state,
ccl_global float *ccl_restrict render_buffer)
{
-# endif /* __KERNEL_OPTIX__ */
float bsdf_u, bsdf_v;
path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
@@ -352,24 +341,48 @@ extern "C" __device__ void __direct_callable__ao_pass(
float ao_pdf;
sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
- if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray ray ccl_optional_struct_init;
- ray.P = ray_offset(sd->P, sd->Ng);
- ray.D = ao_D;
- ray.t = kernel_data.integrator.ao_bounces_distance;
- ray.time = sd->time;
- ray.dP = differential_zero_compact();
- ray.dD = differential_zero_compact();
-
- Intersection isect ccl_optional_struct_init;
- if (!scene_intersect(kg, &ray, PATH_RAY_SHADOW_OPAQUE, &isect)) {
- ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
- const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
- kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, throughput);
- }
+ if (!(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f)) {
+ return;
}
+
+ Ray ray ccl_optional_struct_init;
+ ray.P = ray_offset(sd->P, sd->Ng);
+ ray.D = ao_D;
+ ray.t = kernel_data.integrator.ao_bounces_distance;
+ ray.time = sd->time;
+ ray.dP = differential_zero_compact();
+ ray.dD = differential_zero_compact();
+
+ /* Branch off shadow kernel. */
+ INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, ao);
+
+ /* Copy volume stack and enter/exit volume. */
+ integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
+
+ /* Write shadow ray and associated state to global memory. */
+ integrator_state_write_shadow_ray(kg, shadow_state, &ray);
+
+ /* Copy state from main path to shadow path. */
+ const uint16_t bounce = INTEGRATOR_STATE(state, path, bounce);
+ const uint16_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
+ uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag) | PATH_RAY_SHADOW_FOR_AO;
+ const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * shader_bsdf_alpha(kg, sd);
+
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
+ state, path, render_pixel_index);
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_offset) =
+ INTEGRATOR_STATE(state, path, rng_offset) -
+ PRNG_BOUNCE_NUM * INTEGRATOR_STATE(state, path, transparent_bounce);
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_hash) = INTEGRATOR_STATE(
+ state, path, rng_hash);
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, sample) = INTEGRATOR_STATE(
+ state, path, sample);
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, flag) = shadow_flag;
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, bounce) = bounce;
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transparent_bounce) = transparent_bounce;
+ INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, throughput) = throughput;
}
-#endif /* defined(__AO__) && defined(__SHADER_RAYTRACE__) */
+#endif /* defined(__AO__) */
template<uint node_feature_mask>
ccl_device bool integrate_surface(KernelGlobals kg,
@@ -474,14 +487,12 @@ ccl_device bool integrate_surface(KernelGlobals kg,
PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT);
integrate_surface_direct_light(kg, state, &sd, &rng_state);
-#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+#if defined(__AO__)
/* Ambient occlusion pass. */
- if (node_feature_mask & KERNEL_FEATURE_NODE_RAYTRACE) {
- if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
- (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_CAMERA)) {
- PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
- integrate_surface_ao_pass(kg, state, &sd, &rng_state, render_buffer);
- }
+ if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
+ (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_CAMERA)) {
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
+ integrate_surface_ao_pass(kg, state, &sd, &rng_state, render_buffer);
}
#endif
diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h
index 13a5e7bda05..1dd701237a8 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_volume.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h
@@ -776,7 +776,8 @@ ccl_device_forceinline void integrate_volume_direct_light(
const bool is_light = light_sample_is_light(ls);
/* Branch off shadow kernel. */
- INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ INTEGRATOR_SHADOW_PATH_INIT(
+ shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow);
/* Write shadow ray and associated state to global memory. */
integrator_state_write_shadow_ray(kg, shadow_state, &ray);
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h
index 4f21ab35d1f..84efcb00349 100644
--- a/intern/cycles/kernel/integrator/integrator_state.h
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -92,6 +92,7 @@ typedef struct IntegratorStateCPU {
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
IntegratorShadowStateCPU shadow;
+ IntegratorShadowStateCPU ao;
} IntegratorStateCPU;
/* Path Queue
diff --git a/intern/cycles/kernel/integrator/integrator_state_flow.h b/intern/cycles/kernel/integrator/integrator_state_flow.h
index df8fb5e0e46..1569bf68e24 100644
--- a/intern/cycles/kernel/integrator/integrator_state_flow.h
+++ b/intern/cycles/kernel/integrator/integrator_state_flow.h
@@ -63,7 +63,7 @@ CCL_NAMESPACE_BEGIN
&kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
-# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel) \
+# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32( \
&kernel_integrator_state.next_shadow_path_index[0], 1); \
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
@@ -129,8 +129,8 @@ CCL_NAMESPACE_BEGIN
(void)current_kernel; \
}
-# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel) \
- IntegratorShadowState shadow_state = &state->shadow; \
+# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
+ IntegratorShadowState shadow_state = &state->shadow_type; \
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
{ \
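The new shadow_type argument is spliced directly into the member access, so on the CPU the one macro can target either the shadow or the ao sub-state. A toy mirror of that expansion (simplified types; the real GPU variant instead allocates a state index and updates queue counters):

#include <cstdint>

struct ToyShadowState { uint32_t queued_kernel; };
struct ToyState { ToyShadowState shadow; ToyShadowState ao; };

#define TOY_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
  ToyShadowState *shadow_state = &(state)->shadow_type; \
  shadow_state->queued_kernel = (next_kernel);

void example(ToyState *state)
{
  TOY_SHADOW_PATH_INIT(light_state, state, 1u, shadow); /* Uses &state->shadow. */
  TOY_SHADOW_PATH_INIT(ao_state, state, 2u, ao);        /* Uses &state->ao. */
  (void)light_state;
  (void)ao_state;
}
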
diff --git a/intern/cycles/kernel/integrator/integrator_subsurface.h b/intern/cycles/kernel/integrator/integrator_subsurface.h
index e9517a82453..e3bf9db80f7 100644
--- a/intern/cycles/kernel/integrator/integrator_subsurface.h
+++ b/intern/cycles/kernel/integrator/integrator_subsurface.h
@@ -182,7 +182,7 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat
const int shader = intersection_get_shader(kg, &ss_isect.hits[0]);
const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
- if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ if (shader_flags & SD_HAS_RAYTRACE) {
INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
shader);
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 848aaa18aae..54492bef974 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -408,6 +408,13 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,
const uint32_t path_flag = INTEGRATOR_STATE(state, shadow_path, flag);
const int sample = INTEGRATOR_STATE(state, shadow_path, sample);
+ /* Ambient occlusion. */
+ if (path_flag & PATH_RAY_SHADOW_FOR_AO) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, contribution);
+ return;
+ }
+
+ /* Direct light shadow. */
kernel_accum_combined_pass(kg, path_flag, sample, contribution, buffer);
#ifdef __PASSES__
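With the new flag the split happens at accumulation time: an AO shadow ray writes only to the AO pass and returns before the combined-pass accumulation that direct-light shadows go through. A toy version of that routing (buffer layout and names are stand-ins for the render-buffer code above):

#include <cstdint>

constexpr uint32_t TOY_SHADOW_FOR_AO = 1u << 28u; /* Matches PATH_RAY_SHADOW_FOR_AO below. */

void toy_accum_light(const uint32_t path_flag,
                     float *ao_pass,
                     float *combined_pass,
                     const float contribution)
{
  if (path_flag & TOY_SHADOW_FOR_AO) {
    *ao_pass += contribution; /* Ambient occlusion: AO pass only. */
    return;
  }
  *combined_pass += contribution; /* Direct light shadow. */
}
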
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index fa8de14916e..428eb3498f7 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -28,6 +28,7 @@ ccl_device_inline void path_state_init_queues(IntegratorState state)
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
#ifdef __KERNEL_CPU__
INTEGRATOR_STATE_WRITE(&state->shadow, shadow_path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(&state->ao, shadow_path, queued_kernel) = 0;
#endif
}
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index fa8453b99cb..4bdd8185ca6 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -285,21 +285,22 @@ enum PathRayFlag {
PATH_RAY_VOLUME_PASS = (1U << 26U),
PATH_RAY_ANY_PASS = (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS | PATH_RAY_VOLUME_PASS),
- /* Shadow ray is for a light or surface. */
+ /* Shadow ray is for a light or surface, or AO. */
PATH_RAY_SHADOW_FOR_LIGHT = (1U << 27U),
+ PATH_RAY_SHADOW_FOR_AO = (1U << 28U),
/* A shadow catcher object was hit and the path was split into two. */
- PATH_RAY_SHADOW_CATCHER_HIT = (1U << 28U),
+ PATH_RAY_SHADOW_CATCHER_HIT = (1U << 29U),
/* A shadow catcher object was hit and this path traces only shadow catchers, writing them into
* their dedicated pass for later division.
*
* NOTE: Is not covered with `PATH_RAY_ANY_PASS` because shadow catcher does special handling
* which is separate from the light passes. */
- PATH_RAY_SHADOW_CATCHER_PASS = (1U << 29U),
+ PATH_RAY_SHADOW_CATCHER_PASS = (1U << 30U),
/* Path is evaluating background for an approximate shadow catcher with non-transparent film. */
- PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 30U),
+ PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 31U),
};
/* Configure ray visibility bits for rays and objects respectively,
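Worth noting: with PATH_RAY_SHADOW_FOR_AO at bit 28, the three shadow-catcher flags each shift up one position and the topmost now sits at bit 31, exhausting the 32-bit path flag word. A compile-time sanity check of that layout (a sketch; values copied from the enum above):

#include <cstdint>

constexpr uint32_t SHADOW_FOR_LIGHT = 1u << 27u;
constexpr uint32_t SHADOW_FOR_AO = 1u << 28u;
constexpr uint32_t SHADOW_CATCHER_HIT = 1u << 29u;
constexpr uint32_t SHADOW_CATCHER_PASS = 1u << 30u;
constexpr uint32_t SHADOW_CATCHER_BACKGROUND = 1u << 31u;

static_assert((SHADOW_FOR_AO & (SHADOW_FOR_LIGHT | SHADOW_CATCHER_HIT)) == 0,
              "flags must occupy distinct bits");
static_assert((SHADOW_CATCHER_HIT | SHADOW_CATCHER_PASS | SHADOW_CATCHER_BACKGROUND) ==
                  0xE0000000u,
              "shadow catcher flags occupy bits 29..31; bit 31 is the last free bit");
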
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 48f87ea3bf7..381f794545a 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -677,10 +677,6 @@ uint Film::get_kernel_features(const Scene *scene) const
kernel_features |= KERNEL_FEATURE_SHADOW_PASS;
}
}
-
- if (pass_type == PASS_AO) {
- kernel_features |= KERNEL_FEATURE_NODE_RAYTRACE;
- }
}
return kernel_features;