git.blender.org/blender.git
author     Phil Gosch <phil@saphirestudio.at>  2017-09-29 23:52:29 +0300
committer  Phil Gosch <phil@saphirestudio.at>  2017-09-29 23:52:29 +0300
commit     2b1ec9c18fadf6b5cd71a7d4122776f5d4b8ac33 (patch)
tree       01b477f0ea027ab5908f332f6102db6a1d381b76 /intern/cycles/kernel
parent     5cdd4388ffaad5ade916dba3c18e2bc679e19747 (diff)
parent     e3546a5097b1b30a3c694ce0f6b98c2f7e70510c (diff)
Merge remote-tracking branch 'origin/master' into soc-2016-uv_tools
Diffstat (limited to 'intern/cycles/kernel')
-rw-r--r--  intern/cycles/kernel/CMakeLists.txt | 3
-rw-r--r--  intern/cycles/kernel/bvh/bvh.h | 12
-rw-r--r--  intern/cycles/kernel/bvh/bvh_nodes.h | 26
-rw-r--r--  intern/cycles/kernel/bvh/bvh_shadow_all.h | 71
-rw-r--r--  intern/cycles/kernel/bvh/bvh_traversal.h | 60
-rw-r--r--  intern/cycles/kernel/bvh/qbvh_nodes.h | 8
-rw-r--r--  intern/cycles/kernel/bvh/qbvh_shadow_all.h | 65
-rw-r--r--  intern/cycles/kernel/bvh/qbvh_traversal.h | 54
-rw-r--r--  intern/cycles/kernel/closure/bsdf_microfacet_multi.h | 23
-rw-r--r--  intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h | 25
-rw-r--r--  intern/cycles/kernel/closure/bssrdf.h | 5
-rw-r--r--  intern/cycles/kernel/filter/filter_features_sse.h | 80
-rw-r--r--  intern/cycles/kernel/filter/filter_nlm_cpu.h | 18
-rw-r--r--  intern/cycles/kernel/filter/filter_prefilter.h | 103
-rw-r--r--  intern/cycles/kernel/filter/filter_transform_sse.h | 16
-rw-r--r--  intern/cycles/kernel/geom/geom.h | 1
-rw-r--r--  intern/cycles/kernel/geom/geom_curve.h | 904
-rw-r--r--  intern/cycles/kernel/geom/geom_curve_intersect.h | 934
-rw-r--r--  intern/cycles/kernel/geom/geom_object.h | 5
-rw-r--r--  intern/cycles/kernel/geom/geom_volume.h | 8
-rw-r--r--  intern/cycles/kernel/kernel_accumulate.h | 345
-rw-r--r--  intern/cycles/kernel/kernel_bake.h | 65
-rw-r--r--  intern/cycles/kernel/kernel_compat_cuda.h | 10
-rw-r--r--  intern/cycles/kernel/kernel_compat_opencl.h | 5
-rw-r--r--  intern/cycles/kernel/kernel_debug.h | 56
-rw-r--r--  intern/cycles/kernel/kernel_emission.h | 10
-rw-r--r--  intern/cycles/kernel/kernel_globals.h | 68
-rw-r--r--  intern/cycles/kernel/kernel_image_opencl.h | 66
-rw-r--r--  intern/cycles/kernel/kernel_light.h | 302
-rw-r--r--  intern/cycles/kernel/kernel_passes.h | 65
-rw-r--r--  intern/cycles/kernel/kernel_path.h | 958
-rw-r--r--  intern/cycles/kernel/kernel_path_branched.h | 227
-rw-r--r--  intern/cycles/kernel/kernel_path_common.h | 8
-rw-r--r--  intern/cycles/kernel/kernel_path_state.h | 78
-rw-r--r--  intern/cycles/kernel/kernel_path_subsurface.h | 63
-rw-r--r--  intern/cycles/kernel/kernel_path_surface.h | 55
-rw-r--r--  intern/cycles/kernel/kernel_path_volume.h | 80
-rw-r--r--  intern/cycles/kernel/kernel_random.h | 311
-rw-r--r--  intern/cycles/kernel/kernel_shader.h | 232
-rw-r--r--  intern/cycles/kernel/kernel_shadow.h | 185
-rw-r--r--  intern/cycles/kernel/kernel_subsurface.h | 127
-rw-r--r--  intern/cycles/kernel/kernel_textures.h | 181
-rw-r--r--  intern/cycles/kernel/kernel_types.h | 171
-rw-r--r--  intern/cycles/kernel/kernel_volume.h | 32
-rw-r--r--  intern/cycles/kernel/kernel_work_stealing.h | 112
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_sse41.cpp | 1
-rw-r--r--  intern/cycles/kernel/kernels/cuda/kernel_config.h | 9
-rw-r--r--  intern/cycles/kernel/kernels/cuda/kernel_split.cu | 4
-rw-r--r--  intern/cycles/kernel/kernels/opencl/filter.cl | 2
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel.cl | 47
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_data_init.cl | 11
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_split.cl | 3
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_split_function.h | 9
-rw-r--r--  intern/cycles/kernel/osl/osl_globals.h | 2
-rw-r--r--  intern/cycles/kernel/osl/osl_services.cpp | 5
-rw-r--r--  intern/cycles/kernel/osl/osl_shader.cpp | 24
-rw-r--r--  intern/cycles/kernel/osl/osl_shader.h | 8
-rw-r--r--  intern/cycles/kernel/shaders/node_principled_bsdf.osl | 4
-rw-r--r--  intern/cycles/kernel/split/kernel_branched.h | 11
-rw-r--r--  intern/cycles/kernel/split/kernel_buffer_update.h | 74
-rw-r--r--  intern/cycles/kernel/split/kernel_data_init.h | 29
-rw-r--r--  intern/cycles/kernel/split/kernel_direct_lighting.h | 12
-rw-r--r--  intern/cycles/kernel/split/kernel_do_volume.h | 17
-rw-r--r--  intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h | 159
-rw-r--r--  intern/cycles/kernel/split/kernel_indirect_background.h | 33
-rw-r--r--  intern/cycles/kernel/split/kernel_indirect_subsurface.h | 1
-rw-r--r--  intern/cycles/kernel/split/kernel_lamp_emission.h | 23
-rw-r--r--  intern/cycles/kernel/split/kernel_next_iteration_setup.h | 5
-rw-r--r--  intern/cycles/kernel/split/kernel_path_init.h | 54
-rw-r--r--  intern/cycles/kernel/split/kernel_scene_intersect.h | 46
-rw-r--r--  intern/cycles/kernel/split/kernel_shader_eval.h | 28
-rw-r--r--  intern/cycles/kernel/split/kernel_shader_sort.h | 10
-rw-r--r--  intern/cycles/kernel/split/kernel_shadow_blocked_ao.h | 7
-rw-r--r--  intern/cycles/kernel/split/kernel_shadow_blocked_dl.h | 13
-rw-r--r--  intern/cycles/kernel/split/kernel_split_common.h | 2
-rw-r--r--  intern/cycles/kernel/split/kernel_split_data_types.h | 17
-rw-r--r--  intern/cycles/kernel/split/kernel_subsurface_scatter.h | 35
-rw-r--r--  intern/cycles/kernel/svm/svm_closure.h | 4
-rw-r--r--  intern/cycles/kernel/svm/svm_image.h | 190
79 files changed, 3405 insertions, 3727 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 23e9bd311c4..b4ca16bdb48 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -79,7 +79,6 @@ set(SRC_HEADERS
kernel_compat_cpu.h
kernel_compat_cuda.h
kernel_compat_opencl.h
- kernel_debug.h
kernel_differential.h
kernel_emission.h
kernel_film.h
@@ -202,6 +201,7 @@ set(SRC_GEOM_HEADERS
geom/geom.h
geom/geom_attribute.h
geom/geom_curve.h
+ geom/geom_curve_intersect.h
geom/geom_motion_curve.h
geom/geom_motion_triangle.h
geom/geom_motion_triangle_intersect.h
@@ -233,6 +233,7 @@ set(SRC_FILTER_HEADERS
set(SRC_UTIL_HEADERS
../util/util_atomic.h
../util/util_color.h
+ ../util/util_defines.h
../util/util_half.h
../util/util_hash.h
../util/util_math.h
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 85741016b25..cf0c8542d69 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -233,7 +233,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
- int skip_object,
+ uint visibility,
uint max_hits,
uint *num_hits)
{
@@ -244,7 +244,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
return bvh_intersect_shadow_all_hair_motion(kg,
ray,
isect,
- skip_object,
+ visibility,
max_hits,
num_hits);
}
@@ -253,7 +253,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
return bvh_intersect_shadow_all_motion(kg,
ray,
isect,
- skip_object,
+ visibility,
max_hits,
num_hits);
}
@@ -264,7 +264,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
return bvh_intersect_shadow_all_hair(kg,
ray,
isect,
- skip_object,
+ visibility,
max_hits,
num_hits);
}
@@ -275,7 +275,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
return bvh_intersect_shadow_all_instancing(kg,
ray,
isect,
- skip_object,
+ visibility,
max_hits,
num_hits);
}
@@ -284,7 +284,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
return bvh_intersect_shadow_all(kg,
ray,
isect,
- skip_object,
+ visibility,
max_hits,
num_hits);
}
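
The hunks above change scene_intersect_shadow_all() to filter by a visibility bitmask rather than skipping a single object index. A minimal standalone sketch of bitmask-based filtering, using hypothetical flag names rather than the Cycles PATH_RAY_* values:

#include <cstdint>
#include <cassert>

/* Hypothetical ray visibility bits, mirroring the idea of shadow/camera rays. */
enum : uint32_t {
	RAY_CAMERA = 1u << 0,
	RAY_SHADOW = 1u << 1,
};

/* A primitive is tested only if it shares at least one visibility bit with
 * the ray; this replaces "skip exactly this one object id". */
inline bool primitive_visible(uint32_t prim_visibility, uint32_t ray_visibility)
{
	return (prim_visibility & ray_visibility) != 0;
}

int main()
{
	assert(primitive_visible(RAY_CAMERA | RAY_SHADOW, RAY_SHADOW));  /* casts shadows */
	assert(!primitive_visible(RAY_CAMERA, RAY_SHADOW));              /* shadow-invisible */
	return 0;
}
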
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index 74a9ebf14e4..6c33dad5426 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -52,8 +52,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
float c0hiy = (node1.z - P.y) * idir.y;
float c0loz = (node2.x - P.z) * idir.z;
float c0hiz = (node2.z - P.z) * idir.z;
- float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
- float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+ float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
+ float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
float c1lox = (node0.y - P.x) * idir.x;
float c1hix = (node0.w - P.x) * idir.x;
@@ -61,8 +61,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
float c1hiy = (node1.w - P.y) * idir.y;
float c1loz = (node2.y - P.z) * idir.z;
float c1hiz = (node2.w - P.z) * idir.z;
- float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
- float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+ float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
+ float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
dist[0] = c0min;
dist[1] = c1min;
@@ -101,8 +101,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
float c0hiy = (node1.z - P.y) * idir.y;
float c0loz = (node2.x - P.z) * idir.z;
float c0hiz = (node2.z - P.z) * idir.z;
- float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
- float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+ float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
+ float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
float c1lox = (node0.y - P.x) * idir.x;
float c1hix = (node0.w - P.x) * idir.x;
@@ -110,8 +110,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
float c1hiy = (node1.w - P.y) * idir.y;
float c1loz = (node2.y - P.z) * idir.z;
float c1hiz = (node2.w - P.z) * idir.z;
- float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
- float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+ float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
+ float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
if(difl != 0.0f) {
float hdiff = 1.0f + difl;
@@ -483,8 +483,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
ssef tfar_y = max(lower_y, upper_y);
ssef tfar_z = max(lower_z, upper_z);
- const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
- const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+ const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
sseb vmask = tnear <= tfar;
dist[0] = tnear.f[0];
dist[1] = tnear.f[1];
@@ -545,8 +545,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
ssef tfar_y = max(lower_y, upper_y);
ssef tfar_z = max(lower_z, upper_z);
- const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
- const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+ const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
sseb vmask;
if(difl != 0.0f) {
const float round_down = 1.0f - difl;
@@ -615,7 +615,7 @@ ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
const float3& P,
const float3& dir,
const ssef& isect_near,
- const ssef& isect_far,
+ const ssef& isect_far,
const ssef& tsplat,
const ssef Psplat[3],
const ssef idirsplat[3],
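
The max4()/min4() reordering above only moves the scalar bound into the first argument; the computation itself is the usual ray/box slab test. A scalar sketch of that test, assuming the same semantics as the kernel helpers (not Cycles code):

#include <algorithm>
#include <cstdio>

/* Scalar slab test: the entry distance is the largest per-axis entry time
 * (clamped to the ray start, here 0), the exit distance is the smallest
 * per-axis exit time (clamped to the current closest hit t). The box
 * overlaps the ray segment iff tnear <= tfar. */
bool ray_box_hit(const float P[3], const float idir[3],
                 const float lo[3], const float hi[3], float t)
{
	float tnear = 0.0f, tfar = t;
	for(int axis = 0; axis < 3; axis++) {
		float t0 = (lo[axis] - P[axis]) * idir[axis];
		float t1 = (hi[axis] - P[axis]) * idir[axis];
		tnear = std::max(tnear, std::min(t0, t1));
		tfar = std::min(tfar, std::max(t0, t1));
	}
	return tnear <= tfar;
}

int main()
{
	const float P[3] = {0.0f, 0.0f, -5.0f}, idir[3] = {1e6f, 1e6f, 1.0f};
	const float lo[3] = {-1.0f, -1.0f, -1.0f}, hi[3] = {1.0f, 1.0f, 1.0f};
	std::printf("%d\n", ray_box_hit(P, idir, lo, hi, 100.0f)); /* prints 1 */
	return 0;
}
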
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index 267e098f912..a6a4353562c 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -45,7 +45,7 @@ ccl_device_inline
bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
- const int skip_object,
+ const uint visibility,
const uint max_hits,
uint *num_hits)
{
@@ -119,7 +119,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
idir,
isect_t,
node_addr,
- PATH_RAY_SHADOW,
+ visibility,
dist);
#else // __KERNEL_SSE2__
traverse_mask = NODE_INTERSECT(kg,
@@ -134,7 +134,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
idirsplat,
shufflexyz,
node_addr,
- PATH_RAY_SHADOW,
+ visibility,
dist);
#endif // __KERNEL_SSE2__
@@ -186,17 +186,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
/* primitive intersection */
while(prim_addr < prim_addr2) {
kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
-
-#ifdef __SHADOW_TRICKS__
- uint tri_object = (object == OBJECT_NONE)
- ? kernel_tex_fetch(__prim_object, prim_addr)
- : object;
- if(tri_object == skip_object) {
- ++prim_addr;
- continue;
- }
-#endif
-
bool hit;
/* todo: specialized intersect functions which don't fill in
@@ -209,7 +198,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
isect_array,
P,
dir,
- PATH_RAY_SHADOW,
+ visibility,
object,
prim_addr);
break;
@@ -221,7 +210,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
P,
dir,
ray->time,
- PATH_RAY_SHADOW,
+ visibility,
object,
prim_addr);
break;
@@ -232,30 +221,30 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
case PRIMITIVE_MOTION_CURVE: {
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
- hit = bvh_cardinal_curve_intersect(kg,
- isect_array,
- P,
- dir,
- PATH_RAY_SHADOW,
- object,
- prim_addr,
- ray->time,
- curve_type,
- NULL,
- 0, 0);
+ hit = cardinal_curve_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ NULL,
+ 0, 0);
}
else {
- hit = bvh_curve_intersect(kg,
- isect_array,
- P,
- dir,
- PATH_RAY_SHADOW,
- object,
- prim_addr,
- ray->time,
- curve_type,
- NULL,
- 0, 0);
+ hit = curve_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ NULL,
+ 0, 0);
}
break;
}
@@ -402,7 +391,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
- const int skip_object,
+ const uint visibility,
const uint max_hits,
uint *num_hits)
{
@@ -411,7 +400,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
ray,
isect_array,
- skip_object,
+ visibility,
max_hits,
num_hits);
}
@@ -422,7 +411,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
return BVH_FUNCTION_FULL_NAME(BVH)(kg,
ray,
isect_array,
- skip_object,
+ visibility,
max_hits,
num_hits);
}
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index c58d3b0316c..ae8f54821f2 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -244,14 +244,14 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
{
/* shadow ray early termination */
#if defined(__KERNEL_SSE2__)
- if(visibility == PATH_RAY_SHADOW_OPAQUE)
+ if(visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
# if BVH_FEATURE(BVH_HAIR)
tfar = ssef(isect->t);
# endif
#else
- if(visibility == PATH_RAY_SHADOW_OPAQUE)
+ if(visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
#endif
}
@@ -274,14 +274,14 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
{
/* shadow ray early termination */
# if defined(__KERNEL_SSE2__)
- if(visibility == PATH_RAY_SHADOW_OPAQUE)
+ if(visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
# if BVH_FEATURE(BVH_HAIR)
tfar = ssef(isect->t);
# endif
# else
- if(visibility == PATH_RAY_SHADOW_OPAQUE)
+ if(visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
# endif
}
@@ -298,44 +298,44 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
bool hit;
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
- hit = bvh_cardinal_curve_intersect(kg,
- isect,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- ray->time,
- curve_type,
- lcg_state,
- difl,
- extmax);
+ hit = cardinal_curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ lcg_state,
+ difl,
+ extmax);
}
else {
- hit = bvh_curve_intersect(kg,
- isect,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- ray->time,
- curve_type,
- lcg_state,
- difl,
- extmax);
+ hit = curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ lcg_state,
+ difl,
+ extmax);
}
if(hit) {
/* shadow ray early termination */
# if defined(__KERNEL_SSE2__)
- if(visibility == PATH_RAY_SHADOW_OPAQUE)
+ if(visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
# if BVH_FEATURE(BVH_HAIR)
tfar = ssef(isect->t);
# endif
# else
- if(visibility == PATH_RAY_SHADOW_OPAQUE)
+ if(visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
# endif
}
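
Replacing `visibility == PATH_RAY_SHADOW_OPAQUE` with a bit test means early termination also triggers when other visibility bits are set alongside the opaque-shadow bit. A small illustration with hypothetical flag values (not the real PATH_RAY_* constants):

#include <cstdio>

enum {
	FLAG_SHADOW        = 1 << 0,
	FLAG_SHADOW_OPAQUE = 1 << 1,
};

int main()
{
	unsigned visibility = FLAG_SHADOW | FLAG_SHADOW_OPAQUE;
	/* The exact comparison misses the combined mask, the bit test does not. */
	std::printf("== : %d\n", visibility == FLAG_SHADOW_OPAQUE);        /* 0 */
	std::printf("&  : %d\n", (visibility & FLAG_SHADOW_OPAQUE) != 0);  /* 1 */
	return 0;
}
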
diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h
index 6d22f0b0d6a..3036efd4198 100644
--- a/intern/cycles/kernel/bvh/qbvh_nodes.h
+++ b/intern/cycles/kernel/bvh/qbvh_nodes.h
@@ -126,8 +126,8 @@ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg
const sseb vmask = cast(tnear) > cast(tfar);
int mask = (int)movemask(vmask)^0xf;
#else
- const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
- const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+ const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
const sseb vmask = tnear <= tfar;
int mask = (int)movemask(vmask);
#endif
@@ -174,8 +174,8 @@ ccl_device_inline int qbvh_aligned_node_intersect_robust(
const float round_down = 1.0f - difl;
const float round_up = 1.0f + difl;
- const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
- const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+ const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
const sseb vmask = round_down*tnear <= round_up*tfar;
*dist = tnear;
return (int)movemask(vmask);
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
index ce474438f2c..522213f30ca 100644
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
@@ -33,7 +33,7 @@
ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
- const int skip_object,
+ const uint visibility,
const uint max_hits,
uint *num_hits)
{
@@ -107,7 +107,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(false
#ifdef __VISIBILITY_FLAG__
- || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0)
+ || ((__float_as_uint(inodes.x) & visibility) == 0)
#endif
#if BVH_FEATURE(BVH_MOTION)
|| UNLIKELY(ray->time < inodes.y)
@@ -244,7 +244,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
if(node_addr < 0) {
float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
#ifdef __VISIBILITY_FLAG__
- if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
+ if((__float_as_uint(leaf.z) & visibility) == 0) {
/* Pop. */
node_addr = traversal_stack[stack_ptr].addr;
--stack_ptr;
@@ -268,17 +268,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
/* Primitive intersection. */
while(prim_addr < prim_addr2) {
kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
-
-#ifdef __SHADOW_TRICKS__
- uint tri_object = (object == OBJECT_NONE)
- ? kernel_tex_fetch(__prim_object, prim_addr)
- : object;
- if(tri_object == skip_object) {
- ++prim_addr;
- continue;
- }
-#endif
-
bool hit;
/* todo: specialized intersect functions which don't fill in
@@ -291,7 +280,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
isect_array,
P,
dir,
- PATH_RAY_SHADOW,
+ visibility,
object,
prim_addr);
break;
@@ -303,7 +292,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
P,
dir,
ray->time,
- PATH_RAY_SHADOW,
+ visibility,
object,
prim_addr);
break;
@@ -314,30 +303,30 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
case PRIMITIVE_MOTION_CURVE: {
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
- hit = bvh_cardinal_curve_intersect(kg,
- isect_array,
- P,
- dir,
- PATH_RAY_SHADOW,
- object,
- prim_addr,
- ray->time,
- curve_type,
- NULL,
- 0, 0);
+ hit = cardinal_curve_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ NULL,
+ 0, 0);
}
else {
- hit = bvh_curve_intersect(kg,
- isect_array,
- P,
- dir,
- PATH_RAY_SHADOW,
- object,
- prim_addr,
- ray->time,
- curve_type,
- NULL,
- 0, 0);
+ hit = curve_intersect(kg,
+ isect_array,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ NULL,
+ 0, 0);
}
break;
}
diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
index fca75a1d416..335a4afd47a 100644
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -340,7 +340,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
prim_addr)) {
tfar = ssef(isect->t);
/* Shadow ray early termination. */
- if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+ if(visibility & PATH_RAY_SHADOW_OPAQUE) {
return true;
}
}
@@ -362,7 +362,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
prim_addr)) {
tfar = ssef(isect->t);
/* Shadow ray early termination. */
- if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+ if(visibility & PATH_RAY_SHADOW_OPAQUE) {
return true;
}
}
@@ -379,37 +379,37 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
bool hit;
if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
- hit = bvh_cardinal_curve_intersect(kg,
- isect,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- ray->time,
- curve_type,
- lcg_state,
- difl,
- extmax);
+ hit = cardinal_curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ lcg_state,
+ difl,
+ extmax);
}
else {
- hit = bvh_curve_intersect(kg,
- isect,
- P,
- dir,
- visibility,
- object,
- prim_addr,
- ray->time,
- curve_type,
- lcg_state,
- difl,
- extmax);
+ hit = curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type,
+ lcg_state,
+ difl,
+ extmax);
}
if(hit) {
tfar = ssef(isect->t);
/* Shadow ray early termination. */
- if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+ if(visibility & PATH_RAY_SHADOW_OPAQUE) {
return true;
}
}
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 22d0092093a..2f2c35d5d1f 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -40,20 +40,20 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
}
/* Sample slope distribution (based on page 14 of the supplemental implementation). */
-ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
+ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float randx, const float randy)
{
if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) {
- const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
- const float phi = M_2PI_F * randU.y;
+ const float r = sqrtf(randx / max(1.0f - randx, 1e-7f));
+ const float phi = M_2PI_F * randy;
return make_float2(r*cosf(phi), r*sinf(phi));
}
- const float sinI = sqrtf(1.0f - cosI*cosI);
+ const float sinI = safe_sqrtf(1.0f - cosI*cosI);
const float tanI = sinI/cosI;
const float projA = 0.5f * (cosI + 1.0f);
if(projA < 0.0001f)
return make_float2(0.0f, 0.0f);
- const float A = 2.0f*randU.x*projA / cosI - 1.0f;
+ const float A = 2.0f*randx*projA / cosI - 1.0f;
float tmp = A*A-1.0f;
if(fabsf(tmp) < 1e-7f)
return make_float2(0.0f, 0.0f);
@@ -64,24 +64,24 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 ran
const float slopeX = (A < 0.0f || slopeX2 > 1.0f/tanI)? (tanI*tmp - D) : slopeX2;
float U2;
- if(randU.y >= 0.5f)
- U2 = 2.0f*(randU.y - 0.5f);
+ if(randy >= 0.5f)
+ U2 = 2.0f*(randy - 0.5f);
else
- U2 = 2.0f*(0.5f - randU.y);
+ U2 = 2.0f*(0.5f - randy);
const float z = (U2*(U2*(U2*0.27385f-0.73369f)+0.46341f)) / (U2*(U2*(U2*0.093073f+0.309420f)-1.0f)+0.597999f);
const float slopeY = z * sqrtf(1.0f + slopeX*slopeX);
- if(randU.y >= 0.5f)
+ if(randy >= 0.5f)
return make_float2(slopeX, slopeY);
else
return make_float2(slopeX, -slopeY);
}
/* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). */
-ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
+ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float randx, const float randy)
{
const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z));
- const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU);
+ const float2 slope_11 = mf_sampleP22_11(wi_11.z, randx, randy);
const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f));
const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y);
@@ -474,6 +474,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
*eval *= *pdf;
*omega_in = X*localO.x + Y*localO.y + Z*localO.z;
+
#ifdef __RAY_DIFFERENTIALS__
*domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx;
*domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy;
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index 2eb2457c9e5..e73915dbda7 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -100,11 +100,14 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
bool outside = true;
for(int order = 0; order < 10; order++) {
- /* Sample microfacet height and normal */
- if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state)))
+ /* Sample microfacet height. */
+ float height_rand = lcg_step_float_addrspace(lcg_state);
+ if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand))
break;
- float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state),
- lcg_step_float_addrspace(lcg_state)));
+ /* Sample microfacet normal. */
+ float vndf_rand_y = lcg_step_float_addrspace(lcg_state);
+ float vndf_rand_x = lcg_step_float_addrspace(lcg_state);
+ float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y);
#ifdef MF_MULTI_GLASS
if(order == 0 && use_fresnel) {
@@ -136,7 +139,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
#ifdef MF_MULTI_GLASS
bool next_outside;
float3 wi_prev = -wr;
- wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside);
+ float phase_rand = lcg_step_float_addrspace(lcg_state);
+ wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside);
if(!next_outside) {
outside = !outside;
wr = -wr;
@@ -204,14 +208,16 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(
int order;
for(order = 0; order < 10; order++) {
/* Sample microfacet height. */
- if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) {
+ float height_rand = lcg_step_float_addrspace(lcg_state);
+ if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) {
/* The random walk has left the surface. */
*wo = outside? wr: -wr;
return throughput;
}
/* Sample microfacet normal. */
- float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state),
- lcg_step_float_addrspace(lcg_state)));
+ float vndf_rand_y = lcg_step_float_addrspace(lcg_state);
+ float vndf_rand_x = lcg_step_float_addrspace(lcg_state);
+ float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y);
/* First-bounce color is already accounted for in mix weight. */
if(!use_fresnel && order > 0)
@@ -221,7 +227,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(
#ifdef MF_MULTI_GLASS
bool next_outside;
float3 wi_prev = -wr;
- wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside);
+ float phase_rand = lcg_step_float_addrspace(lcg_state);
+ wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside);
if(!next_outside) {
hr = -hr;
wr = -wr;
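
Pulling each lcg_step_float_addrspace() result into a named local before use avoids depending on function-argument evaluation order, which C++ leaves unspecified. A self-contained sketch of the hazard with a toy RNG (not the kernel's):

#include <cstdio>

/* Toy linear congruential generator standing in for the kernel's RNG. */
static float rng_next(unsigned *state)
{
	*state = *state * 1664525u + 1013904223u;
	return (float)(*state >> 8) * (1.0f / 16777216.0f);
}

static void use_pair(float x, float y)
{
	std::printf("x=%f y=%f\n", x, y);
}

int main()
{
	unsigned state = 42;
	/* use_pair(rng_next(&state), rng_next(&state));
	 * would consume the two draws in an unspecified, compiler-dependent order. */
	float rand_x = rng_next(&state);  /* always the first draw  */
	float rand_y = rng_next(&state);  /* always the second draw */
	use_pair(rand_x, rand_y);
	return 0;
}
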
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index f733ea4c517..267aeea6e86 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -348,8 +348,9 @@ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight)
{
Bssrdf *bssrdf = (Bssrdf*)closure_alloc(sd, sizeof(Bssrdf), CLOSURE_NONE_ID, weight);
- if(!bssrdf)
+ if(bssrdf == NULL) {
return NULL;
+ }
float sample_weight = fabsf(average(weight));
bssrdf->sample_weight = sample_weight;
@@ -399,7 +400,7 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type)
bssrdf_burley_setup(bssrdf);
}
- return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF;
+ return SD_BSSRDF;
}
}
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
index 3185330994c..3ddd8712266 100644
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
+#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride)
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
* pixel_buffer always points to the first of the 4 current pixel in the first pass.
@@ -24,25 +24,25 @@ CCL_NAMESPACE_BEGIN
#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- __m128 y4 = _mm_set1_ps(pixel.y); \
+ float4 y4 = make_float4(pixel.y); \
for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
- __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
- __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
+ float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
+ int4 active_pixels = x4 < make_float4(high.x);
#define END_FOR_PIXEL_WINDOW_SSE } \
pixel_buffer += buffer_w - (pixel.x - low.x); \
}
-ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
- __m128 active_pixels,
+ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
+ int4 active_pixels,
const float *ccl_restrict buffer,
- __m128 *features,
- const __m128 *ccl_restrict mean,
+ float4 *features,
+ const float4 *ccl_restrict mean,
int pass_stride)
{
features[0] = x;
features[1] = y;
- features[2] = _mm_fabs_ps(ccl_get_feature_sse(0));
+ features[2] = fabs(ccl_get_feature_sse(0));
features[3] = ccl_get_feature_sse(1);
features[4] = ccl_get_feature_sse(2);
features[5] = ccl_get_feature_sse(3);
@@ -52,53 +52,41 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
features[9] = ccl_get_feature_sse(7);
if(mean) {
for(int i = 0; i < DENOISE_FEATURES; i++)
- features[i] = _mm_sub_ps(features[i], mean[i]);
+ features[i] = features[i] - mean[i];
}
for(int i = 0; i < DENOISE_FEATURES; i++)
- features[i] = _mm_mask_ps(features[i], active_pixels);
+ features[i] = mask(active_pixels, features[i]);
}
-ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y,
- __m128 active_pixels,
+ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
+ int4 active_pixels,
const float *ccl_restrict buffer,
- __m128 *scales,
- const __m128 *ccl_restrict mean,
+ float4 *scales,
+ const float4 *ccl_restrict mean,
int pass_stride)
{
- scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
- scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
-
- scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels);
-
- __m128 diff, scale;
- diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
- scale = _mm_mul_ps(diff, diff);
- diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
- scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
- diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
- scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
- scales[3] = _mm_mask_ps(scale, active_pixels);
-
- scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
-
- diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
- scale = _mm_mul_ps(diff, diff);
- diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
- scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
- diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
- scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
- scales[5] = _mm_mask_ps(scale, active_pixels);
+ scales[0] = fabs(x - mean[0]);
+ scales[1] = fabs(y - mean[1]);
+ scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
+ scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) +
+ sqr(ccl_get_feature_sse(2) - mean[4]) +
+ sqr(ccl_get_feature_sse(3) - mean[5]);
+ scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
+ scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) +
+ sqr(ccl_get_feature_sse(6) - mean[8]) +
+ sqr(ccl_get_feature_sse(7) - mean[9]);
+ for(int i = 0; i < 6; i++)
+ scales[i] = mask(active_pixels, scales[i]);
}
-ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
+ccl_device_inline void filter_calculate_scale_sse(float4 *scale)
{
- scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
- scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
- scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
- scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
-
- scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
- scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
+ scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
+ scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
+ scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
+ scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
+ scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
+ scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
}
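
The rewrite above swaps raw _mm_* intrinsics for Cycles' float4 utility operations (load_float4, fabs, mask, reduce_max, rcp). A miniature, scalar-only stand-in for such a type is sketched below; names and layout are illustrative, not the actual util implementation:

#include <algorithm>
#include <cmath>
#include <cstdio>

/* Hypothetical miniature 4-wide float vector; the real utility type also
 * carries SSE-backed implementations of these operations. */
struct vec4 {
	float v[4];
};

static vec4 make_vec4(float f) { return {{f, f, f, f}}; }

static vec4 operator-(const vec4 &a, const vec4 &b)
{
	vec4 r;
	for(int i = 0; i < 4; i++) r.v[i] = a.v[i] - b.v[i];
	return r;
}

static vec4 fabs(const vec4 &a)
{
	vec4 r;
	for(int i = 0; i < 4; i++) r.v[i] = std::fabs(a.v[i]);
	return r;
}

/* Horizontal maximum broadcast to all lanes, like the reduce_max() used above. */
static vec4 reduce_max(const vec4 &a)
{
	float m = std::max(std::max(a.v[0], a.v[1]), std::max(a.v[2], a.v[3]));
	return make_vec4(m);
}

int main()
{
	vec4 a = {{1.0f, -5.0f, 2.0f, 0.5f}};
	vec4 d = fabs(a - make_vec4(1.0f));
	vec4 m = reduce_max(d);
	std::printf("%f\n", m.v[0]); /* 6.000000 */
	return 0;
}
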
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index 3e752bce68f..5e989331bc2 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -50,10 +50,8 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
int w,
int f)
{
-#ifdef __KERNEL_SSE3__
- int aligned_lowx = (rect.x & ~(3));
- int aligned_highx = ((rect.z + 3) & ~(3));
-#endif
+ int aligned_lowx = rect.x / 4;
+ int aligned_highx = (rect.z + 3) / 4;
for(int y = rect.y; y < rect.w; y++) {
const int low = max(rect.y, y-f);
const int high = min(rect.w, y+f+1);
@@ -61,15 +59,11 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
out_image[y*w+x] = 0.0f;
}
for(int y1 = low; y1 < high; y1++) {
-#ifdef __KERNEL_SSE3__
- for(int x = aligned_lowx; x < aligned_highx; x+=4) {
- _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x)));
+ float4* out_image4 = (float4*)(out_image + y*w);
+ float4* difference_image4 = (float4*)(difference_image + y1*w);
+ for(int x = aligned_lowx; x < aligned_highx; x++) {
+ out_image4[x] += difference_image4[x];
}
-#else
- for(int x = rect.x; x < rect.z; x++) {
- out_image[y*w+x] += difference_image[y1*w+x];
- }
-#endif
}
for(int x = rect.x; x < rect.z; x++) {
out_image[y*w+x] *= 1.0f/(high - low);
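
The simplified loop above accumulates whole rows of the difference image into the output and then normalizes by the window height (high - low). A scalar sketch of that separable box-blur step on a single column, with hypothetical buffers:

#include <algorithm>
#include <cstdio>
#include <vector>

/* Vertical box blur of one column: out[y] = mean of in[y-f .. y+f], with the
 * window clamped to the image border, mirroring the low/high clamping above. */
static void blur_column(const std::vector<float> &in, std::vector<float> &out, int f)
{
	const int h = (int)in.size();
	for(int y = 0; y < h; y++) {
		const int low = std::max(0, y - f);
		const int high = std::min(h, y + f + 1);
		float sum = 0.0f;
		for(int y1 = low; y1 < high; y1++) sum += in[y1];
		out[y] = sum / (float)(high - low);
	}
}

int main()
{
	std::vector<float> in = {0, 0, 9, 0, 0}, out(5);
	blur_column(in, out, 1);
	for(float v : out) std::printf("%g ", v); /* 0 3 3 3 0 */
	std::printf("\n");
	return 0;
}
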
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
index d5ae1b73927..2aeb54a62be 100644
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -61,8 +61,8 @@ ccl_device void kernel_filter_divide_shadow(int sample,
varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
}
- varA /= (odd_sample - 1);
- varB /= (even_sample - 1);
+ varA /= max(odd_sample - 1, 1);
+ varB /= max(even_sample - 1, 1);
sampleVariance[idx] = 0.5f*(varA + varB) / sample;
sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
@@ -96,11 +96,17 @@ ccl_device void kernel_filter_get_feature(int sample,
int idx = (y-rect.y)*buffer_w + (x - rect.x);
mean[idx] = center_buffer[m_offset] / sample;
- if(use_split_variance) {
- variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
+ if(sample > 1) {
+ if(use_split_variance) {
+ variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
+ }
+ else {
+ variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
+ }
}
else {
- variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
+ /* Can't compute variance with single sample, just set it very high. */
+ variance[idx] = 1e10f;
}
}
@@ -114,49 +120,56 @@ ccl_device void kernel_filter_detect_outliers(int x, int y,
{
int buffer_w = align_up(rect.z - rect.x, 4);
- int n = 0;
- float values[25];
- for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) {
- for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) {
- int idx = (y1-rect.y)*buffer_w + (x1-rect.x);
- float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
-
- /* Find the position of L. */
- int i;
- for(i = 0; i < n; i++) {
- if(values[i] > L) break;
- }
- /* Make space for L by shifting all following values to the right. */
- for(int j = n; j > i; j--) {
- values[j] = values[j-1];
- }
- /* Insert L. */
- values[i] = L;
- n++;
- }
- }
-
int idx = (y-rect.y)*buffer_w + (x-rect.x);
- float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
+ float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]);
- float ref = 2.0f*values[(int)(n*0.75f)];
float fac = 1.0f;
- if(L > ref) {
- /* The pixel appears to be an outlier.
- * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
- * should actually be at the reference value:
- * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
- * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
- */
- float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride])));
- if(L - 3*stddev < ref) {
- /* The pixel is an outlier, so negate the depth value to mark it as one.
- * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
- depth[idx] = -depth[idx];
- fac = ref/L;
- variance[idx ] *= fac*fac;
- variance[idx + pass_stride] *= fac*fac;
- variance[idx+2*pass_stride] *= fac*fac;
+ if(color.x < 0.0f || color.y < 0.0f || color.z < 0.0f) {
+ depth[idx] = -depth[idx];
+ fac = 0.0f;
+ }
+ else {
+ float L = average(color);
+ int n = 0;
+ float values[25];
+ for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) {
+ for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) {
+ int idx = (y1-rect.y)*buffer_w + (x1-rect.x);
+ float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
+
+ /* Find the position of L. */
+ int i;
+ for(i = 0; i < n; i++) {
+ if(values[i] > L) break;
+ }
+ /* Make space for L by shifting all following values to the right. */
+ for(int j = n; j > i; j--) {
+ values[j] = values[j-1];
+ }
+ /* Insert L. */
+ values[i] = L;
+ n++;
+ }
+ }
+
+ float ref = 2.0f*values[(int)(n*0.75f)];
+ if(L > ref) {
+ /* The pixel appears to be an outlier.
+ * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
+ * should actually be at the reference value:
+ * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
+ * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
+ */
+ float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride])));
+ if(L - 3*stddev < ref) {
+ /* The pixel is an outlier, so negate the depth value to mark it as one.
+ * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
+ depth[idx] = -depth[idx];
+ fac = ref/L;
+ variance[idx ] *= fac*fac;
+ variance[idx + pass_stride] *= fac*fac;
+ variance[idx+2*pass_stride] *= fac*fac;
+ }
}
}
out[idx ] = fac*image[idx];
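
The new sample > 1 guard above avoids a division by zero in the (sample - 1) terms when only one sample has been rendered. A scalar sketch of the variance-of-the-mean estimate with that guard, using hypothetical accumulators rather than the actual pass layout:

#include <algorithm>
#include <cstdio>

/* Variance of the per-pixel mean, estimated from a running sum and sum of
 * squares in the (sum_sq - mean^2 * n) / (n * (n - 1)) form used above.
 * With fewer than two samples the estimate is undefined, so return a large
 * value instead, as the patch does for the feature passes. */
static float mean_variance(float sum, float sum_sq, int sample)
{
	if(sample <= 1) {
		return 1e10f;
	}
	float mean = sum / sample;
	return std::max(0.0f, (sum_sq - mean * mean * sample) / (sample * (sample - 1)));
}

int main()
{
	/* Three samples of value 1, 2, 3: sum = 6, sum of squares = 14. */
	std::printf("%f\n", mean_variance(6.0f, 14.0f, 3)); /* ~0.333333  */
	std::printf("%f\n", mean_variance(5.0f, 25.0f, 1)); /* 1e10 guard */
	return 0;
}
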
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
index 30dc2969b11..9e65f61664b 100644
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -24,7 +24,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
{
int buffer_w = align_up(rect.z - rect.x, 4);
- __m128 features[DENOISE_FEATURES];
+ float4 features[DENOISE_FEATURES];
const float *ccl_restrict pixel_buffer;
int2 pixel;
@@ -34,19 +34,19 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
min(rect.w, y + radius + 1));
int num_pixels = (high.y - low.y) * (high.x - low.x);
- __m128 feature_means[DENOISE_FEATURES];
+ float4 feature_means[DENOISE_FEATURES];
math_vector_zero_sse(feature_means, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
} END_FOR_PIXEL_WINDOW_SSE
- __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels);
+ float4 pixel_scale = make_float4(1.0f / num_pixels);
for(int i = 0; i < DENOISE_FEATURES; i++) {
- feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
+ feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
}
- __m128 feature_scale[DENOISE_FEATURES];
+ float4 feature_scale[DENOISE_FEATURES];
math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
@@ -55,12 +55,12 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
filter_calculate_scale_sse(feature_scale);
- __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
+ float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
- math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
+ math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f));
} END_FOR_PIXEL_WINDOW_SSE
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
@@ -98,7 +98,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
/* Bake the feature scaling into the transformation matrix. */
for(int i = 0; i < DENOISE_FEATURES; i++) {
- math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
+ math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank);
}
}
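
kernel_filter_construct_transform() above accumulates the scaled feature vectors into a Gramian matrix via math_matrix_add_gramian_sse() before decomposing it. A scalar sketch of that accumulation step on a hypothetical 3-feature example:

#include <cstdio>

/* Accumulate the weighted outer product w * f * f^T into an N x N matrix,
 * which is what a Gramian accumulation over a pixel window amounts to. */
template<int N>
static void add_gramian(float (&m)[N][N], const float (&f)[N], float weight)
{
	for(int i = 0; i < N; i++)
		for(int j = 0; j < N; j++)
			m[i][j] += weight * f[i] * f[j];
}

int main()
{
	float m[3][3] = {};
	const float pixels[2][3] = {{1.0f, 2.0f, 0.0f}, {0.0f, 1.0f, 1.0f}};
	for(const auto &f : pixels)
		add_gramian(m, f, 1.0f);
	std::printf("%g %g %g\n", m[0][0], m[1][1], m[2][2]); /* 1 5 1 */
	return 0;
}
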
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index c623e3490fd..f34b77ebc07 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -27,6 +27,7 @@
#include "kernel/geom/geom_motion_triangle_shader.h"
#include "kernel/geom/geom_motion_curve.h"
#include "kernel/geom/geom_curve.h"
+#include "kernel/geom/geom_curve_intersect.h"
#include "kernel/geom/geom_volume.h"
#include "kernel/geom/geom_primitive.h"
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 5c3b0ee3c15..e35267f02bf 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -16,18 +16,13 @@ CCL_NAMESPACE_BEGIN
/* Curve Primitive
*
- * Curve primitive for rendering hair and fur. These can be render as flat ribbons
- * or curves with actual thickness. The curve can also be rendered as line segments
- * rather than curves for better performance */
+ * Curve primitive for rendering hair and fur. These can be render as flat
+ * ribbons or curves with actual thickness. The curve can also be rendered as
+ * line segments rather than curves for better performance.
+ */
#ifdef __HAIR__
-#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300)
-# define ccl_device_curveintersect ccl_device
-#else
-# define ccl_device_curveintersect ccl_device_forceinline
-#endif
-
/* Reading attributes on various curve elements */
ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
@@ -151,7 +146,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
/* Curve tangent normal */
ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
-{
+{
float3 tgN = make_float3(0.0f,0.0f,0.0f);
if(sd->type & PRIMITIVE_ALL_CURVE) {
@@ -219,893 +214,6 @@ ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta,
}
}
-#ifdef __KERNEL_SSE2__
-ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
-{
- return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2]));
-}
-#endif
-
-#ifdef __KERNEL_SSE2__
-/* Pass P and dir by reference to aligned vector */
-ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
- const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
-#else
-ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
- float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
-#endif
-{
- const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
-
- if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
- const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
- if(time < prim_time.x || time > prim_time.y) {
- return false;
- }
- }
-
- int segment = PRIMITIVE_UNPACK_SEGMENT(type);
- float epsilon = 0.0f;
- float r_st, r_en;
-
- int depth = kernel_data.curve.subdivisions;
- int flags = kernel_data.curve.curveflags;
- int prim = kernel_tex_fetch(__prim_index, curveAddr);
-
-#ifdef __KERNEL_SSE2__
- ssef vdir = load4f(dir);
- ssef vcurve_coef[4];
- const float3 *curve_coef = (float3 *)vcurve_coef;
-
- {
- ssef dtmp = vdir * vdir;
- ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));
- ssef rd_ss = load1f_first(1.0f) / d_ss;
-
- ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]);
- int2 &v00 = (int2 &)v00vec;
-
- int k0 = v00.x + segment;
- int k1 = k0 + 1;
- int ka = max(k0 - 1, v00.x);
- int kb = min(k1 + 1, v00.x + v00.y - 1);
-
-#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
- avxf P_curve_0_1, P_curve_2_3;
- if(is_curve_primitive) {
- P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);
- P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);
- }
- else {
- int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
- motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3);
- }
-#else /* __KERNEL_AVX2__ */
- ssef P_curve[4];
-
- if(is_curve_primitive) {
- P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
- P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
- P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
- P_curve[3] = load4f(&kg->__curve_keys.data[kb].x);
- }
- else {
- int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
- motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
- }
-#endif /* __KERNEL_AVX2__ */
-
- ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
- ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
- ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy;
- ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
- ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
-
- ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
- ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
- ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
-
-#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
- const avxf vPP = _mm256_broadcast_ps(&P.m128);
- const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
- const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
- const avxf htfm22 = avxf(htfm2.m128, htfm2.m128);
-
- const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP),
- htfm00,
- madd(shuffle<1>(P_curve_0_1 - vPP),
- htfm11,
- shuffle<2>(P_curve_0_1 - vPP) * htfm22));
- const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP),
- htfm00,
- madd(shuffle<1>(P_curve_2_3 - vPP),
- htfm11,
- shuffle<2>(P_curve_2_3 - vPP)*htfm22));
-
- const ssef p0 = _mm256_castps256_ps128(p01);
- const ssef p1 = _mm256_extractf128_ps(p01, 1);
- const ssef p2 = _mm256_castps256_ps128(p23);
- const ssef p3 = _mm256_extractf128_ps(p23, 1);
-
- const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1);
- r_st = ((float4 &)P_curve_1).w;
- const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3);
- r_en = ((float4 &)P_curve_2).w;
-#else /* __KERNEL_AVX2__ */
- ssef htfm[] = { htfm0, htfm1, htfm2 };
- ssef vP = load4f(P);
- ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
- ssef p1 = transform_point_T3(htfm, P_curve[1] - vP);
- ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
- ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
-
- r_st = ((float4 &)P_curve[1]).w;
- r_en = ((float4 &)P_curve[2]).w;
-#endif /* __KERNEL_AVX2__ */
-
- float fc = 0.71f;
- ssef vfc = ssef(fc);
- ssef vfcxp3 = vfc * p3;
-
- vcurve_coef[0] = p1;
- vcurve_coef[1] = vfc * (p2 - p0);
- vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
- vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
-
- }
-#else
- float3 curve_coef[4];
-
- /* curve Intersection check */
- /* obtain curve parameters */
- {
- /* ray transform created - this should be created at beginning of intersection loop */
- Transform htfm;
- float d = sqrtf(dir.x * dir.x + dir.z * dir.z);
- htfm = make_transform(
- dir.z / d, 0, -dir.x /d, 0,
- -dir.x * dir.y /d, d, -dir.y * dir.z /d, 0,
- dir.x, dir.y, dir.z, 0,
- 0, 0, 0, 1);
-
- float4 v00 = kernel_tex_fetch(__curves, prim);
-
- int k0 = __float_as_int(v00.x) + segment;
- int k1 = k0 + 1;
-
- int ka = max(k0 - 1,__float_as_int(v00.x));
- int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
- float4 P_curve[4];
-
- if(is_curve_primitive) {
- P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
- P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
- P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
- P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
- }
- else {
- int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
- motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve);
- }
-
- float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P);
- float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P);
- float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P);
- float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P);
-
- float fc = 0.71f;
- curve_coef[0] = p1;
- curve_coef[1] = -fc*p0 + fc*p2;
- curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
- curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
- r_st = P_curve[1].w;
- r_en = P_curve[2].w;
- }
-#endif
-
- float r_curr = max(r_st, r_en);
-
- if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
- epsilon = 2 * r_curr;
-
- /* find bounds - this is slow for cubic curves */
- float upper, lower;
-
- float zextrem[4];
- curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
- if(lower - r_curr > isect->t || upper + r_curr < epsilon)
- return false;
-
- /* minimum width extension */
- float mw_extension = min(difl * fabsf(upper), extmax);
- float r_ext = mw_extension + r_curr;
-
- float xextrem[4];
- curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
- if(lower > r_ext || upper < -r_ext)
- return false;
-
- float yextrem[4];
- curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
- if(lower > r_ext || upper < -r_ext)
- return false;
-
- /* setup recurrent loop */
- int level = 1 << depth;
- int tree = 0;
- float resol = 1.0f / (float)level;
- bool hit = false;
-
- /* begin loop */
- while(!(tree >> (depth))) {
- const float i_st = tree * resol;
- const float i_en = i_st + (level * resol);
-
-#ifdef __KERNEL_SSE2__
- ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
- ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
- ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
-
- ssef vbmin = min(vp_st, vp_en);
- ssef vbmax = max(vp_st, vp_en);
-
- float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
- float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
- float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
- float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
-#else
- float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];
- float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];
-
- float bminx = min(p_st.x, p_en.x);
- float bmaxx = max(p_st.x, p_en.x);
- float bminy = min(p_st.y, p_en.y);
- float bmaxy = max(p_st.y, p_en.y);
- float bminz = min(p_st.z, p_en.z);
- float bmaxz = max(p_st.z, p_en.z);
-#endif
-
- if(xextrem[0] >= i_st && xextrem[0] <= i_en) {
- bminx = min(bminx,xextrem[1]);
- bmaxx = max(bmaxx,xextrem[1]);
- }
- if(xextrem[2] >= i_st && xextrem[2] <= i_en) {
- bminx = min(bminx,xextrem[3]);
- bmaxx = max(bmaxx,xextrem[3]);
- }
- if(yextrem[0] >= i_st && yextrem[0] <= i_en) {
- bminy = min(bminy,yextrem[1]);
- bmaxy = max(bmaxy,yextrem[1]);
- }
- if(yextrem[2] >= i_st && yextrem[2] <= i_en) {
- bminy = min(bminy,yextrem[3]);
- bmaxy = max(bmaxy,yextrem[3]);
- }
- if(zextrem[0] >= i_st && zextrem[0] <= i_en) {
- bminz = min(bminz,zextrem[1]);
- bmaxz = max(bmaxz,zextrem[1]);
- }
- if(zextrem[2] >= i_st && zextrem[2] <= i_en) {
- bminz = min(bminz,zextrem[3]);
- bmaxz = max(bmaxz,zextrem[3]);
- }
-
- float r1 = r_st + (r_en - r_st) * i_st;
- float r2 = r_st + (r_en - r_st) * i_en;
- r_curr = max(r1, r2);
-
- mw_extension = min(difl * fabsf(bmaxz), extmax);
- float r_ext = mw_extension + r_curr;
- float coverage = 1.0f;
-
- if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
- /* the bounding box does not overlap the square centered at O */
- tree += level;
- level = tree & -tree;
- }
- else if(level == 1) {
-
- /* the maximum recursion depth is reached.
- * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
- * dP* is reversed if necessary.*/
- float t = isect->t;
- float u = 0.0f;
- float gd = 0.0f;
-
- if(flags & CURVE_KN_RIBBONS) {
- float3 tg = (p_en - p_st);
-#ifdef __KERNEL_SSE__
- const float3 tg_sq = tg * tg;
- float w = tg_sq.x + tg_sq.y;
-#else
- float w = tg.x * tg.x + tg.y * tg.y;
-#endif
- if(w == 0) {
- tree++;
- level = tree & -tree;
- continue;
- }
-#ifdef __KERNEL_SSE__
- const float3 p_sttg = p_st * tg;
- w = -(p_sttg.x + p_sttg.y) / w;
-#else
- w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
-#endif
- w = saturate(w);
-
- /* compute u on the curve segment */
- u = i_st * (1 - w) + i_en * w;
- r_curr = r_st + (r_en - r_st) * u;
- /* compare x-y distances */
- float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
-
- float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
- if(dot(tg, dp_st)< 0)
- dp_st *= -1;
- if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
- tree++;
- level = tree & -tree;
- continue;
- }
- float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
- if(dot(tg, dp_en) < 0)
- dp_en *= -1;
- if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
- tree++;
- level = tree & -tree;
- continue;
- }
-
- /* compute coverage */
- float r_ext = r_curr;
- coverage = 1.0f;
- if(difl != 0.0f) {
- mw_extension = min(difl * fabsf(bmaxz), extmax);
- r_ext = mw_extension + r_curr;
-#ifdef __KERNEL_SSE__
- const float3 p_curr_sq = p_curr * p_curr;
- const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)));
- float d = dxxx.x;
-#else
- float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
-#endif
- float d0 = d - r_curr;
- float d1 = d + r_curr;
- float inv_mw_extension = 1.0f/mw_extension;
- if(d0 >= 0)
- coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
- else // inside
- coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
- }
-
- if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
- tree++;
- level = tree & -tree;
- continue;
- }
-
- t = p_curr.z;
-
- /* stochastic fade from minimum width */
- if(difl != 0.0f && lcg_state) {
- if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
- return hit;
- }
- }
- else {
- float l = len(p_en - p_st);
- /* minimum width extension */
- float or1 = r1;
- float or2 = r2;
-
- if(difl != 0.0f) {
- mw_extension = min(len(p_st - P) * difl, extmax);
- or1 = r1 < mw_extension ? mw_extension : r1;
- mw_extension = min(len(p_en - P) * difl, extmax);
- or2 = r2 < mw_extension ? mw_extension : r2;
- }
- /* --- */
- float invl = 1.0f/l;
- float3 tg = (p_en - p_st) * invl;
- gd = (or2 - or1) * invl;
- float difz = -dot(p_st,tg);
- float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd));
- float invcyla = 1.0f/cyla;
- float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1)));
- float tcentre = -halfb*invcyla;
- float zcentre = difz + (tg.z * tcentre);
- float3 tdif = - p_st;
- tdif.z += tcentre;
- float tdifz = dot(tdif,tg);
- float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
- float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
- float td = tb*tb - 4*cyla*tc;
- if(td < 0.0f) {
- tree++;
- level = tree & -tree;
- continue;
- }
-
- float rootd = sqrtf(td);
- float correction = (-tb - rootd) * 0.5f * invcyla;
- t = tcentre + correction;
-
- float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
- if(dot(tg, dp_st)< 0)
- dp_st *= -1;
- float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
- if(dot(tg, dp_en) < 0)
- dp_en *= -1;
-
- if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
- correction = (-tb + rootd) * 0.5f * invcyla;
- t = tcentre + correction;
- }
-
- if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
- tree++;
- level = tree & -tree;
- continue;
- }
-
- float w = (zcentre + (tg.z * correction)) * invl;
- w = saturate(w);
- /* compute u on the curve segment */
- u = i_st * (1 - w) + i_en * w;
-
- /* stochastic fade from minimum width */
- if(difl != 0.0f && lcg_state) {
- r_curr = r1 + (r2 - r1) * w;
- r_ext = or1 + (or2 - or1) * w;
- coverage = r_curr/r_ext;
-
- if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
- return hit;
- }
- }
- /* we found a new intersection */
-
-#ifdef __VISIBILITY_FLAG__
-			/* visibility flag test. we do it here under the assumption
-			 * that most primitives are culled by node flags */
- if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#endif
- {
- /* record intersection */
- isect->t = t;
- isect->u = u;
- isect->v = gd;
- isect->prim = curveAddr;
- isect->object = object;
- isect->type = type;
- hit = true;
- }
-
- tree++;
- level = tree & -tree;
- }
- else {
- /* split the curve into two curves and process */
- level = level >> 1;
- }
- }
-
- return hit;
-}
-
-ccl_device_curveintersect bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
- float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
-{
- /* define few macros to minimize code duplication for SSE */
-#ifndef __KERNEL_SSE2__
-# define len3_squared(x) len_squared(x)
-# define len3(x) len(x)
-# define dot3(x, y) dot(x, y)
-#endif
-
- const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
-
- if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
- const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
- if(time < prim_time.x || time > prim_time.y) {
- return false;
- }
- }
-
- int segment = PRIMITIVE_UNPACK_SEGMENT(type);
-	/* curve intersection check */
- int flags = kernel_data.curve.curveflags;
-
- int prim = kernel_tex_fetch(__prim_index, curveAddr);
- float4 v00 = kernel_tex_fetch(__curves, prim);
-
- int cnum = __float_as_int(v00.x);
- int k0 = cnum + segment;
- int k1 = k0 + 1;
-
-#ifndef __KERNEL_SSE2__
- float4 P_curve[2];
-
- if(is_curve_primitive) {
- P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
- P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
- }
- else {
- int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
- motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve);
- }
-
- float or1 = P_curve[0].w;
- float or2 = P_curve[1].w;
- float3 p1 = float4_to_float3(P_curve[0]);
- float3 p2 = float4_to_float3(P_curve[1]);
-
- /* minimum width extension */
- float r1 = or1;
- float r2 = or2;
- float3 dif = P - p1;
- float3 dif_second = P - p2;
- if(difl != 0.0f) {
- float pixelsize = min(len3(dif) * difl, extmax);
- r1 = or1 < pixelsize ? pixelsize : or1;
- pixelsize = min(len3(dif_second) * difl, extmax);
- r2 = or2 < pixelsize ? pixelsize : or2;
- }
- /* --- */
-
- float3 p21_diff = p2 - p1;
- float3 sphere_dif1 = (dif + dif_second) * 0.5f;
- float3 dir = direction;
- float sphere_b_tmp = dot3(dir, sphere_dif1);
- float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
-#else
- ssef P_curve[2];
-
- if(is_curve_primitive) {
- P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
- P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
- }
- else {
- int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
- motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve);
- }
-
- const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
-
- ssef r12 = or12;
- const ssef vP = load4f(P);
- const ssef dif = vP - P_curve[0];
- const ssef dif_second = vP - P_curve[1];
- if(difl != 0.0f) {
- const ssef len1_sq = len3_squared_splat(dif);
- const ssef len2_sq = len3_squared_splat(dif_second);
- const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
- const ssef pixelsize12 = min(len12 * difl, ssef(extmax));
- r12 = max(or12, pixelsize12);
- }
- float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12));
- float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12));
-
- const ssef p21_diff = P_curve[1] - P_curve[0];
- const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
- const ssef dir = load4f(direction);
- const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
- const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1);
-#endif
-
- float mr = max(r1, r2);
- float l = len3(p21_diff);
- float invl = 1.0f / l;
- float sp_r = mr + 0.5f * l;
-
- float sphere_b = dot3(dir, sphere_dif2);
- float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
-
- if(sdisc < 0.0f)
- return false;
-
- /* obtain parameters and test midpoint distance for suitable modes */
-#ifndef __KERNEL_SSE2__
- float3 tg = p21_diff * invl;
-#else
- const ssef tg = p21_diff * invl;
-#endif
- float gd = (r2 - r1) * invl;
-
- float dirz = dot3(dir, tg);
- float difz = dot3(dif, tg);
-
- float a = 1.0f - (dirz*dirz*(1 + gd*gd));
-
- float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1));
-
- float tcentre = -halfb/a;
- float zcentre = difz + (dirz * tcentre);
-
- if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
- return false;
- if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
- return false;
-
- /* test minimum separation */
-#ifndef __KERNEL_SSE2__
- float3 cprod = cross(tg, dir);
- float cprod2sq = len3_squared(cross(tg, dif));
-#else
- const ssef cprod = cross(tg, dir);
- float cprod2sq = len3_squared(cross_zxy(tg, dif));
-#endif
- float cprodsq = len3_squared(cprod);
- float distscaled = dot3(cprod, dif);
-
- if(cprodsq == 0)
- distscaled = cprod2sq;
- else
- distscaled = (distscaled*distscaled)/cprodsq;
-
- if(distscaled > mr*mr)
- return false;
-
- /* calculate true intersection */
-#ifndef __KERNEL_SSE2__
- float3 tdif = dif + tcentre * dir;
-#else
- const ssef tdif = madd(ssef(tcentre), dir, dif);
-#endif
- float tdifz = dot3(tdif, tg);
- float tdifma = tdifz*gd + r1;
- float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma));
- float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
- float td = tb*tb - 4*a*tc;
-
- if(td < 0.0f)
- return false;
-
- float rootd = 0.0f;
- float correction = 0.0f;
- if(flags & CURVE_KN_ACCURATE) {
- rootd = sqrtf(td);
- correction = ((-tb - rootd)/(2*a));
- }
-
- float t = tcentre + correction;
-
- if(t < isect->t) {
-
- if(flags & CURVE_KN_INTERSECTCORRECTION) {
- rootd = sqrtf(td);
- correction = ((-tb - rootd)/(2*a));
- t = tcentre + correction;
- }
-
- float z = zcentre + (dirz * correction);
- // bool backface = false;
-
- if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
- // backface = true;
- correction = ((-tb + rootd)/(2*a));
- t = tcentre + correction;
- z = zcentre + (dirz * correction);
- }
-
- /* stochastic fade from minimum width */
- float adjradius = or1 + z * (or2 - or1) * invl;
- adjradius = adjradius / (r1 + z * gd);
- if(lcg_state && adjradius != 1.0f) {
- if(lcg_step_float(lcg_state) > adjradius)
- return false;
- }
- /* --- */
-
- if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
-
- if(flags & CURVE_KN_ENCLOSEFILTER) {
- float enc_ratio = 1.01f;
- if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
- float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
- float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
- if(a2*c2 < 0.0f)
- return false;
- }
- }
-
-#ifdef __VISIBILITY_FLAG__
-			/* visibility flag test. we do it here under the assumption
-			 * that most primitives are culled by node flags */
- if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#endif
- {
- /* record intersection */
- isect->t = t;
- isect->u = z*invl;
- isect->v = gd;
- isect->prim = curveAddr;
- isect->object = object;
- isect->type = type;
-
- return true;
- }
- }
- }
-
- return false;
-
-#ifndef __KERNEL_SSE2__
-# undef len3_squared
-# undef len3
-# undef dot3
-#endif
-}
-
-ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
- float fc = 0.71f;
- float data[4];
- float t2 = t * t;
- data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc;
- data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t;
- data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc;
- data[3] = 3.0f * fc * t2 - 2.0f * fc * t;
- return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
- float data[4];
- float fc = 0.71f;
- float t2 = t * t;
- float t3 = t2 * t;
- data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t;
- data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f;
- data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t;
- data[3] = fc * t3 - fc * t2;
- return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
-{
- int flag = kernel_data.curve.curveflags;
- float t = isect->t;
- float3 P = ray->P;
- float3 D = ray->D;
-
- if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-#else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
- P = transform_point(&tfm, P);
- D = transform_direction(&tfm, D*t);
- D = normalize_len(D, &t);
- }
-
- int prim = kernel_tex_fetch(__prim_index, isect->prim);
- float4 v00 = kernel_tex_fetch(__curves, prim);
-
- int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
- int k1 = k0 + 1;
-
- float3 tg;
-
- if(flag & CURVE_KN_INTERPOLATE) {
- int ka = max(k0 - 1,__float_as_int(v00.x));
- int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
- float4 P_curve[4];
-
- if(sd->type & PRIMITIVE_CURVE) {
- P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
- P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
- P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
- P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
- }
- else {
- motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
- }
-
- float3 p[4];
- p[0] = float4_to_float3(P_curve[0]);
- p[1] = float4_to_float3(P_curve[1]);
- p[2] = float4_to_float3(P_curve[2]);
- p[3] = float4_to_float3(P_curve[3]);
-
- P = P + D*t;
-
-#ifdef __UV__
- sd->u = isect->u;
- sd->v = 0.0f;
-#endif
-
- tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
-
- if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
- sd->Ng = normalize(-(D - tg * (dot(tg, D))));
- }
- else {
- /* direction from inside to surface of curve */
- float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
- sd->Ng = normalize(P - p_curr);
-
- /* adjustment for changing radius */
- float gd = isect->v;
-
- if(gd != 0.0f) {
- sd->Ng = sd->Ng - gd * tg;
- sd->Ng = normalize(sd->Ng);
- }
- }
-
-		/* todo: sometimes the normal is oriented such that the curve is detected
-		 * as backfacing even when backface culling is enabled */
-
- sd->N = sd->Ng;
- }
- else {
- float4 P_curve[2];
-
- if(sd->type & PRIMITIVE_CURVE) {
- P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
- P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
- }
- else {
- motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
- }
-
- float l = 1.0f;
- tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l);
-
- P = P + D*t;
-
- float3 dif = P - float4_to_float3(P_curve[0]);
-
-#ifdef __UV__
- sd->u = dot(dif,tg)/l;
- sd->v = 0.0f;
-#endif
-
- if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
- sd->Ng = -(D - tg * dot(tg, D));
- sd->Ng = normalize(sd->Ng);
- }
- else {
- float gd = isect->v;
-
- /* direction from inside to surface of curve */
- sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
-
- /* adjustment for changing radius */
- if(gd != 0.0f) {
- sd->Ng = sd->Ng - gd * tg;
- sd->Ng = normalize(sd->Ng);
- }
- }
-
- sd->N = sd->Ng;
- }
-
-#ifdef __DPDU__
- /* dPdu/dPdv */
- sd->dPdu = tg;
- sd->dPdv = cross(tg, sd->Ng);
-#endif
-
- if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-#else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
- P = transform_point(&tfm, P);
- }
-
- return P;
-}
-
-#endif
+#endif /* __HAIR__ */
CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
new file mode 100644
index 00000000000..e9a149ea1ab
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -0,0 +1,934 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Curve primitive intersection functions. */
+
+#ifdef __HAIR__
+
+#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300)
+# define ccl_device_curveintersect ccl_device
+#else
+# define ccl_device_curveintersect ccl_device_forceinline
+#endif
+
+#ifdef __KERNEL_SSE2__
+ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
+{
+ return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2]));
+}
+#endif
+
+/* On the CPU, P and dir are passed by reference to aligned vectors. */
+ccl_device_curveintersect bool cardinal_curve_intersect(
+ KernelGlobals *kg,
+ Intersection *isect,
+ const float3 ccl_ref P,
+ const float3 ccl_ref dir,
+ uint visibility,
+ int object,
+ int curveAddr,
+ float time,
+ int type,
+ uint *lcg_state,
+ float difl,
+ float extmax)
+{
+ const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+
+ if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+ const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
+ if(time < prim_time.x || time > prim_time.y) {
+ return false;
+ }
+ }
+
+ int segment = PRIMITIVE_UNPACK_SEGMENT(type);
+ float epsilon = 0.0f;
+ float r_st, r_en;
+
+ int depth = kernel_data.curve.subdivisions;
+ int flags = kernel_data.curve.curveflags;
+ int prim = kernel_tex_fetch(__prim_index, curveAddr);
+
+#ifdef __KERNEL_SSE2__
+ ssef vdir = load4f(dir);
+ ssef vcurve_coef[4];
+ const float3 *curve_coef = (float3 *)vcurve_coef;
+
+ {
+ ssef dtmp = vdir * vdir;
+ ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));
+ ssef rd_ss = load1f_first(1.0f) / d_ss;
+
+ ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]);
+ int2 &v00 = (int2 &)v00vec;
+
+ int k0 = v00.x + segment;
+ int k1 = k0 + 1;
+ int ka = max(k0 - 1, v00.x);
+ int kb = min(k1 + 1, v00.x + v00.y - 1);
+
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
+ avxf P_curve_0_1, P_curve_2_3;
+ if(is_curve_primitive) {
+ P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);
+ P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);
+ }
+ else {
+ int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
+ motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3);
+ }
+#else /* __KERNEL_AVX2__ */
+ ssef P_curve[4];
+
+ if(is_curve_primitive) {
+ P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
+ P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
+ P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
+ P_curve[3] = load4f(&kg->__curve_keys.data[kb].x);
+ }
+ else {
+ int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+ motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
+ }
+#endif /* __KERNEL_AVX2__ */
+
+ ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
+ ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
+ ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy;
+ ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
+ ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
+
+ ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
+ ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
+ ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
+
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
+ const avxf vPP = _mm256_broadcast_ps(&P.m128);
+ const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
+ const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
+ const avxf htfm22 = avxf(htfm2.m128, htfm2.m128);
+
+ const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP),
+ htfm00,
+ madd(shuffle<1>(P_curve_0_1 - vPP),
+ htfm11,
+ shuffle<2>(P_curve_0_1 - vPP) * htfm22));
+ const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP),
+ htfm00,
+ madd(shuffle<1>(P_curve_2_3 - vPP),
+ htfm11,
+ shuffle<2>(P_curve_2_3 - vPP)*htfm22));
+
+ const ssef p0 = _mm256_castps256_ps128(p01);
+ const ssef p1 = _mm256_extractf128_ps(p01, 1);
+ const ssef p2 = _mm256_castps256_ps128(p23);
+ const ssef p3 = _mm256_extractf128_ps(p23, 1);
+
+ const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1);
+ r_st = ((float4 &)P_curve_1).w;
+ const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3);
+ r_en = ((float4 &)P_curve_2).w;
+#else /* __KERNEL_AVX2__ */
+ ssef htfm[] = { htfm0, htfm1, htfm2 };
+ ssef vP = load4f(P);
+ ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
+ ssef p1 = transform_point_T3(htfm, P_curve[1] - vP);
+ ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
+ ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
+
+ r_st = ((float4 &)P_curve[1]).w;
+ r_en = ((float4 &)P_curve[2]).w;
+#endif /* __KERNEL_AVX2__ */
+
+ float fc = 0.71f;
+ ssef vfc = ssef(fc);
+ ssef vfcxp3 = vfc * p3;
+
+ vcurve_coef[0] = p1;
+ vcurve_coef[1] = vfc * (p2 - p0);
+ vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
+ vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
+
+ }
+#else
+ float3 curve_coef[4];
+
+	/* curve intersection check */
+ /* obtain curve parameters */
+ {
+		/* Create the ray-space transform. Ideally this would be done once at the
+		 * beginning of the intersection loop. */
+ Transform htfm;
+ float d = sqrtf(dir.x * dir.x + dir.z * dir.z);
+ htfm = make_transform(
+ dir.z / d, 0, -dir.x /d, 0,
+ -dir.x * dir.y /d, d, -dir.y * dir.z /d, 0,
+ dir.x, dir.y, dir.z, 0,
+ 0, 0, 0, 1);
+
+ float4 v00 = kernel_tex_fetch(__curves, prim);
+
+ int k0 = __float_as_int(v00.x) + segment;
+ int k1 = k0 + 1;
+
+ int ka = max(k0 - 1,__float_as_int(v00.x));
+ int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
+
+ float4 P_curve[4];
+
+ if(is_curve_primitive) {
+ P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
+ P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
+ P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
+ P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+ }
+ else {
+ int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+ motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve);
+ }
+
+ float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P);
+ float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P);
+ float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P);
+ float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P);
+
+ float fc = 0.71f;
+ curve_coef[0] = p1;
+ curve_coef[1] = -fc*p0 + fc*p2;
+ curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
+ curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
+ r_st = P_curve[1].w;
+ r_en = P_curve[2].w;
+ }
+#endif
+
+ float r_curr = max(r_st, r_en);
+
+ if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
+ epsilon = 2 * r_curr;
+
+ /* find bounds - this is slow for cubic curves */
+ float upper, lower;
+
+ float zextrem[4];
+ curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
+ if(lower - r_curr > isect->t || upper + r_curr < epsilon)
+ return false;
+
+ /* minimum width extension */
+ float mw_extension = min(difl * fabsf(upper), extmax);
+ float r_ext = mw_extension + r_curr;
+
+ float xextrem[4];
+ curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
+ if(lower > r_ext || upper < -r_ext)
+ return false;
+
+ float yextrem[4];
+ curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
+ if(lower > r_ext || upper < -r_ext)
+ return false;
+
+	/* set up the iterative subdivision loop */
+ int level = 1 << depth;
+ int tree = 0;
+ float resol = 1.0f / (float)level;
+ bool hit = false;
+
+ /* begin loop */
+ while(!(tree >> (depth))) {
+ const float i_st = tree * resol;
+ const float i_en = i_st + (level * resol);
+
+#ifdef __KERNEL_SSE2__
+ ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
+ ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
+ ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
+
+ ssef vbmin = min(vp_st, vp_en);
+ ssef vbmax = max(vp_st, vp_en);
+
+ float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
+ float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
+ float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
+ float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
+#else
+ float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];
+ float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];
+
+ float bminx = min(p_st.x, p_en.x);
+ float bmaxx = max(p_st.x, p_en.x);
+ float bminy = min(p_st.y, p_en.y);
+ float bmaxy = max(p_st.y, p_en.y);
+ float bminz = min(p_st.z, p_en.z);
+ float bmaxz = max(p_st.z, p_en.z);
+#endif
+
+ if(xextrem[0] >= i_st && xextrem[0] <= i_en) {
+ bminx = min(bminx,xextrem[1]);
+ bmaxx = max(bmaxx,xextrem[1]);
+ }
+ if(xextrem[2] >= i_st && xextrem[2] <= i_en) {
+ bminx = min(bminx,xextrem[3]);
+ bmaxx = max(bmaxx,xextrem[3]);
+ }
+ if(yextrem[0] >= i_st && yextrem[0] <= i_en) {
+ bminy = min(bminy,yextrem[1]);
+ bmaxy = max(bmaxy,yextrem[1]);
+ }
+ if(yextrem[2] >= i_st && yextrem[2] <= i_en) {
+ bminy = min(bminy,yextrem[3]);
+ bmaxy = max(bmaxy,yextrem[3]);
+ }
+ if(zextrem[0] >= i_st && zextrem[0] <= i_en) {
+ bminz = min(bminz,zextrem[1]);
+ bmaxz = max(bmaxz,zextrem[1]);
+ }
+ if(zextrem[2] >= i_st && zextrem[2] <= i_en) {
+ bminz = min(bminz,zextrem[3]);
+ bmaxz = max(bmaxz,zextrem[3]);
+ }
+
+ float r1 = r_st + (r_en - r_st) * i_st;
+ float r2 = r_st + (r_en - r_st) * i_en;
+ r_curr = max(r1, r2);
+
+ mw_extension = min(difl * fabsf(bmaxz), extmax);
+ float r_ext = mw_extension + r_curr;
+ float coverage = 1.0f;
+
+ if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
+ /* the bounding box does not overlap the square centered at O */
+ tree += level;
+ level = tree & -tree;
+ }
+ else if(level == 1) {
+
+ /* the maximum recursion depth is reached.
+ * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
+ * dP* is reversed if necessary.*/
+ float t = isect->t;
+ float u = 0.0f;
+ float gd = 0.0f;
+
+ if(flags & CURVE_KN_RIBBONS) {
+ float3 tg = (p_en - p_st);
+#ifdef __KERNEL_SSE__
+ const float3 tg_sq = tg * tg;
+ float w = tg_sq.x + tg_sq.y;
+#else
+ float w = tg.x * tg.x + tg.y * tg.y;
+#endif
+ if(w == 0) {
+ tree++;
+ level = tree & -tree;
+ continue;
+ }
+#ifdef __KERNEL_SSE__
+ const float3 p_sttg = p_st * tg;
+ w = -(p_sttg.x + p_sttg.y) / w;
+#else
+ w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
+#endif
+ w = saturate(w);
+
+ /* compute u on the curve segment */
+ u = i_st * (1 - w) + i_en * w;
+ r_curr = r_st + (r_en - r_st) * u;
+ /* compare x-y distances */
+ float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
+
+ float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
+ if(dot(tg, dp_st)< 0)
+ dp_st *= -1;
+ if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
+ tree++;
+ level = tree & -tree;
+ continue;
+ }
+ float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
+ if(dot(tg, dp_en) < 0)
+ dp_en *= -1;
+ if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
+ tree++;
+ level = tree & -tree;
+ continue;
+ }
+
+ /* compute coverage */
+ float r_ext = r_curr;
+ coverage = 1.0f;
+ if(difl != 0.0f) {
+ mw_extension = min(difl * fabsf(bmaxz), extmax);
+ r_ext = mw_extension + r_curr;
+#ifdef __KERNEL_SSE__
+ const float3 p_curr_sq = p_curr * p_curr;
+ const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)));
+ float d = dxxx.x;
+#else
+ float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
+#endif
+ float d0 = d - r_curr;
+ float d1 = d + r_curr;
+ float inv_mw_extension = 1.0f/mw_extension;
+ if(d0 >= 0)
+ coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
+ else // inside
+ coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
+ }
+
+ if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
+ tree++;
+ level = tree & -tree;
+ continue;
+ }
+
+ t = p_curr.z;
+
+ /* stochastic fade from minimum width */
+ if(difl != 0.0f && lcg_state) {
+ if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
+ return hit;
+ }
+ }
+ else {
+ float l = len(p_en - p_st);
+ /* minimum width extension */
+ float or1 = r1;
+ float or2 = r2;
+
+ if(difl != 0.0f) {
+ mw_extension = min(len(p_st - P) * difl, extmax);
+ or1 = r1 < mw_extension ? mw_extension : r1;
+ mw_extension = min(len(p_en - P) * difl, extmax);
+ or2 = r2 < mw_extension ? mw_extension : r2;
+ }
+ /* --- */
+ float invl = 1.0f/l;
+ float3 tg = (p_en - p_st) * invl;
+ gd = (or2 - or1) * invl;
+ float difz = -dot(p_st,tg);
+ float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd));
+ float invcyla = 1.0f/cyla;
+ float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1)));
+ float tcentre = -halfb*invcyla;
+ float zcentre = difz + (tg.z * tcentre);
+ float3 tdif = - p_st;
+ tdif.z += tcentre;
+ float tdifz = dot(tdif,tg);
+ float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
+ float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
+ float td = tb*tb - 4*cyla*tc;
+ if(td < 0.0f) {
+ tree++;
+ level = tree & -tree;
+ continue;
+ }
+
+ float rootd = sqrtf(td);
+ float correction = (-tb - rootd) * 0.5f * invcyla;
+ t = tcentre + correction;
+
+ float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
+ if(dot(tg, dp_st)< 0)
+ dp_st *= -1;
+ float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
+ if(dot(tg, dp_en) < 0)
+ dp_en *= -1;
+
+ if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
+ correction = (-tb + rootd) * 0.5f * invcyla;
+ t = tcentre + correction;
+ }
+
+ if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
+ tree++;
+ level = tree & -tree;
+ continue;
+ }
+
+ float w = (zcentre + (tg.z * correction)) * invl;
+ w = saturate(w);
+ /* compute u on the curve segment */
+ u = i_st * (1 - w) + i_en * w;
+
+ /* stochastic fade from minimum width */
+ if(difl != 0.0f && lcg_state) {
+ r_curr = r1 + (r2 - r1) * w;
+ r_ext = or1 + (or2 - or1) * w;
+ coverage = r_curr/r_ext;
+
+ if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
+ return hit;
+ }
+ }
+ /* we found a new intersection */
+
+#ifdef __VISIBILITY_FLAG__
+			/* visibility flag test. we do it here under the assumption
+			 * that most primitives are culled by node flags */
+ if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
+#endif
+ {
+ /* record intersection */
+ isect->t = t;
+ isect->u = u;
+ isect->v = gd;
+ isect->prim = curveAddr;
+ isect->object = object;
+ isect->type = type;
+ hit = true;
+ }
+
+ tree++;
+ level = tree & -tree;
+ }
+ else {
+ /* split the curve into two curves and process */
+ level = level >> 1;
+ }
+ }
+
+ return hit;
+}
+
+ccl_device_curveintersect bool curve_intersect(KernelGlobals *kg,
+ Intersection *isect,
+ float3 P,
+ float3 direction,
+ uint visibility,
+ int object,
+ int curveAddr,
+ float time,
+ int type,
+ uint *lcg_state,
+ float difl,
+ float extmax)
+{
+ /* define few macros to minimize code duplication for SSE */
+#ifndef __KERNEL_SSE2__
+# define len3_squared(x) len_squared(x)
+# define len3(x) len(x)
+# define dot3(x, y) dot(x, y)
+#endif
+
+ const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+
+ if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+ const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
+ if(time < prim_time.x || time > prim_time.y) {
+ return false;
+ }
+ }
+
+ int segment = PRIMITIVE_UNPACK_SEGMENT(type);
+	/* curve intersection check */
+ int flags = kernel_data.curve.curveflags;
+
+ int prim = kernel_tex_fetch(__prim_index, curveAddr);
+ float4 v00 = kernel_tex_fetch(__curves, prim);
+
+ int cnum = __float_as_int(v00.x);
+ int k0 = cnum + segment;
+ int k1 = k0 + 1;
+
+#ifndef __KERNEL_SSE2__
+ float4 P_curve[2];
+
+ if(is_curve_primitive) {
+ P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
+ P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
+ }
+ else {
+ int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+ motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve);
+ }
+
+ float or1 = P_curve[0].w;
+ float or2 = P_curve[1].w;
+ float3 p1 = float4_to_float3(P_curve[0]);
+ float3 p2 = float4_to_float3(P_curve[1]);
+
+ /* minimum width extension */
+ float r1 = or1;
+ float r2 = or2;
+ float3 dif = P - p1;
+ float3 dif_second = P - p2;
+ if(difl != 0.0f) {
+ float pixelsize = min(len3(dif) * difl, extmax);
+ r1 = or1 < pixelsize ? pixelsize : or1;
+ pixelsize = min(len3(dif_second) * difl, extmax);
+ r2 = or2 < pixelsize ? pixelsize : or2;
+ }
+ /* --- */
+
+ float3 p21_diff = p2 - p1;
+ float3 sphere_dif1 = (dif + dif_second) * 0.5f;
+ float3 dir = direction;
+ float sphere_b_tmp = dot3(dir, sphere_dif1);
+ float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
+#else
+ ssef P_curve[2];
+
+ if(is_curve_primitive) {
+ P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
+ P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
+ }
+ else {
+ int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+ motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve);
+ }
+
+ const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
+
+ ssef r12 = or12;
+ const ssef vP = load4f(P);
+ const ssef dif = vP - P_curve[0];
+ const ssef dif_second = vP - P_curve[1];
+ if(difl != 0.0f) {
+ const ssef len1_sq = len3_squared_splat(dif);
+ const ssef len2_sq = len3_squared_splat(dif_second);
+ const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
+ const ssef pixelsize12 = min(len12 * difl, ssef(extmax));
+ r12 = max(or12, pixelsize12);
+ }
+ float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12));
+ float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12));
+
+ const ssef p21_diff = P_curve[1] - P_curve[0];
+ const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
+ const ssef dir = load4f(direction);
+ const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
+ const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1);
+#endif
+
+ float mr = max(r1, r2);
+ float l = len3(p21_diff);
+ float invl = 1.0f / l;
+ float sp_r = mr + 0.5f * l;
+
+ float sphere_b = dot3(dir, sphere_dif2);
+ float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
+
+ if(sdisc < 0.0f)
+ return false;
+
+ /* obtain parameters and test midpoint distance for suitable modes */
+#ifndef __KERNEL_SSE2__
+ float3 tg = p21_diff * invl;
+#else
+ const ssef tg = p21_diff * invl;
+#endif
+ float gd = (r2 - r1) * invl;
+
+ float dirz = dot3(dir, tg);
+ float difz = dot3(dif, tg);
+
+ float a = 1.0f - (dirz*dirz*(1 + gd*gd));
+
+ float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1));
+
+ float tcentre = -halfb/a;
+ float zcentre = difz + (dirz * tcentre);
+
+ if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
+ return false;
+ if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
+ return false;
+
+ /* test minimum separation */
+#ifndef __KERNEL_SSE2__
+ float3 cprod = cross(tg, dir);
+ float cprod2sq = len3_squared(cross(tg, dif));
+#else
+ const ssef cprod = cross(tg, dir);
+ float cprod2sq = len3_squared(cross_zxy(tg, dif));
+#endif
+ float cprodsq = len3_squared(cprod);
+ float distscaled = dot3(cprod, dif);
+
+ if(cprodsq == 0)
+ distscaled = cprod2sq;
+ else
+ distscaled = (distscaled*distscaled)/cprodsq;
+
+ if(distscaled > mr*mr)
+ return false;
+
+ /* calculate true intersection */
+#ifndef __KERNEL_SSE2__
+ float3 tdif = dif + tcentre * dir;
+#else
+ const ssef tdif = madd(ssef(tcentre), dir, dif);
+#endif
+ float tdifz = dot3(tdif, tg);
+ float tdifma = tdifz*gd + r1;
+ float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma));
+ float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
+ float td = tb*tb - 4*a*tc;
+
+ if(td < 0.0f)
+ return false;
+
+ float rootd = 0.0f;
+ float correction = 0.0f;
+ if(flags & CURVE_KN_ACCURATE) {
+ rootd = sqrtf(td);
+ correction = ((-tb - rootd)/(2*a));
+ }
+
+ float t = tcentre + correction;
+
+ if(t < isect->t) {
+
+ if(flags & CURVE_KN_INTERSECTCORRECTION) {
+ rootd = sqrtf(td);
+ correction = ((-tb - rootd)/(2*a));
+ t = tcentre + correction;
+ }
+
+ float z = zcentre + (dirz * correction);
+ // bool backface = false;
+
+ if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
+ // backface = true;
+ correction = ((-tb + rootd)/(2*a));
+ t = tcentre + correction;
+ z = zcentre + (dirz * correction);
+ }
+
+ /* stochastic fade from minimum width */
+ float adjradius = or1 + z * (or2 - or1) * invl;
+ adjradius = adjradius / (r1 + z * gd);
+ if(lcg_state && adjradius != 1.0f) {
+ if(lcg_step_float(lcg_state) > adjradius)
+ return false;
+ }
+ /* --- */
+
+ if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
+
+ if(flags & CURVE_KN_ENCLOSEFILTER) {
+ float enc_ratio = 1.01f;
+ if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
+ float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
+ float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
+ if(a2*c2 < 0.0f)
+ return false;
+ }
+ }
+
+#ifdef __VISIBILITY_FLAG__
+			/* visibility flag test. we do it here under the assumption
+			 * that most primitives are culled by node flags */
+ if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
+#endif
+ {
+ /* record intersection */
+ isect->t = t;
+ isect->u = z*invl;
+ isect->v = gd;
+ isect->prim = curveAddr;
+ isect->object = object;
+ isect->type = type;
+
+ return true;
+ }
+ }
+ }
+
+ return false;
+
+#ifndef __KERNEL_SSE2__
+# undef len3_squared
+# undef len3
+# undef dot3
+#endif
+}
+
+ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
+{
+ float fc = 0.71f;
+ float data[4];
+ float t2 = t * t;
+ data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc;
+ data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t;
+ data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc;
+ data[3] = 3.0f * fc * t2 - 2.0f * fc * t;
+ return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
+}
+
+ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
+{
+ float data[4];
+ float fc = 0.71f;
+ float t2 = t * t;
+ float t3 = t2 * t;
+ data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t;
+ data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f;
+ data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t;
+ data[3] = fc * t3 - fc * t2;
+ return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
+}
+
+ccl_device_inline float3 curve_refine(KernelGlobals *kg,
+ ShaderData *sd,
+ const Intersection *isect,
+ const Ray *ray)
+{
+ int flag = kernel_data.curve.curveflags;
+ float t = isect->t;
+ float3 P = ray->P;
+ float3 D = ray->D;
+
+ if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+ Transform tfm = sd->ob_itfm;
+#else
+ Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+ P = transform_point(&tfm, P);
+ D = transform_direction(&tfm, D*t);
+ D = normalize_len(D, &t);
+ }
+
+ int prim = kernel_tex_fetch(__prim_index, isect->prim);
+ float4 v00 = kernel_tex_fetch(__curves, prim);
+
+ int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+ int k1 = k0 + 1;
+
+ float3 tg;
+
+ if(flag & CURVE_KN_INTERPOLATE) {
+ int ka = max(k0 - 1,__float_as_int(v00.x));
+ int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
+
+ float4 P_curve[4];
+
+ if(sd->type & PRIMITIVE_CURVE) {
+ P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
+ P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
+ P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
+ P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+ }
+ else {
+ motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
+ }
+
+ float3 p[4];
+ p[0] = float4_to_float3(P_curve[0]);
+ p[1] = float4_to_float3(P_curve[1]);
+ p[2] = float4_to_float3(P_curve[2]);
+ p[3] = float4_to_float3(P_curve[3]);
+
+ P = P + D*t;
+
+#ifdef __UV__
+ sd->u = isect->u;
+ sd->v = 0.0f;
+#endif
+
+ tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
+
+ if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
+ sd->Ng = normalize(-(D - tg * (dot(tg, D))));
+ }
+ else {
+ /* direction from inside to surface of curve */
+ float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
+ sd->Ng = normalize(P - p_curr);
+
+ /* adjustment for changing radius */
+ float gd = isect->v;
+
+ if(gd != 0.0f) {
+ sd->Ng = sd->Ng - gd * tg;
+ sd->Ng = normalize(sd->Ng);
+ }
+ }
+
+		/* todo: sometimes the normal is oriented such that the curve is detected
+		 * as backfacing even when backface culling is enabled */
+
+ sd->N = sd->Ng;
+ }
+ else {
+ float4 P_curve[2];
+
+ if(sd->type & PRIMITIVE_CURVE) {
+ P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
+ P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
+ }
+ else {
+ motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+ }
+
+ float l = 1.0f;
+ tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l);
+
+ P = P + D*t;
+
+ float3 dif = P - float4_to_float3(P_curve[0]);
+
+#ifdef __UV__
+ sd->u = dot(dif,tg)/l;
+ sd->v = 0.0f;
+#endif
+
+ if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
+ sd->Ng = -(D - tg * dot(tg, D));
+ sd->Ng = normalize(sd->Ng);
+ }
+ else {
+ float gd = isect->v;
+
+ /* direction from inside to surface of curve */
+ sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
+
+ /* adjustment for changing radius */
+ if(gd != 0.0f) {
+ sd->Ng = sd->Ng - gd * tg;
+ sd->Ng = normalize(sd->Ng);
+ }
+ }
+
+ sd->N = sd->Ng;
+ }
+
+#ifdef __DPDU__
+ /* dPdu/dPdv */
+ sd->dPdu = tg;
+ sd->dPdv = cross(tg, sd->Ng);
+#endif
+
+ if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+ Transform tfm = sd->ob_tfm;
+#else
+ Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+ P = transform_point(&tfm, P);
+ }
+
+ return P;
+}
+
+#endif
+
+CCL_NAMESPACE_END
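The new curve_refine() above reconstructs the hit point and normal from the same fc = 0.71 cardinal basis that curvepoint() and curvetangent() evaluate. A minimal standalone sketch of that basis follows; it is illustrative only (Vec3 is a local stand-in for the kernel's float3 and the key positions are made up), and it checks that the tangent weights really are the derivative of the point weights via a central difference.

// cardinal_basis_sketch.cpp -- standalone illustration, not kernel code.
#include <cassert>
#include <cmath>
#include <cstdio>

struct Vec3 { float x, y, z; };

static Vec3 weighted_sum(const float w[4], const Vec3 p[4])
{
	Vec3 r = {0.0f, 0.0f, 0.0f};
	for(int i = 0; i < 4; i++) {
		r.x += w[i] * p[i].x;
		r.y += w[i] * p[i].y;
		r.z += w[i] * p[i].z;
	}
	return r;
}

/* Same weights as curvepoint(): a cardinal spline with constant fc = 0.71. */
static Vec3 curve_point(float t, const Vec3 p[4])
{
	const float fc = 0.71f, t2 = t * t, t3 = t2 * t;
	const float w[4] = {
		-fc * t3 + 2.0f * fc * t2 - fc * t,
		(2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f,
		(fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t,
		fc * t3 - fc * t2,
	};
	return weighted_sum(w, p);
}

/* Same weights as curvetangent(): the derivative of the point weights. */
static Vec3 curve_tangent(float t, const Vec3 p[4])
{
	const float fc = 0.71f, t2 = t * t;
	const float w[4] = {
		-3.0f * fc * t2 + 4.0f * fc * t - fc,
		3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t,
		3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc,
		3.0f * fc * t2 - 2.0f * fc * t,
	};
	return weighted_sum(w, p);
}

int main()
{
	const Vec3 keys[4] = {{0, 0, 0}, {1, 0, 0}, {2, 1, 0}, {3, 1, 0}};
	const float t = 0.25f, h = 1e-3f;

	/* Central difference of the point should match the analytic tangent. */
	Vec3 a = curve_point(t - h, keys), b = curve_point(t + h, keys);
	Vec3 tg = curve_tangent(t, keys);
	assert(std::fabs((b.x - a.x) / (2.0f * h) - tg.x) < 1e-2f);
	assert(std::fabs((b.y - a.y) / (2.0f * h) - tg.y) < 1e-2f);

	Vec3 pt = curve_point(t, keys);
	printf("P(%.2f) = (%.3f, %.3f, %.3f)\n", t, pt.x, pt.y, pt.z);
	return 0;
}

At t = 0 the weights reduce to (0, 1, 0, 0) and at t = 1 to (0, 0, 1, 0), so each segment passes exactly through its two middle keys k0 and k1; the outer keys ka and kb influence the shape in between but not the endpoints, which is why curve_refine() can simply clamp them at the ends of the curve.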
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index 6ecdfe0173a..1ffc143be34 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -415,12 +415,7 @@ ccl_device_inline float3 bvh_clamp_direction(float3 dir)
ccl_device_inline float3 bvh_inverse_direction(float3 dir)
{
- /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
return rcp(dir);
-#else
- return 1.0f / dir;
-#endif
}
/* Transform ray into object space to enter static object in BVH */
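The hunk above makes bvh_inverse_direction() return rcp(dir) unconditionally; the comment it removes had kept the SSE path disabled over precision concerns. As a rough, hedged illustration only (this is not necessarily how the kernel's rcp() is defined), approximate reciprocal instructions are normally paired with one Newton-Raphson refinement step, which recovers most of the precision lost by the estimate:

// fast_rcp_sketch.cpp -- standalone illustration, not kernel code.
#include <cstdio>
#include <xmmintrin.h>

static inline __m128 fast_rcp(__m128 a)
{
	__m128 r = _mm_rcp_ps(a);  /* ~12-bit reciprocal estimate */
	/* One Newton-Raphson step for f(x) = 1/x - a: r' = r * (2 - a * r). */
	return _mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(a, r)));
}

int main()
{
	__m128 d = _mm_set_ps(8.0f, 4.0f, 2.0f, 3.0f);
	float out[4];
	_mm_storeu_ps(out, fast_rcp(d));
	printf("1/3 ~ %.7f, 1/2 ~ %.7f\n", out[0], out[1]);
	return 0;
}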
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 1e0ef5201c9..698cd6b03fd 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -35,10 +35,10 @@ ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z)
float4 r;
switch(id) {
case 0: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_000, x, y, z); break;
- case 1: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_001, x, y, z); break;
- case 2: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_002, x, y, z); break;
- case 3: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_003, x, y, z); break;
- case 4: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_004, x, y, z); break;
+ case 8: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_008, x, y, z); break;
+ case 16: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_016, x, y, z); break;
+ case 24: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_024, x, y, z); break;
+ case 32: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_032, x, y, z); break;
}
return r;
}
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 175bd6b9737..ae5f6e5e070 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -21,6 +21,9 @@ CCL_NAMESPACE_BEGIN
* BSDF evaluation result, split per BSDF type. This is used to accumulate
* render passes separately. */
+ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg,
+ const ShaderData *sd);
+
ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 value, int use_light_pass)
{
#ifdef __PASSES__
@@ -178,7 +181,6 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
if(use_light_pass) {
L->indirect = make_float3(0.0f, 0.0f, 0.0f);
- L->direct_throughput = make_float3(0.0f, 0.0f, 0.0f);
L->direct_emission = make_float3(0.0f, 0.0f, 0.0f);
L->color_diffuse = make_float3(0.0f, 0.0f, 0.0f);
@@ -199,57 +201,78 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f);
- L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f);
- L->path_glossy = make_float3(0.0f, 0.0f, 0.0f);
- L->path_transmission = make_float3(0.0f, 0.0f, 0.0f);
- L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f);
- L->path_scatter = make_float3(0.0f, 0.0f, 0.0f);
-
+ L->transparent = 0.0f;
L->emission = make_float3(0.0f, 0.0f, 0.0f);
L->background = make_float3(0.0f, 0.0f, 0.0f);
L->ao = make_float3(0.0f, 0.0f, 0.0f);
L->shadow = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
L->mist = 0.0f;
+
+ L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f);
+ L->state.glossy = make_float3(0.0f, 0.0f, 0.0f);
+ L->state.transmission = make_float3(0.0f, 0.0f, 0.0f);
+ L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f);
+ L->state.scatter = make_float3(0.0f, 0.0f, 0.0f);
+ L->state.direct = make_float3(0.0f, 0.0f, 0.0f);
}
else
#endif
{
+ L->transparent = 0.0f;
L->emission = make_float3(0.0f, 0.0f, 0.0f);
}
#ifdef __SHADOW_TRICKS__
L->path_total = make_float3(0.0f, 0.0f, 0.0f);
L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
- L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
+ L->shadow_background_color = make_float3(0.0f, 0.0f, 0.0f);
+ L->shadow_throughput = 0.0f;
+ L->shadow_transparency = 1.0f;
+ L->has_shadow_catcher = 0;
#endif
#ifdef __DENOISING_FEATURES__
L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f);
L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f);
L->denoising_depth = 0.0f;
-#endif /* __DENOISING_FEATURES__ */
+#endif
+
+#ifdef __KERNEL_DEBUG__
+ L->debug_data.num_bvh_traversed_nodes = 0;
+ L->debug_data.num_bvh_traversed_instances = 0;
+ L->debug_data.num_bvh_intersections = 0;
+ L->debug_data.num_ray_bounces = 0;
+#endif
}
-ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
- BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label)
+ccl_device_inline void path_radiance_bsdf_bounce(
+ KernelGlobals *kg,
+ PathRadianceState *L_state,
+ ccl_addr_space float3 *throughput,
+ BsdfEval *bsdf_eval,
+ float bsdf_pdf, int bounce, int bsdf_label)
{
float inverse_pdf = 1.0f/bsdf_pdf;
#ifdef __PASSES__
- if(L->use_light_pass) {
+ if(kernel_data.film.use_light_pass) {
if(bounce == 0 && !(bsdf_label & LABEL_TRANSPARENT)) {
/* first on directly visible surface */
float3 value = *throughput*inverse_pdf;
- L->path_diffuse = bsdf_eval->diffuse*value;
- L->path_glossy = bsdf_eval->glossy*value;
- L->path_transmission = bsdf_eval->transmission*value;
- L->path_subsurface = bsdf_eval->subsurface*value;
- L->path_scatter = bsdf_eval->scatter*value;
-
- *throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface + L->path_scatter;
+ L_state->diffuse = bsdf_eval->diffuse*value;
+ L_state->glossy = bsdf_eval->glossy*value;
+ L_state->transmission = bsdf_eval->transmission*value;
+ L_state->subsurface = bsdf_eval->subsurface*value;
+ L_state->scatter = bsdf_eval->scatter*value;
+
+ *throughput = L_state->diffuse +
+ L_state->glossy +
+ L_state->transmission +
+ L_state->subsurface +
+ L_state->scatter;
- L->direct_throughput = *throughput;
+ L_state->direct = *throughput;
}
else {
/* transparent bounce before first hit, or indirectly visible through BSDF */
@@ -264,13 +287,22 @@ ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space
}
}
-ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 throughput, float3 value, int bounce)
+ccl_device_inline void path_radiance_accum_emission(PathRadiance *L,
+ ccl_addr_space PathState *state,
+ float3 throughput,
+ float3 value)
{
+#ifdef __SHADOW_TRICKS__
+ if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+ return;
+ }
+#endif
+
#ifdef __PASSES__
if(L->use_light_pass) {
- if(bounce == 0)
+ if(state->bounce == 0)
L->emission += throughput*value;
- else if(bounce == 1)
+ else if(state->bounce == 1)
L->direct_emission += throughput*value;
else
L->indirect += throughput*value;
@@ -289,6 +321,18 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
float3 bsdf,
float3 ao)
{
+#ifdef __SHADOW_TRICKS__
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ float3 light = throughput * bsdf;
+ L->path_total += light;
+ L->path_total_shaded += ao * light;
+
+ if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+ return;
+ }
+ }
+#endif
+
#ifdef __PASSES__
if(L->use_light_pass) {
if(state->bounce == 0) {
@@ -306,14 +350,6 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
{
L->emission += throughput*bsdf*ao;
}
-
-#ifdef __SHADOW_TRICKS__
- if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- float3 light = throughput * bsdf;
- L->path_total += light;
- L->path_total_shaded += ao * light;
- }
-#endif
}
ccl_device_inline void path_radiance_accum_total_ao(
@@ -342,6 +378,18 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
float shadow_fac,
bool is_lamp)
{
+#ifdef __SHADOW_TRICKS__
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ float3 light = throughput * bsdf_eval->sum_no_mis;
+ L->path_total += light;
+ L->path_total_shaded += shadow * light;
+
+ if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+ return;
+ }
+ }
+#endif
+
#ifdef __PASSES__
if(L->use_light_pass) {
if(state->bounce == 0) {
@@ -368,14 +416,6 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
{
L->emission += throughput*bsdf_eval->diffuse*shadow;
}
-
-#ifdef __SHADOW_TRICKS__
- if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- float3 light = throughput * bsdf_eval->sum_no_mis;
- L->path_total += light;
- L->path_total_shaded += shadow * light;
- }
-#endif
}
ccl_device_inline void path_radiance_accum_total_light(
@@ -396,11 +436,24 @@ ccl_device_inline void path_radiance_accum_total_light(
#endif
}
-ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 value)
+ccl_device_inline void path_radiance_accum_background(
+ PathRadiance *L,
+ ccl_addr_space PathState *state,
+ float3 throughput,
+ float3 value)
{
+
+#ifdef __SHADOW_TRICKS__
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ L->path_total += throughput * value;
+ L->path_total_shaded += throughput * value * L->shadow_transparency;
+
+ if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+ return;
+ }
+ }
+#endif
+
#ifdef __PASSES__
if(L->use_light_pass) {
if(state->bounce == 0)
@@ -416,20 +469,31 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
L->emission += throughput*value;
}
-#ifdef __SHADOW_TRICKS__
- if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- L->path_total += throughput * value;
- if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
- L->path_total_shaded += throughput * value;
- }
- }
-#endif
-
#ifdef __DENOISING_FEATURES__
L->denoising_albedo += state->denoising_feature_weight * value;
#endif /* __DENOISING_FEATURES__ */
}
+ccl_device_inline void path_radiance_accum_transparent(
+ PathRadiance *L,
+ ccl_addr_space PathState *state,
+ float3 throughput)
+{
+ L->transparent += average(throughput);
+}
+
+#ifdef __SHADOW_TRICKS__
+ccl_device_inline void path_radiance_accum_shadowcatcher(
+ PathRadiance *L,
+ float3 throughput,
+ float3 background)
+{
+ L->shadow_throughput += average(throughput);
+ L->shadow_background_color += throughput * background;
+ L->has_shadow_catcher = 1;
+}
+#endif
+
ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
{
#ifdef __PASSES__
@@ -437,19 +501,19 @@ ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
* only a single throughput further along the path, here we recover just
* the indirect path that is not influenced by any particular BSDF type */
if(L->use_light_pass) {
- L->direct_emission = safe_divide_color(L->direct_emission, L->direct_throughput);
- L->direct_diffuse += L->path_diffuse*L->direct_emission;
- L->direct_glossy += L->path_glossy*L->direct_emission;
- L->direct_transmission += L->path_transmission*L->direct_emission;
- L->direct_subsurface += L->path_subsurface*L->direct_emission;
- L->direct_scatter += L->path_scatter*L->direct_emission;
-
- L->indirect = safe_divide_color(L->indirect, L->direct_throughput);
- L->indirect_diffuse += L->path_diffuse*L->indirect;
- L->indirect_glossy += L->path_glossy*L->indirect;
- L->indirect_transmission += L->path_transmission*L->indirect;
- L->indirect_subsurface += L->path_subsurface*L->indirect;
- L->indirect_scatter += L->path_scatter*L->indirect;
+ L->direct_emission = safe_divide_color(L->direct_emission, L->state.direct);
+ L->direct_diffuse += L->state.diffuse*L->direct_emission;
+ L->direct_glossy += L->state.glossy*L->direct_emission;
+ L->direct_transmission += L->state.transmission*L->direct_emission;
+ L->direct_subsurface += L->state.subsurface*L->direct_emission;
+ L->direct_scatter += L->state.scatter*L->direct_emission;
+
+ L->indirect = safe_divide_color(L->indirect, L->state.direct);
+ L->indirect_diffuse += L->state.diffuse*L->indirect;
+ L->indirect_glossy += L->state.glossy*L->indirect;
+ L->indirect_transmission += L->state.transmission*L->indirect;
+ L->indirect_subsurface += L->state.subsurface*L->indirect;
+ L->indirect_scatter += L->state.scatter*L->indirect;
}
#endif
}
@@ -458,11 +522,11 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
{
#ifdef __PASSES__
if(L->use_light_pass) {
- L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f);
- L->path_glossy = make_float3(0.0f, 0.0f, 0.0f);
- L->path_transmission = make_float3(0.0f, 0.0f, 0.0f);
- L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f);
- L->path_scatter = make_float3(0.0f, 0.0f, 0.0f);
+ L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f);
+ L->state.glossy = make_float3(0.0f, 0.0f, 0.0f);
+ L->state.transmission = make_float3(0.0f, 0.0f, 0.0f);
+ L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f);
+ L->state.scatter = make_float3(0.0f, 0.0f, 0.0f);
L->direct_emission = make_float3(0.0f, 0.0f, 0.0f);
L->indirect = make_float3(0.0f, 0.0f, 0.0f);
@@ -475,11 +539,7 @@ ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L,
{
#ifdef __PASSES__
if(L->use_light_pass) {
- L->path_diffuse = L_src->path_diffuse;
- L->path_glossy = L_src->path_glossy;
- L->path_transmission = L_src->path_transmission;
- L->path_subsurface = L_src->path_subsurface;
- L->path_scatter = L_src->path_scatter;
+ L->state = L_src->state;
L->direct_emission = L_src->direct_emission;
L->indirect = L_src->indirect;
@@ -487,7 +547,36 @@ ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L,
#endif
}
-ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L)
+#ifdef __SHADOW_TRICKS__
+ccl_device_inline void path_radiance_sum_shadowcatcher(KernelGlobals *kg,
+ PathRadiance *L,
+ float3 *L_sum,
+ float *alpha)
+{
+ /* Calculate current shadow of the path. */
+ float path_total = average(L->path_total);
+ float shadow;
+
+ if(path_total == 0.0f) {
+ shadow = L->shadow_transparency;
+ }
+ else {
+ float path_total_shaded = average(L->path_total_shaded);
+ shadow = path_total_shaded / path_total;
+ }
+
+ /* Calculate final light sum and transparency for shadow catcher object. */
+ if(kernel_data.background.transparent) {
+ *alpha -= L->shadow_throughput * shadow;
+ }
+ else {
+ L->shadow_background_color *= shadow;
+ *L_sum += L->shadow_background_color;
+ }
+}
+#endif
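A minimal numeric sketch of the catcher math above (illustrative only, not part of the patch), assuming a shadow_throughput of 1.0:

    float path_total        = 0.8f;   /* average light reaching the catcher, ignoring blockers */
    float path_total_shaded = 0.2f;   /* the same light with blockers considered */
    float shadow = path_total_shaded / path_total;        /* 0.25: a quarter of the light survives */

    float alpha = 1.0f;                                   /* transparent film */
    alpha -= 1.0f * shadow;                               /* 0.75: the shadow reads as 75% opaque */

    float3 shadow_background_color = make_float3(0.5f, 0.5f, 0.5f);
    float3 L_sum = shadow_background_color * shadow;      /* opaque film: background dimmed to 25% under the catcher */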
+
+ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L, float *alpha)
{
float3 L_sum;
/* Light Passes are used */
@@ -564,8 +653,6 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
L_sum = L_direct + L_indirect;
}
#endif
-
- return L_sum;
}
/* No Light Passes */
@@ -573,14 +660,24 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
#endif
{
L_sum = L->emission;
+
+ /* Reject invalid value */
+ float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
+ if(!isfinite_safe(sum)) {
+ kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!");
+ L_sum = make_float3(0.0f, 0.0f, 0.0f);
+ }
}
- /* Reject invalid value */
- float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
- if(!isfinite_safe(sum)) {
- kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!");
- L_sum = make_float3(0.0f, 0.0f, 0.0f);
+ /* Compute alpha. */
+ *alpha = 1.0f - L->transparent;
+
+ /* Add shadow catcher contributions. */
+#ifdef __SHADOW_TRICKS__
+ if(L->has_shadow_catcher) {
+ path_radiance_sum_shadowcatcher(kg, L, &L_sum, alpha);
}
+#endif /* __SHADOW_TRICKS__ */
return L_sum;
}
@@ -613,14 +710,18 @@ ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadi
*clean = make_float3(0.0f, 0.0f, 0.0f);
#endif
+#ifdef __SHADOW_TRICKS__
+ if(L->has_shadow_catcher) {
+ *noisy += L->shadow_background_color;
+ }
+#endif
+
*noisy = ensure_finite3(*noisy);
*clean = ensure_finite3(*clean);
}
-ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
+ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample)
{
- float fac = 1.0f/num_samples;
-
#ifdef __SPLIT_KERNEL__
# define safe_float3_add(f, v) \
do { \
@@ -629,65 +730,35 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance
atomic_add_and_fetch_float(p+1, (v).y); \
atomic_add_and_fetch_float(p+2, (v).z); \
} while(0)
+# define safe_float_add(f, v) \
+ atomic_add_and_fetch_float(&(f), (v))
#else
# define safe_float3_add(f, v) (f) += (v)
+# define safe_float_add(f, v) (f) += (v)
#endif /* __SPLIT_KERNEL__ */
#ifdef __PASSES__
- safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse*fac);
- safe_float3_add(L->direct_glossy, L_sample->direct_glossy*fac);
- safe_float3_add(L->direct_transmission, L_sample->direct_transmission*fac);
- safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface*fac);
- safe_float3_add(L->direct_scatter, L_sample->direct_scatter*fac);
-
- safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse*fac);
- safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy*fac);
- safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission*fac);
- safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface*fac);
- safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter*fac);
-
- safe_float3_add(L->background, L_sample->background*fac);
- safe_float3_add(L->ao, L_sample->ao*fac);
- safe_float3_add(L->shadow, L_sample->shadow*fac);
-# ifdef __SPLIT_KERNEL__
- atomic_add_and_fetch_float(&L->mist, L_sample->mist*fac);
-# else
- L->mist += L_sample->mist*fac;
-# endif /* __SPLIT_KERNEL__ */
+ safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse);
+ safe_float3_add(L->direct_glossy, L_sample->direct_glossy);
+ safe_float3_add(L->direct_transmission, L_sample->direct_transmission);
+ safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface);
+ safe_float3_add(L->direct_scatter, L_sample->direct_scatter);
+
+ safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse);
+ safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy);
+ safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission);
+ safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface);
+ safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter);
+
+ safe_float3_add(L->background, L_sample->background);
+ safe_float3_add(L->ao, L_sample->ao);
+ safe_float3_add(L->shadow, L_sample->shadow);
+ safe_float_add(L->mist, L_sample->mist);
#endif /* __PASSES__ */
- safe_float3_add(L->emission, L_sample->emission*fac);
+ safe_float3_add(L->emission, L_sample->emission);
+#undef safe_float_add
#undef safe_float3_add
}
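A sketch of why the split kernel takes the atomic path (illustrative; the exact macro body is partly outside this hunk): several work items may accumulate samples into the same PathRadiance, so each float3 is added one component at a time with atomic_add_and_fetch_float, assumed here to atomically add a float in global memory.

    /* Hypothetical expansion of safe_float3_add(L->emission, v) under __SPLIT_KERNEL__;
     * the global address space is an assumption. */
    ccl_global float *p = (ccl_global float*)&L->emission;
    atomic_add_and_fetch_float(p + 0, v.x);   /* per-component atomic adds, */
    atomic_add_and_fetch_float(p + 1, v.y);   /* so concurrent samples cannot lose updates */
    atomic_add_and_fetch_float(p + 2, v.z);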
-#ifdef __SHADOW_TRICKS__
-/* Calculate current shadow of the path. */
-ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L)
-{
- float path_total = average(L->path_total);
- float path_total_shaded = average(L->path_total_shaded);
- if(path_total != 0.0f) {
- return path_total_shaded / path_total;
- }
- return 1.0f;
-}
-
-/* Calculate final light sum and transparency for shadow catcher object. */
-ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
- const PathRadiance *L,
- float* alpha)
-{
- const float shadow = path_radiance_sum_shadow(L);
- float3 L_sum;
- if(kernel_data.background.transparent) {
- *alpha = 1.0f-shadow;
- L_sum = make_float3(0.0f, 0.0f, 0.0f);
- }
- else {
- L_sum = L->shadow_color * shadow;
- }
- return L_sum;
-}
-#endif
-
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index f18d145f7cf..4d89839c46c 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline void compute_light_pass(KernelGlobals *kg,
ShaderData *sd,
PathRadiance *L,
- RNG rng,
+ uint rng_hash,
int pass_filter,
int sample)
{
@@ -48,11 +48,10 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
/* init path state */
- path_state_init(kg, &emission_sd, &state, &rng, sample, NULL);
+ path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL);
/* evaluate surface shader */
- float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
- shader_eval_surface(kg, sd, &rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
+ shader_eval_surface(kg, sd, &state, state.flag);
/* TODO, disable more closures we don't need besides transparent */
shader_bsdf_disable_transparency(kg, sd);
@@ -64,13 +63,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
/* sample ambient occlusion */
if(pass_filter & BAKE_FILTER_AO) {
- kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput, shader_bsdf_alpha(kg, sd));
+ kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput, shader_bsdf_alpha(kg, sd));
}
/* sample emission */
if((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
- path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+ path_radiance_accum_emission(&L_sample, &state, throughput, emission);
}
bool is_sss_sample = false;
@@ -86,7 +85,6 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
&emission_sd,
&L_sample,
&state,
- &rng,
&ray,
&throughput,
&ss_indirect))
@@ -101,13 +99,10 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
kernel_path_indirect(kg,
&indirect_sd,
&emission_sd,
- &rng,
&ray,
throughput,
- state.num_samples,
&state,
&L_sample);
- kernel_path_subsurface_accum_indirect(&ss_indirect, &L_sample);
}
is_sss_sample = true;
}
@@ -116,14 +111,14 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
/* sample light and BSDF */
if(!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) {
- kernel_path_surface_connect_light(kg, &rng, sd, &emission_sd, throughput, &state, &L_sample);
+ kernel_path_surface_connect_light(kg, sd, &emission_sd, throughput, &state, &L_sample);
- if(kernel_path_surface_bounce(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
+ if(kernel_path_surface_bounce(kg, sd, &throughput, &state, &L_sample.state, &ray)) {
#ifdef __LAMP_MIS__
state.ray_t = 0.0f;
#endif
/* compute indirect light */
- kernel_path_indirect(kg, &indirect_sd, &emission_sd, &rng, &ray, throughput, 1, &state, &L_sample);
+ kernel_path_indirect(kg, &indirect_sd, &emission_sd, &ray, throughput, &state, &L_sample);
/* sum and reset indirect light pass variables for the next samples */
path_radiance_sum_indirect(&L_sample);
@@ -137,13 +132,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
/* sample ambient occlusion */
if(pass_filter & BAKE_FILTER_AO) {
- kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput);
+ kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput);
}
/* sample emission */
if((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
- path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+ path_radiance_accum_emission(&L_sample, &state, throughput, emission);
}
#ifdef __SUBSURFACE__
@@ -151,7 +146,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
if((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) {
/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
kernel_branched_path_subsurface_scatter(kg, sd, &indirect_sd,
- &emission_sd, &L_sample, &state, &rng, &ray, throughput);
+ &emission_sd, &L_sample, &state, &ray, throughput);
}
#endif
@@ -161,20 +156,20 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
/* direct light */
if(kernel_data.integrator.use_direct_light) {
int all = kernel_data.integrator.sample_all_lights_direct;
- kernel_branched_path_surface_connect_light(kg, &rng,
+ kernel_branched_path_surface_connect_light(kg,
sd, &emission_sd, &state, throughput, 1.0f, &L_sample, all);
}
#endif
/* indirect light */
- kernel_branched_path_surface_indirect_light(kg, &rng,
+ kernel_branched_path_surface_indirect_light(kg,
sd, &indirect_sd, &emission_sd, throughput, 1.0f, &state, &L_sample);
}
}
#endif
/* accumulate into master L */
- path_radiance_accum_sample(L, &L_sample, 1);
+ path_radiance_accum_sample(L, &L_sample);
}
ccl_device bool is_aa_pass(ShaderEvalType type)
@@ -225,7 +220,6 @@ ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg,
ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
ShaderData *sd,
- RNG *rng,
PathState *state,
float3 direct,
float3 indirect,
@@ -245,12 +239,12 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
}
else {
/* surface color of the pass only */
- shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN);
+ shader_eval_surface(kg, sd, state, 0);
return kernel_bake_shader_bsdf(kg, sd, type);
}
}
else {
- shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN);
+ shader_eval_surface(kg, sd, state, 0);
color = kernel_bake_shader_bsdf(kg, sd, type);
}
@@ -292,14 +286,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
int num_samples = kernel_data.integrator.aa_samples;
/* random number generator */
- RNG rng = cmj_hash(offset + i, kernel_data.integrator.seed);
+ uint rng_hash = cmj_hash(offset + i, kernel_data.integrator.seed);
float filter_x, filter_y;
if(sample == 0) {
filter_x = filter_y = 0.5f;
}
else {
- path_rng_2D(kg, &rng, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
+ path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
}
/* subpixel u/v offset */
@@ -335,18 +329,18 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
/* light passes if we need more than color */
if(pass_filter & ~BAKE_FILTER_COLOR)
- compute_light_pass(kg, &sd, &L, rng, pass_filter, sample);
+ compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample);
switch(type) {
/* data passes */
case SHADER_EVAL_NORMAL:
{
if((sd.flag & SD_HAS_BUMP)) {
- shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_MAIN);
+ shader_eval_surface(kg, &sd, &state, 0);
}
- /* compression: normal = (2 * color) - 1 */
- out = sd.N * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
+ /* encoding: normal = (2 * color) - 1 */
+ out = shader_bsdf_average_normal(kg, &sd) * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
break;
}
case SHADER_EVAL_UV:
@@ -356,7 +350,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
}
case SHADER_EVAL_EMISSION:
{
- shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_EMISSION);
+ shader_eval_surface(kg, &sd, &state, 0);
out = shader_emissive_eval(kg, &sd);
break;
}
@@ -371,7 +365,8 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
case SHADER_EVAL_COMBINED:
{
if((pass_filter & BAKE_FILTER_COMBINED) == BAKE_FILTER_COMBINED) {
- out = path_radiance_clamp_and_sum(kg, &L);
+ float alpha;
+ out = path_radiance_clamp_and_sum(kg, &L, &alpha);
break;
}
@@ -409,7 +404,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
{
out = kernel_bake_evaluate_direct_indirect(kg,
&sd,
- &rng,
&state,
L.direct_diffuse,
L.indirect_diffuse,
@@ -421,7 +415,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
{
out = kernel_bake_evaluate_direct_indirect(kg,
&sd,
- &rng,
&state,
L.direct_glossy,
L.indirect_glossy,
@@ -433,7 +426,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
{
out = kernel_bake_evaluate_direct_indirect(kg,
&sd,
- &rng,
&state,
L.direct_transmission,
L.indirect_transmission,
@@ -446,7 +438,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
#ifdef __SUBSURFACE__
out = kernel_bake_evaluate_direct_indirect(kg,
&sd,
- &rng,
&state,
L.direct_subsurface,
L.indirect_subsurface,
@@ -480,7 +471,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
/* evaluate */
int flag = 0; /* we can't know which type of BSDF this is for */
- out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN);
+ out = shader_eval_background(kg, &sd, &state, flag);
break;
}
default:
@@ -524,7 +515,7 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg,
/* evaluate */
float3 P = sd.P;
- shader_eval_displacement(kg, &sd, &state, SHADER_CONTEXT_MAIN);
+ shader_eval_displacement(kg, &sd, &state);
out = sd.P - P;
object_inverse_dir_transform(kg, &sd, &out);
@@ -552,7 +543,7 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg,
/* evaluate */
int flag = 0; /* we can't know which type of BSDF this is for */
- out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN);
+ out = shader_eval_background(kg, &sd, &state, flag);
}
/* write output */
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 38708f7ff0b..1e2af9de8b3 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -38,11 +38,15 @@
/* Qualifier wrappers for different names on different devices */
#define ccl_device __device__ __inline__
+#if __CUDA_ARCH__ < 300
+# define ccl_device_inline __device__ __inline__
# define ccl_device_forceinline __device__ __forceinline__
-#if __CUDA_ARCH__ < 500
+#elif __CUDA_ARCH__ < 500
# define ccl_device_inline __device__ __forceinline__
+# define ccl_device_forceinline __device__ __forceinline__
#else
# define ccl_device_inline __device__ __inline__
+# define ccl_device_forceinline __device__ __forceinline__
#endif
#define ccl_device_noinline __device__ __noinline__
#define ccl_global
@@ -53,6 +57,10 @@
#define ccl_may_alias
#define ccl_addr_space
#define ccl_restrict __restrict__
+/* TODO(sergey): In theory we might use references with CUDA, however
+ * the performance impact is yet to be investigated.
+ */
+#define ccl_ref
#define ccl_align(n) __align__(n)
#define ATTR_FALLTHROUGH
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 4836c290312..36d6031d042 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -42,6 +42,7 @@
#define ccl_local_param __local
#define ccl_private __private
#define ccl_restrict restrict
+#define ccl_ref
#define ccl_align(n) __attribute__((aligned(n)))
#ifdef __SPLIT_KERNEL__
@@ -129,6 +130,7 @@
# define expf(x) native_exp(((float)(x)))
# define sqrtf(x) native_sqrt(((float)(x)))
# define logf(x) native_log(((float)(x)))
+# define rcp(x) native_recip(x)
#else
# define sinf(x) sin(((float)(x)))
# define cosf(x) cos(((float)(x)))
@@ -136,11 +138,12 @@
# define expf(x) exp(((float)(x)))
# define sqrtf(x) sqrt(((float)(x)))
# define logf(x) log(((float)(x)))
+# define rcp(x) recip(x)
#endif
/* data lookup defines */
#define kernel_data (*kg->data)
-#define kernel_tex_fetch(t, index) kg->t[index]
+#define kernel_tex_fetch(tex, index) ((ccl_global tex##_t*)(kg->buffers[kg->tex.buffer] + kg->tex.offset))[(index)]
/* define NULL */
#define NULL 0
diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h
deleted file mode 100644
index 5647bbae5b5..00000000000
--- a/intern/cycles/kernel/kernel_debug.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright 2011-2014 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void debug_data_init(DebugData *debug_data)
-{
- debug_data->num_bvh_traversed_nodes = 0;
- debug_data->num_bvh_traversed_instances = 0;
- debug_data->num_bvh_intersections = 0;
- debug_data->num_ray_bounces = 0;
-}
-
-ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
- ccl_global float *buffer,
- ccl_addr_space PathState *state,
- DebugData *debug_data,
- int sample)
-{
- int flag = kernel_data.film.pass_flag;
- if(flag & PASS_BVH_TRAVERSED_NODES) {
- kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes,
- sample,
- debug_data->num_bvh_traversed_nodes);
- }
- if(flag & PASS_BVH_TRAVERSED_INSTANCES) {
- kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances,
- sample,
- debug_data->num_bvh_traversed_instances);
- }
- if(flag & PASS_BVH_INTERSECTIONS) {
- kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections,
- sample,
- debug_data->num_bvh_intersections);
- }
- if(flag & PASS_RAY_BOUNCES) {
- kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces,
- sample,
- debug_data->num_ray_bounces);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 9e7d51f23f5..45b8c6311e1 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -37,16 +37,14 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
ray.D = ls->D;
ray.P = ls->P;
ray.t = 1.0f;
-# ifdef __OBJECT_MOTION__
ray.time = time;
-# endif
ray.dP = differential3_zero();
ray.dD = dI;
shader_setup_from_background(kg, emission_sd, &ray);
path_state_modify_bounce(state, true);
- eval = shader_eval_background(kg, emission_sd, state, 0, SHADER_CONTEXT_EMISSION);
+ eval = shader_eval_background(kg, emission_sd, state, 0);
path_state_modify_bounce(state, false);
}
else
@@ -72,7 +70,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
/* no path flag, we're evaluating this for all closures. that's weak but
* we'd have to do multiple evaluations otherwise */
path_state_modify_bounce(state, true);
- shader_eval_surface(kg, emission_sd, NULL, state, 0.0f, 0, SHADER_CONTEXT_EMISSION);
+ shader_eval_surface(kg, emission_sd, state, 0);
path_state_modify_bounce(state, false);
/* evaluate emissive closure */
@@ -216,7 +214,7 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
{
/* multiple importance sampling, get triangle light pdf,
* and compute weight with respect to BSDF pdf */
- float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t);
+ float pdf = triangle_light_pdf(kg, sd, t);
float mis_weight = power_heuristic(bsdf_pdf, pdf);
return L*mis_weight;
@@ -319,7 +317,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg,
# endif
path_state_modify_bounce(state, true);
- float3 L = shader_eval_background(kg, emission_sd, state, state->flag, SHADER_CONTEXT_EMISSION);
+ float3 L = shader_eval_background(kg, emission_sd, state, state->flag);
path_state_modify_bounce(state, false);
#ifdef __BACKGROUND_MIS__
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index f95f0d98c52..9d55183d94b 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -23,6 +23,10 @@
# include "util/util_vector.h"
#endif
+#ifdef __KERNEL_OPENCL__
+# include "util/util_atomic.h"
+#endif
+
CCL_NAMESPACE_BEGIN
/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
@@ -109,11 +113,22 @@ typedef struct KernelGlobals {
#ifdef __KERNEL_OPENCL__
+# define KERNEL_TEX(type, ttype, name) \
+typedef type name##_t;
+# include "kernel/kernel_textures.h"
+
+typedef struct tex_info_t {
+ uint buffer, padding;
+ uint64_t offset;
+ uint width, height, depth, options;
+} tex_info_t;
+
typedef ccl_addr_space struct KernelGlobals {
ccl_constant KernelData *data;
+ ccl_global char *buffers[8];
# define KERNEL_TEX(type, ttype, name) \
- ccl_global type *name;
+ tex_info_t name;
# include "kernel/kernel_textures.h"
# ifdef __SPLIT_KERNEL__
@@ -122,6 +137,57 @@ typedef ccl_addr_space struct KernelGlobals {
# endif
} KernelGlobals;
+#define KERNEL_BUFFER_PARAMS \
+ ccl_global char *buffer0, \
+ ccl_global char *buffer1, \
+ ccl_global char *buffer2, \
+ ccl_global char *buffer3, \
+ ccl_global char *buffer4, \
+ ccl_global char *buffer5, \
+ ccl_global char *buffer6, \
+ ccl_global char *buffer7
+
+#define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7
+
+ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS)
+{
+#ifdef __SPLIT_KERNEL__
+ if(ccl_local_id(0) + ccl_local_id(1) == 0)
+#endif
+ {
+ kg->buffers[0] = buffer0;
+ kg->buffers[1] = buffer1;
+ kg->buffers[2] = buffer2;
+ kg->buffers[3] = buffer3;
+ kg->buffers[4] = buffer4;
+ kg->buffers[5] = buffer5;
+ kg->buffers[6] = buffer6;
+ kg->buffers[7] = buffer7;
+ }
+
+# ifdef __SPLIT_KERNEL__
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+# endif
+}
+
+ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg)
+{
+# ifdef __SPLIT_KERNEL__
+ if(ccl_local_id(0) + ccl_local_id(1) == 0)
+# endif
+ {
+ ccl_global tex_info_t *info = (ccl_global tex_info_t*)kg->buffers[0];
+
+# define KERNEL_TEX(type, ttype, name) \
+ kg->name = *(info++);
+# include "kernel/kernel_textures.h"
+ }
+
+# ifdef __SPLIT_KERNEL__
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+# endif
+}
+
#endif /* __KERNEL_OPENCL__ */
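A minimal sketch of what the reworked kernel_tex_fetch() in kernel_compat_opencl.h expands to under this layout (__example_tex is a made-up texture name): each tex_info_t picks one of the eight bound buffers plus a byte offset, and that span is indexed as an array of the element type declared through KERNEL_TEX.

    /* Illustrative only: fetch element `index` from a hypothetical float4 texture. */
    ccl_global char *base = kg->buffers[kg->__example_tex.buffer];                     /* which device buffer */
    ccl_global float4 *data = (ccl_global float4*)(base + kg->__example_tex.offset);   /* where inside it */
    float4 value = data[index];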
/* Interpolated lookup table access */
diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h
index 90747e09357..9e3373432ec 100644
--- a/intern/cycles/kernel/kernel_image_opencl.h
+++ b/intern/cycles/kernel/kernel_image_opencl.h
@@ -15,30 +15,42 @@
*/
-/* For OpenCL all images are packed in a single array, and we do manual lookup
- * and interpolation. */
+/* For OpenCL we do manual lookup and interpolation. */
+
+ccl_device_inline ccl_global tex_info_t* kernel_tex_info(KernelGlobals *kg, uint id) {
+ const uint tex_offset = id
+#define KERNEL_TEX(type, ttype, name) + 1
+#include "kernel/kernel_textures.h"
+ ;
+
+ return &((ccl_global tex_info_t*)kg->buffers[0])[tex_offset];
+}
+
+#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->buffer] + info->offset))[(index)]
ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
{
+ const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
const int texture_type = kernel_tex_type(id);
+
/* Float4 */
if(texture_type == IMAGE_DATA_TYPE_FLOAT4) {
- return kernel_tex_fetch(__tex_image_float4_packed, offset);
+ return tex_fetch(float4, info, offset);
}
/* Byte4 */
else if(texture_type == IMAGE_DATA_TYPE_BYTE4) {
- uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset);
+ uchar4 r = tex_fetch(uchar4, info, offset);
float f = 1.0f/255.0f;
return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
}
/* Float */
else if(texture_type == IMAGE_DATA_TYPE_FLOAT) {
- float f = kernel_tex_fetch(__tex_image_float_packed, offset);
+ float f = tex_fetch(float, info, offset);
return make_float4(f, f, f, 1.0f);
}
/* Byte */
else {
- uchar r = kernel_tex_fetch(__tex_image_byte_packed, offset);
+ uchar r = tex_fetch(uchar, info, offset);
float f = r * (1.0f/255.0f);
return make_float4(f, f, f, 1.0f);
}
@@ -64,17 +76,17 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix)
return x - (float)i;
}
-ccl_device_inline uint kernel_decode_image_interpolation(uint4 info)
+ccl_device_inline uint kernel_decode_image_interpolation(uint info)
{
- return (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
+ return (info & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
}
-ccl_device_inline uint kernel_decode_image_extension(uint4 info)
+ccl_device_inline uint kernel_decode_image_extension(uint info)
{
- if(info.w & (1 << 1)) {
+ if(info & (1 << 1)) {
return EXTENSION_REPEAT;
}
- else if(info.w & (1 << 2)) {
+ else if(info & (1 << 2)) {
return EXTENSION_EXTEND;
}
else {
@@ -84,13 +96,16 @@ ccl_device_inline uint kernel_decode_image_extension(uint4 info)
ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
{
- uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);
- uint width = info.x;
- uint height = info.y;
- uint offset = info.z;
+ const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
+
+ uint width = info->width;
+ uint height = info->height;
+ uint offset = 0;
+
/* Decode image options. */
- uint interpolation = kernel_decode_image_interpolation(info);
- uint extension = kernel_decode_image_extension(info);
+ uint interpolation = kernel_decode_image_interpolation(info->options);
+ uint extension = kernel_decode_image_extension(info->options);
+
/* Actual sampling. */
float4 r;
int ix, iy, nix, niy;
@@ -150,14 +165,17 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z)
{
- uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);
- uint width = info.x;
- uint height = info.y;
- uint offset = info.z;
- uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x;
+ const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
+
+ uint width = info->width;
+ uint height = info->height;
+ uint offset = 0;
+ uint depth = info->depth;
+
/* Decode image options. */
- uint interpolation = kernel_decode_image_interpolation(info);
- uint extension = kernel_decode_image_extension(info);
+ uint interpolation = kernel_decode_image_interpolation(info->options);
+ uint extension = kernel_decode_image_extension(info->options);
+
/* Actual sampling. */
float4 r;
int ix, iy, iz, nix, niy, niz;
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index 9baa9d54957..c806deee8e7 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -396,11 +396,13 @@ ccl_device_inline float3 background_light_sample(KernelGlobals *kg,
+ (1.0f - portal_sampling_pdf) * cdf_pdf);
}
return D;
- } else {
+ }
+ else {
/* Sample map, but with nonzero portal_sampling_pdf for MIS. */
randu = (randu - portal_sampling_pdf) / (1.0f - portal_sampling_pdf);
}
- } else {
+ }
+ else {
/* We can't sample a portal.
* Check if we can sample the map instead.
*/
@@ -763,78 +765,280 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
/* Triangle Light */
-ccl_device void object_transform_light_sample(KernelGlobals *kg, LightSample *ls, int object, float time)
+/* returns true if the triangle has motion blur or an instancing transform applied */
+ccl_device_inline bool triangle_world_space_vertices(KernelGlobals *kg, int object, int prim, float time, float3 V[3])
{
+ bool has_motion = false;
+ const int object_flag = kernel_tex_fetch(__object_flag, object);
+
+ if(object_flag & SD_OBJECT_HAS_VERTEX_MOTION && time >= 0.0f) {
+ motion_triangle_vertices(kg, object, prim, time, V);
+ has_motion = true;
+ }
+ else {
+ triangle_vertices(kg, prim, V);
+ }
+
#ifdef __INSTANCING__
- /* instance transform */
- if(!(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED)) {
+ if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
# ifdef __OBJECT_MOTION__
- Transform itfm;
- Transform tfm = object_fetch_transform_motion_test(kg, object, time, &itfm);
+ Transform tfm = object_fetch_transform_motion_test(kg, object, time, NULL);
# else
Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
# endif
-
- ls->P = transform_point(&tfm, ls->P);
- ls->Ng = normalize(transform_direction(&tfm, ls->Ng));
+ V[0] = transform_point(&tfm, V[0]);
+ V[1] = transform_point(&tfm, V[1]);
+ V[2] = transform_point(&tfm, V[2]);
+ has_motion = true;
}
#endif
+ return has_motion;
}
-ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object,
- float randu, float randv, float time, LightSample *ls)
+ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
{
- float u, v;
+ float pdf = kernel_data.integrator.pdf_triangles;
+ float cos_pi = fabsf(dot(Ng, I));
- /* compute random point in triangle */
- randu = sqrtf(randu);
+ if(cos_pi == 0.0f)
+ return 0.0f;
+
+ return t*t*pdf/cos_pi;
+}
- u = 1.0f - randu;
- v = randv*randu;
+ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *sd, float t)
+{
+ /* A naive heuristic to decide between costly solid angle sampling
+ * and simple area sampling, comparing the distance to the triangle plane
+ * to the length of the edges of the triangle. */
+
+ float3 V[3];
+ bool has_motion = triangle_world_space_vertices(kg, sd->object, sd->prim, sd->time, V);
+
+ const float3 e0 = V[1] - V[0];
+ const float3 e1 = V[2] - V[0];
+ const float3 e2 = V[2] - V[1];
+ const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2)));
+ const float3 N = cross(e0, e1);
+ const float distance_to_plane = fabsf(dot(N, sd->I * t))/dot(N, N);
+
+ if(longest_edge_squared > distance_to_plane*distance_to_plane) {
+ /* sd contains the point on the light source
+ * calculate Px, the point that we're shading */
+ const float3 Px = sd->P + sd->I * t;
+ const float3 v0_p = V[0] - Px;
+ const float3 v1_p = V[1] - Px;
+ const float3 v2_p = V[2] - Px;
+
+ const float3 u01 = safe_normalize(cross(v0_p, v1_p));
+ const float3 u02 = safe_normalize(cross(v0_p, v2_p));
+ const float3 u12 = safe_normalize(cross(v1_p, v2_p));
+
+ const float alpha = fast_acosf(dot(u02, u01));
+ const float beta = fast_acosf(-dot(u01, u12));
+ const float gamma = fast_acosf(dot(u02, u12));
+ const float solid_angle = alpha + beta + gamma - M_PI_F;
+
+ /* pdf_triangles is calculated over triangle area, but we're not sampling over its area */
+ if(UNLIKELY(solid_angle == 0.0f)) {
+ return 0.0f;
+ }
+ else {
+ float area = 1.0f;
+ if(has_motion) {
+ /* get the center frame vertices, this is what the PDF was calculated from */
+ triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V);
+ area = triangle_area(V[0], V[1], V[2]);
+ }
+ else {
+ area = 0.5f * len(N);
+ }
+ const float pdf = area * kernel_data.integrator.pdf_triangles;
+ return pdf / solid_angle;
+ }
+ }
+ else {
+ float pdf = triangle_light_pdf_area(kg, sd->Ng, sd->I, t);
+ if(has_motion) {
+ const float area = 0.5f * len(N);
+ if(UNLIKELY(area == 0.0f)) {
+ return 0.0f;
+ }
+ /* scale the PDF.
+ * area = the area the sample was taken from
+ * area_pre = the area from which pdf_triangles was calculated */
+ triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V);
+ const float area_pre = triangle_area(V[0], V[1], V[2]);
+ pdf = pdf * area_pre / area;
+ }
+ return pdf;
+ }
+}
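For context (illustrative, not part of the patch): kernel_data.integrator.pdf_triangles is, up to the lamp/mesh split, roughly the reciprocal of the total light area, so area * pdf_triangles is the probability of having picked this triangle, and dividing by the spherical-triangle solid angle (Girard: alpha + beta + gamma - pi) turns that discrete probability into a density per steradian, which is what the MIS weight against the BSDF pdf expects.

    /* Illustrative only. */
    float pick_prob   = area * kernel_data.integrator.pdf_triangles;   /* ~ area / total light area */
    float solid_angle = alpha + beta + gamma - M_PI_F;                 /* Girard's theorem */
    float pdf         = pick_prob / solid_angle;                       /* per-steradian density, as returned above */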
- /* triangle, so get position, normal, shader */
- triangle_point_normal(kg, object, prim, u, v, &ls->P, &ls->Ng, &ls->shader);
+ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg, int prim, int object,
+ float randu, float randv, float time, LightSample *ls, const float3 P)
+{
+ /* A naive heuristic to decide between costly solid angle sampling
+ * and simple area sampling, comparing the distance to the triangle plane
+ * to the length of the edges of the triangle. */
+
+ float3 V[3];
+ bool has_motion = triangle_world_space_vertices(kg, object, prim, time, V);
+
+ const float3 e0 = V[1] - V[0];
+ const float3 e1 = V[2] - V[0];
+ const float3 e2 = V[2] - V[1];
+ const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2)));
+ const float3 N0 = cross(e0, e1);
+ float Nl = 0.0f;
+ ls->Ng = safe_normalize_len(N0, &Nl);
+ float area = 0.5f * Nl;
+
+ /* flip normal if necessary */
+ const int object_flag = kernel_tex_fetch(__object_flag, object);
+ if(object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+ ls->Ng = -ls->Ng;
+ }
+ ls->eval_fac = 1.0f;
+ ls->shader = kernel_tex_fetch(__tri_shader, prim);
ls->object = object;
ls->prim = prim;
ls->lamp = LAMP_NONE;
ls->shader |= SHADER_USE_MIS;
- ls->t = 0.0f;
- ls->u = u;
- ls->v = v;
ls->type = LIGHT_TRIANGLE;
- ls->eval_fac = 1.0f;
- object_transform_light_sample(kg, ls, object, time);
-}
+ float distance_to_plane = fabsf(dot(N0, V[0] - P)/dot(N0, N0));
+
+ if(longest_edge_squared > distance_to_plane*distance_to_plane) {
+ /* see James Arvo, "Stratified Sampling of Spherical Triangles"
+ * http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */
+
+ /* project the triangle to the unit sphere
+ * and calculate its edges and angles */
+ const float3 v0_p = V[0] - P;
+ const float3 v1_p = V[1] - P;
+ const float3 v2_p = V[2] - P;
+
+ const float3 u01 = safe_normalize(cross(v0_p, v1_p));
+ const float3 u02 = safe_normalize(cross(v0_p, v2_p));
+ const float3 u12 = safe_normalize(cross(v1_p, v2_p));
+
+ const float3 A = safe_normalize(v0_p);
+ const float3 B = safe_normalize(v1_p);
+ const float3 C = safe_normalize(v2_p);
+
+ const float cos_alpha = dot(u02, u01);
+ const float cos_beta = -dot(u01, u12);
+ const float cos_gamma = dot(u02, u12);
+
+ /* calculate dihedral angles */
+ const float alpha = fast_acosf(cos_alpha);
+ const float beta = fast_acosf(cos_beta);
+ const float gamma = fast_acosf(cos_gamma);
+ /* the area of the unit spherical triangle = solid angle */
+ const float solid_angle = alpha + beta + gamma - M_PI_F;
+
+ /* precompute a few things
+ * these could be re-used to take several samples
+ * as they are independent of randu/randv */
+ const float cos_c = dot(A, B);
+ const float sin_alpha = fast_sinf(alpha);
+ const float product = sin_alpha * cos_c;
+
+ /* Select a random sub-area of the spherical triangle
+ * and calculate the third vertex C_ of that new triangle */
+ const float phi = randu * solid_angle - alpha;
+ float s, t;
+ fast_sincosf(phi, &s, &t);
+ const float u = t - cos_alpha;
+ const float v = s + product;
+
+ const float3 U = safe_normalize(C - dot(C, A) * A);
+
+ float q = 1.0f;
+ const float det = ((v * s + u * t) * sin_alpha);
+ if(det != 0.0f) {
+ q = ((v * t - u * s) * cos_alpha - v) / det;
+ }
+ const float temp = max(1.0f - q*q, 0.0f);
-ccl_device float triangle_light_pdf(KernelGlobals *kg,
- const float3 Ng, const float3 I, float t)
-{
- float pdf = kernel_data.integrator.pdf_triangles;
- float cos_pi = fabsf(dot(Ng, I));
+ const float3 C_ = safe_normalize(q * A + sqrtf(temp) * U);
- if(cos_pi == 0.0f)
- return 0.0f;
-
- return t*t*pdf/cos_pi;
+ /* Finally, select a random point along the edge of the new triangle.
+ * That point on the spherical triangle is the sampled ray direction */
+ const float z = 1.0f - randv * (1.0f - dot(C_, B));
+ ls->D = z * B + safe_sqrtf(1.0f - z*z) * safe_normalize(C_ - dot(C_, B) * B);
+
+ /* calculate intersection with the planar triangle */
+ if(!ray_triangle_intersect(P, ls->D, FLT_MAX,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+ (ssef*)V,
+#else
+ V[0], V[1], V[2],
+#endif
+ &ls->u, &ls->v, &ls->t)) {
+ ls->pdf = 0.0f;
+ return;
+ }
+
+ ls->P = P + ls->D * ls->t;
+
+ /* pdf_triangles is calculated over triangle area, but we're sampling over solid angle */
+ if(UNLIKELY(solid_angle == 0.0f)) {
+ ls->pdf = 0.0f;
+ return;
+ }
+ else {
+ if(has_motion) {
+ /* get the center frame vertices, this is what the PDF was calculated from */
+ triangle_world_space_vertices(kg, object, prim, -1.0f, V);
+ area = triangle_area(V[0], V[1], V[2]);
+ }
+ const float pdf = area * kernel_data.integrator.pdf_triangles;
+ ls->pdf = pdf / solid_angle;
+ }
+ }
+ else {
+ /* compute random point in triangle */
+ randu = sqrtf(randu);
+
+ const float u = 1.0f - randu;
+ const float v = randv*randu;
+ const float t = 1.0f - u - v;
+ ls->P = u * V[0] + v * V[1] + t * V[2];
+ /* compute incoming direction, distance and pdf */
+ ls->D = normalize_len(ls->P - P, &ls->t);
+ ls->pdf = triangle_light_pdf_area(kg, ls->Ng, -ls->D, ls->t);
+ if(has_motion && area != 0.0f) {
+ /* scale the PDF.
+ * area = the area the sample was taken from
+ * area_pre = the area from which pdf_triangles was calculated */
+ triangle_world_space_vertices(kg, object, prim, -1.0f, V);
+ const float area_pre = triangle_area(V[0], V[1], V[2]);
+ ls->pdf = ls->pdf * area_pre / area;
+ }
+ ls->u = u;
+ ls->v = v;
+ }
}
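A tiny sketch of the heuristic shared by triangle_light_pdf() and triangle_light_sample() (illustrative only): solid angle sampling is only worth its cost when the triangle subtends a large angle, approximated by comparing the squared longest edge against the squared distance to the triangle plane.

    /* Illustrative only: a ~1 unit triangle seen from 10 units away. */
    float longest_edge_squared = 1.0f;
    float distance_to_plane    = 10.0f;
    bool use_solid_angle = longest_edge_squared > distance_to_plane*distance_to_plane;   /* false: cheap area sampling */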
/* Light Distribution */
-ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
+ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
{
- /* this is basically std::upper_bound as used by pbrt, to find a point light or
+ /* This is basically std::upper_bound as used by pbrt, to find a point light or
* triangle to emit from, proportional to area. a good improvement would be to
* also sample proportional to power, though it's not so well defined with
- * OSL shaders. */
+ * arbitrary shaders. */
int first = 0;
int len = kernel_data.integrator.num_distribution + 1;
+ float r = *randu;
while(len > 0) {
int half_len = len >> 1;
int middle = first + half_len;
- if(randt < kernel_tex_fetch(__light_distribution, middle).x) {
+ if(r < kernel_tex_fetch(__light_distribution, middle).x) {
len = half_len;
}
else {
@@ -843,9 +1047,17 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
}
}
- /* clamping should not be needed but float rounding errors seem to
- * make this fail on rare occasions */
- return clamp(first-1, 0, kernel_data.integrator.num_distribution-1);
+ /* Clamping should not be needed but float rounding errors seem to
+ * make this fail on rare occasions. */
+ int index = clamp(first-1, 0, kernel_data.integrator.num_distribution-1);
+
+ /* Rescale to reuse random number. This helps the 2D samples within
+ * each area light be stratified as well. */
+ float distr_min = kernel_tex_fetch(__light_distribution, index).x;
+ float distr_max = kernel_tex_fetch(__light_distribution, index+1).x;
+ *randu = (r - distr_min)/(distr_max - distr_min);
+
+ return index;
}
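A small illustration of the rescaling above (not part of the patch): after the binary search has located the CDF interval containing r, the same random number is remapped to [0, 1) so it can be reused, still stratified, for the light's own 2D sample.

    /* Hypothetical CDF over three emitters: 0.0, 0.2, 0.7, 1.0. */
    float r = 0.45f;                                            /* falls in [0.2, 0.7) -> index 1 */
    float distr_min = 0.2f, distr_max = 0.7f;
    float randu = (r - distr_min)/(distr_max - distr_min);      /* 0.5: uniform on [0, 1) again */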
/* Generic Light */
@@ -857,7 +1069,6 @@ ccl_device bool light_select_reached_max_bounces(KernelGlobals *kg, int index, i
}
ccl_device_noinline bool light_sample(KernelGlobals *kg,
- float randt,
float randu,
float randv,
float time,
@@ -866,7 +1077,7 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg,
LightSample *ls)
{
/* sample index */
- int index = light_distribution_sample(kg, randt);
+ int index = light_distribution_sample(kg, &randu);
/* fetch light data */
float4 l = kernel_tex_fetch(__light_distribution, index);
@@ -876,10 +1087,7 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg,
int object = __float_as_int(l.w);
int shader_flag = __float_as_int(l.z);
- triangle_light_sample(kg, prim, object, randu, randv, time, ls);
- /* compute incoming direction, distance and pdf */
- ls->D = normalize_len(ls->P - P, &ls->t);
- ls->pdf = triangle_light_pdf(kg, ls->Ng, -ls->D, ls->t);
+ triangle_light_sample(kg, prim, object, randu, randv, time, ls, P);
ls->shader |= shader_flag;
return (ls->pdf > 0.0f);
}
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 9cd7ffb181d..fff7f4cfdb7 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -194,8 +194,38 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
#endif /* __DENOISING_FEATURES__ */
}
+#ifdef __KERNEL_DEBUG__
+ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
+ ccl_global float *buffer,
+ PathRadiance *L,
+ int sample)
+{
+ int flag = kernel_data.film.pass_flag;
+ if(flag & PASS_BVH_TRAVERSED_NODES) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes,
+ sample,
+ L->debug_data.num_bvh_traversed_nodes);
+ }
+ if(flag & PASS_BVH_TRAVERSED_INSTANCES) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances,
+ sample,
+ L->debug_data.num_bvh_traversed_instances);
+ }
+ if(flag & PASS_BVH_INTERSECTIONS) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections,
+ sample,
+ L->debug_data.num_bvh_intersections);
+ }
+ if(flag & PASS_RAY_BOUNCES) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces,
+ sample,
+ L->debug_data.num_ray_bounces);
+ }
+}
+#endif /* __KERNEL_DEBUG__ */
+
ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
- ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
+ ShaderData *sd, ccl_addr_space PathState *state, float3 throughput)
{
#ifdef __PASSES__
int path_flag = state->flag;
@@ -213,6 +243,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
kernel_data.film.pass_alpha_threshold == 0.0f ||
average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold)
{
+ int sample = state->sample;
if(sample == 0) {
if(flag & PASS_DEPTH) {
@@ -230,7 +261,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
}
if(flag & PASS_NORMAL) {
- float3 normal = sd->N;
+ float3 normal = shader_bsdf_average_normal(kg, sd);
kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal);
}
if(flag & PASS_UV) {
@@ -334,19 +365,11 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f
}
ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer,
- int sample, PathRadiance *L, float alpha, bool is_shadow_catcher)
+ int sample, PathRadiance *L)
{
if(L) {
- float3 L_sum;
-#ifdef __SHADOW_TRICKS__
- if(is_shadow_catcher) {
- L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha);
- }
- else
-#endif /* __SHADOW_TRICKS__ */
- {
- L_sum = path_radiance_clamp_and_sum(kg, L);
- }
+ float alpha;
+ float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha);
kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
@@ -361,16 +384,7 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *
# endif
if(kernel_data.film.pass_denoising_clean) {
float3 noisy, clean;
-#ifdef __SHADOW_TRICKS__
- if(is_shadow_catcher) {
- noisy = L_sum;
- clean = make_float3(0.0f, 0.0f, 0.0f);
- }
- else
-#endif /* __SHADOW_TRICKS__ */
- {
- path_radiance_split_denoising(kg, L, &noisy, &clean);
- }
+ path_radiance_split_denoising(kg, L, &noisy, &clean);
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
sample, noisy);
kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
@@ -389,6 +403,11 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *
sample, L->denoising_depth);
}
#endif /* __DENOISING_FEATURES__ */
+
+
+#ifdef __KERNEL_DEBUG__
+ kernel_write_debug_passes(kg, buffer, L, sample);
+#endif
}
else {
kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f));
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index fc093ad8319..793fede0deb 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -48,25 +48,308 @@
#include "kernel/kernel_path_volume.h"
#include "kernel/kernel_path_subsurface.h"
+CCL_NAMESPACE_BEGIN
+
+ccl_device_forceinline bool kernel_path_scene_intersect(
+ KernelGlobals *kg,
+ ccl_addr_space PathState *state,
+ Ray *ray,
+ Intersection *isect,
+ PathRadiance *L)
+{
+ uint visibility = path_state_ray_visibility(kg, state);
+
+#ifdef __HAIR__
+ float difl = 0.0f, extmax = 0.0f;
+ uint lcg_state = 0;
+
+ if(kernel_data.bvh.have_curves) {
+ if((kernel_data.cam.resolution == 1) && (state->flag & PATH_RAY_CAMERA)) {
+ float3 pixdiff = ray->dD.dx + ray->dD.dy;
+ /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
+ difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+ }
+
+ extmax = kernel_data.curve.maximum_width;
+ lcg_state = lcg_state_init_addrspace(state, 0x51633e2d);
+ }
+
+ if(path_state_ao_bounce(kg, state)) {
+ visibility = PATH_RAY_SHADOW;
+ ray->t = kernel_data.background.ao_distance;
+ }
+
+ bool hit = scene_intersect(kg, *ray, visibility, isect, &lcg_state, difl, extmax);
+#else
+ bool hit = scene_intersect(kg, *ray, visibility, isect, NULL, 0.0f, 0.0f);
+#endif /* __HAIR__ */
+
#ifdef __KERNEL_DEBUG__
-# include "kernel/kernel_debug.h"
-#endif
+ if(state->flag & PATH_RAY_CAMERA) {
+ L->debug_data.num_bvh_traversed_nodes += isect->num_traversed_nodes;
+ L->debug_data.num_bvh_traversed_instances += isect->num_traversed_instances;
+ L->debug_data.num_bvh_intersections += isect->num_intersections;
+ }
+ L->debug_data.num_ray_bounces++;
+#endif /* __KERNEL_DEBUG__ */
-CCL_NAMESPACE_BEGIN
+ return hit;
+}
+
+ccl_device_forceinline void kernel_path_lamp_emission(
+ KernelGlobals *kg,
+ ccl_addr_space PathState *state,
+ Ray *ray,
+ float3 throughput,
+ ccl_addr_space Intersection *isect,
+ ShaderData *emission_sd,
+ PathRadiance *L)
+{
+#ifdef __LAMP_MIS__
+ if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
+ /* ray starting from previous non-transparent bounce */
+ Ray light_ray;
+
+ light_ray.P = ray->P - state->ray_t*ray->D;
+ state->ray_t += isect->t;
+ light_ray.D = ray->D;
+ light_ray.t = state->ray_t;
+ light_ray.time = ray->time;
+ light_ray.dD = ray->dD;
+ light_ray.dP = ray->dP;
+
+ /* intersect with lamp */
+ float3 emission;
+
+ if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission))
+ path_radiance_accum_emission(L, state, throughput, emission);
+ }
+#endif /* __LAMP_MIS__ */
+}
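The ray_t bookkeeping above accumulates the distance travelled since the last non-transparent bounce, so the lamp-MIS ray is cast from that original bounce point over the full accumulated length. A scalar sketch (illustrative only):

    /* Illustrative only: two transparent hops of 1.0 and 2.5 units. */
    float ray_t = 1.0f;        /* distance accumulated before this segment; light_ray.P starts 1.0 unit behind the current origin */
    ray_t += 2.5f;             /* state->ray_t after adding isect->t */
    /* light_ray.t now covers the full 3.5 units for the lamp intersection */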
+
+ccl_device_forceinline void kernel_path_background(
+ KernelGlobals *kg,
+ ccl_addr_space PathState *state,
+ ccl_addr_space Ray *ray,
+ float3 throughput,
+ ShaderData *emission_sd,
+ PathRadiance *L)
+{
+ /* eval background shader if nothing hit */
+ if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
+ L->transparent += average(throughput);
+
+#ifdef __PASSES__
+ if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif /* __PASSES__ */
+ return;
+ }
+
+#ifdef __BACKGROUND__
+ /* sample background shader */
+ float3 L_background = indirect_background(kg, emission_sd, state, ray);
+ path_radiance_accum_background(L, state, throughput, L_background);
+#endif /* __BACKGROUND__ */
+}
+
+#ifndef __SPLIT_KERNEL__
+
+#ifdef __VOLUME__
+ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(
+ KernelGlobals *kg,
+ ShaderData *sd,
+ PathState *state,
+ Ray *ray,
+ float3 *throughput,
+ ccl_addr_space Intersection *isect,
+ bool hit,
+ ShaderData *emission_sd,
+ PathRadiance *L)
+{
+ /* Sanitize volume stack. */
+ if(!hit) {
+ kernel_volume_clean_stack(kg, state->volume_stack);
+ }
+ /* volume attenuation, emission, scatter */
+ if(state->volume_stack[0].shader != SHADER_NONE) {
+ Ray volume_ray = *ray;
+ volume_ray.t = (hit)? isect->t: FLT_MAX;
+
+ bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+
+# ifdef __VOLUME_DECOUPLED__
+ int sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
+ bool direct = (state->flag & PATH_RAY_CAMERA) != 0;
+ bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, direct, sampling_method);
+
+ if(decoupled) {
+ /* cache steps along volume for repeated sampling */
+ VolumeSegment volume_segment;
+
+ shader_setup_from_volume(kg, sd, &volume_ray);
+ kernel_volume_decoupled_record(kg, state,
+ &volume_ray, sd, &volume_segment, heterogeneous);
+
+ volume_segment.sampling_method = sampling_method;
+
+ /* emission */
+ if(volume_segment.closure_flag & SD_EMISSION)
+ path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission);
+
+ /* scattering */
+ VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+ if(volume_segment.closure_flag & SD_SCATTER) {
+ int all = kernel_data.integrator.sample_all_lights_indirect;
+
+ /* direct light sampling */
+ kernel_branched_path_volume_connect_light(kg, sd,
+ emission_sd, *throughput, state, L, all,
+ &volume_ray, &volume_segment);
+
+ /* indirect sample. if we use distance sampling and take just
+ * one sample for direct and indirect light, we could share
+ * this computation, but makes code a bit complex */
+ float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
+ float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
+
+ result = kernel_volume_decoupled_scatter(kg,
+ state, &volume_ray, sd, throughput,
+ rphase, rscatter, &volume_segment, NULL, true);
+ }
+
+ /* free cached steps */
+ kernel_volume_decoupled_free(kg, &volume_segment);
+
+ if(result == VOLUME_PATH_SCATTERED) {
+ if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
+ return VOLUME_PATH_SCATTERED;
+ else
+ return VOLUME_PATH_MISSED;
+ }
+ else {
+ *throughput *= volume_segment.accum_transmittance;
+ }
+ }
+ else
+# endif /* __VOLUME_DECOUPLED__ */
+ {
+ /* integrate along volume segment with distance sampling */
+ VolumeIntegrateResult result = kernel_volume_integrate(
+ kg, state, sd, &volume_ray, L, throughput, heterogeneous);
+
+# ifdef __VOLUME_SCATTER__
+ if(result == VOLUME_PATH_SCATTERED) {
+ /* direct lighting */
+ kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
+
+ /* indirect light bounce */
+ if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
+ return VOLUME_PATH_SCATTERED;
+ else
+ return VOLUME_PATH_MISSED;
+ }
+# endif /* __VOLUME_SCATTER__ */
+ }
+ }
+
+ return VOLUME_PATH_ATTENUATED;
+}
+#endif /* __VOLUME__ */
+
+#endif /* __SPLIT_KERNEL__ */
+
+ccl_device_forceinline bool kernel_path_shader_apply(
+ KernelGlobals *kg,
+ ShaderData *sd,
+ ccl_addr_space PathState *state,
+ ccl_addr_space Ray *ray,
+ float3 throughput,
+ ShaderData *emission_sd,
+ PathRadiance *L,
+ ccl_global float *buffer)
+{
+#ifdef __SHADOW_TRICKS__
+ if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
+ if(state->flag & PATH_RAY_CAMERA) {
+ state->flag |= (PATH_RAY_SHADOW_CATCHER |
+ PATH_RAY_STORE_SHADOW_INFO);
+
+ float3 bg = make_float3(0.0f, 0.0f, 0.0f);
+ if(!kernel_data.background.transparent) {
+ bg = indirect_background(kg, emission_sd, state, ray);
+ }
+ path_radiance_accum_shadowcatcher(L, throughput, bg);
+ }
+ }
+ else if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+ /* Only update transparency after shadow catcher bounce. */
+ L->shadow_transparency *=
+ average(shader_bsdf_transparency(kg, sd));
+ }
+#endif /* __SHADOW_TRICKS__ */
+
+ /* holdout */
+#ifdef __HOLDOUT__
+ if(((sd->flag & SD_HOLDOUT) ||
+ (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
+ (state->flag & PATH_RAY_CAMERA))
+ {
+ if(kernel_data.background.transparent) {
+ float3 holdout_weight;
+ if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
+ holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+ }
+ else {
+ holdout_weight = shader_holdout_eval(kg, sd);
+ }
+ /* any throughput is ok, should all be identical here */
+ L->transparent += average(holdout_weight*throughput);
+ }
+
+ if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
+ return false;
+ }
+ }
+#endif /* __HOLDOUT__ */
+
+ /* holdout mask objects do not write data passes */
+ kernel_write_data_passes(kg, buffer, L, sd, state, throughput);
+
+ /* blurring of bsdf after bounces, for rays that have a small likelihood
+ * of following this particular path (diffuse, rough glossy) */
+ if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+ float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
+
+ if(blur_pdf < 1.0f) {
+ float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
+ shader_bsdf_blur(kg, sd, blur_roughness);
+ }
+ }
+
+#ifdef __EMISSION__
+ /* emission */
+ if(sd->flag & SD_EMISSION) {
+ float3 emission = indirect_primitive_emission(kg, sd, sd->ray_length, state->flag, state->ray_pdf);
+ path_radiance_accum_emission(L, state, throughput, emission);
+ }
+#endif /* __EMISSION__ */
+
+ return true;
+}
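A quick numeric sketch of the glossy filter above (illustrative only): with kernel_data.integrator.filter_glossy = 2.0 and a path whose min_ray_pdf is 0.1, blur_pdf = 0.2 < 1, so the BSDF is blurred with roughness sqrtf(1.0f - 0.2f)*0.5f, about 0.45, before bouncing, trading a little bias for far fewer fireflies on unlikely paths.

    /* Illustrative only. */
    float blur_pdf = 2.0f * 0.1f;                            /* filter_glossy * state->min_ray_pdf */
    if(blur_pdf < 1.0f) {
        float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;  /* ~0.447 */
        /* shader_bsdf_blur(kg, sd, blur_roughness); */
    }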
ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
PathRadiance *L,
ccl_addr_space PathState *state,
- RNG *rng,
float3 throughput,
float3 ao_alpha)
{
/* todo: solve correlation */
float bsdf_u, bsdf_v;
- path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+ path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
float ao_factor = kernel_data.background.ao_factor;
float3 ao_N;
@@ -83,13 +366,11 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
light_ray.P = ray_offset(sd->P, sd->Ng);
light_ray.D = ao_D;
light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
light_ray.time = sd->time;
-#endif /* __OBJECT_MOTION__ */
light_ray.dP = sd->dP;
light_ray.dD = differential3_zero();
- if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
+ if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
}
else {
@@ -100,265 +381,85 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
#ifndef __SPLIT_KERNEL__
+#if defined(__BRANCHED_PATH__) || defined(__BAKING__)
+
ccl_device void kernel_path_indirect(KernelGlobals *kg,
ShaderData *sd,
ShaderData *emission_sd,
- RNG *rng,
Ray *ray,
float3 throughput,
- int num_samples,
PathState *state,
PathRadiance *L)
{
/* path iteration */
for(;;) {
- /* intersect scene */
+ /* Find intersection with objects in scene. */
Intersection isect;
- uint visibility = path_state_ray_visibility(kg, state);
- if(state->bounce > kernel_data.integrator.ao_bounces) {
- visibility = PATH_RAY_SHADOW;
- ray->t = kernel_data.background.ao_distance;
- }
- bool hit = scene_intersect(kg,
- *ray,
- visibility,
- &isect,
- NULL,
- 0.0f, 0.0f);
+ bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L);
-#ifdef __LAMP_MIS__
- if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
- /* ray starting from previous non-transparent bounce */
- Ray light_ray;
-
- light_ray.P = ray->P - state->ray_t*ray->D;
- state->ray_t += isect.t;
- light_ray.D = ray->D;
- light_ray.t = state->ray_t;
- light_ray.time = ray->time;
- light_ray.dD = ray->dD;
- light_ray.dP = ray->dP;
-
- /* intersect with lamp */
- float3 emission;
- if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission)) {
- path_radiance_accum_emission(L,
- throughput,
- emission,
- state->bounce);
- }
- }
-#endif /* __LAMP_MIS__ */
+ /* Find intersection with lamps and compute emission for MIS. */
+ kernel_path_lamp_emission(kg, state, ray, throughput, &isect, emission_sd, L);
#ifdef __VOLUME__
- /* Sanitize volume stack. */
- if(!hit) {
- kernel_volume_clean_stack(kg, state->volume_stack);
- }
- /* volume attenuation, emission, scatter */
- if(state->volume_stack[0].shader != SHADER_NONE) {
- Ray volume_ray = *ray;
- volume_ray.t = (hit)? isect.t: FLT_MAX;
-
- bool heterogeneous =
- volume_stack_is_heterogeneous(kg,
- state->volume_stack);
-
-# ifdef __VOLUME_DECOUPLED__
- int sampling_method =
- volume_stack_sampling_method(kg,
- state->volume_stack);
- bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method);
-
- if(decoupled) {
- /* cache steps along volume for repeated sampling */
- VolumeSegment volume_segment;
-
- shader_setup_from_volume(kg,
- sd,
- &volume_ray);
- kernel_volume_decoupled_record(kg,
- state,
- &volume_ray,
- sd,
- &volume_segment,
- heterogeneous);
-
- volume_segment.sampling_method = sampling_method;
-
- /* emission */
- if(volume_segment.closure_flag & SD_EMISSION) {
- path_radiance_accum_emission(L,
- throughput,
- volume_segment.accum_emission,
- state->bounce);
- }
-
- /* scattering */
- VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
-
- if(volume_segment.closure_flag & SD_SCATTER) {
- int all = kernel_data.integrator.sample_all_lights_indirect;
-
- /* direct light sampling */
- kernel_branched_path_volume_connect_light(kg,
- rng,
- sd,
- emission_sd,
- throughput,
- state,
- L,
- all,
- &volume_ray,
- &volume_segment);
-
- /* indirect sample. if we use distance sampling and take just
- * one sample for direct and indirect light, we could share
- * this computation, but makes code a bit complex */
- float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
- float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
-
- result = kernel_volume_decoupled_scatter(kg,
- state,
- &volume_ray,
- sd,
- &throughput,
- rphase,
- rscatter,
- &volume_segment,
- NULL,
- true);
- }
-
- /* free cached steps */
- kernel_volume_decoupled_free(kg, &volume_segment);
-
- if(result == VOLUME_PATH_SCATTERED) {
- if(kernel_path_volume_bounce(kg,
- rng,
- sd,
- &throughput,
- state,
- L,
- ray))
- {
- continue;
- }
- else {
- break;
- }
- }
- else {
- throughput *= volume_segment.accum_transmittance;
- }
- }
- else
-# endif /* __VOLUME_DECOUPLED__ */
- {
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, &throughput, rng, heterogeneous);
-
-# ifdef __VOLUME_SCATTER__
- if(result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg,
- rng,
- sd,
- emission_sd,
- throughput,
- state,
- L);
-
- /* indirect light bounce */
- if(kernel_path_volume_bounce(kg,
- rng,
- sd,
- &throughput,
- state,
- L,
- ray))
- {
- continue;
- }
- else {
- break;
- }
- }
-# endif /* __VOLUME_SCATTER__ */
- }
+ /* Volume integration. */
+ VolumeIntegrateResult result = kernel_path_volume(kg,
+ sd,
+ state,
+ ray,
+ &throughput,
+ &isect,
+ hit,
+ emission_sd,
+ L);
+
+ if(result == VOLUME_PATH_SCATTERED) {
+ continue;
+ }
+ else if(result == VOLUME_PATH_MISSED) {
+ break;
}
-#endif /* __VOLUME__ */
+#endif /* __VOLUME__*/
+ /* Shade background. */
if(!hit) {
-#ifdef __BACKGROUND__
- /* sample background shader */
- float3 L_background = indirect_background(kg, emission_sd, state, ray);
- path_radiance_accum_background(L,
- state,
- throughput,
- L_background);
-#endif /* __BACKGROUND__ */
-
+ kernel_path_background(kg, state, ray, throughput, emission_sd, L);
break;
}
- else if(state->bounce > kernel_data.integrator.ao_bounces) {
+ else if(path_state_ao_bounce(kg, state)) {
break;
}
- /* setup shading */
+ /* Setup and evaluate shader. */
shader_setup_from_ray(kg,
sd,
&isect,
ray);
- float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
- shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT);
-#ifdef __BRANCHED_PATH__
- shader_merge_closures(sd);
-#endif /* __BRANCHED_PATH__ */
-
-#ifdef __SHADOW_TRICKS__
- if(!(sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
- state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
- }
-#endif /* __SHADOW_TRICKS__ */
-
- /* blurring of bsdf after bounces, for rays that have a small likelihood
- * of following this particular path (diffuse, rough glossy) */
- if(kernel_data.integrator.filter_glossy != FLT_MAX) {
- float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
-
- if(blur_pdf < 1.0f) {
- float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
- shader_bsdf_blur(kg, sd, blur_roughness);
- }
- }
-
-#ifdef __EMISSION__
- /* emission */
- if(sd->flag & SD_EMISSION) {
- float3 emission = indirect_primitive_emission(kg,
- sd,
- isect.t,
- state->flag,
- state->ray_pdf);
- path_radiance_accum_emission(L, throughput, emission, state->bounce);
+ shader_eval_surface(kg, sd, state, state->flag);
+ shader_prepare_closures(sd, state);
+
+ /* Apply shadow catcher, holdout, emission. */
+ if(!kernel_path_shader_apply(kg,
+ sd,
+ state,
+ ray,
+ throughput,
+ emission_sd,
+ L,
+ NULL))
+ {
+ break;
}
-#endif /* __EMISSION__ */
/* path termination. this is a strange place to put the termination, it's
* mainly due to the mixed in MIS that we use. gives too many unneeded
* shader evaluations, only need emission if we are going to terminate */
- float probability =
- path_state_terminate_probability(kg,
- state,
- throughput*num_samples);
+ float probability = path_state_continuation_probability(kg, state, throughput);
if(probability == 0.0f) {
break;
}
else if(probability != 1.0f) {
- float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
+ float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
if(terminate >= probability)
break;
@@ -371,7 +472,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
- kernel_path_ao(kg, sd, emission_sd, L, state, rng, throughput, make_float3(0.0f, 0.0f, 0.0f));
+ kernel_path_ao(kg, sd, emission_sd, L, state, throughput, make_float3(0.0f, 0.0f, 0.0f));
}
#endif /* __AO__ */
@@ -379,22 +480,18 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
/* bssrdf scatter to a different location on the same object, replacing
* the closures with a diffuse BSDF */
if(sd->flag & SD_BSSRDF) {
- float bssrdf_probability;
- ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+ float bssrdf_u, bssrdf_v;
+ path_state_rng_2D(kg,
+ state,
+ PRNG_BSDF_U,
+ &bssrdf_u, &bssrdf_v);
- /* modify throughput for picking bssrdf or bsdf */
- throughput *= bssrdf_probability;
+ const ShaderClosure *sc = shader_bssrdf_pick(sd, &throughput, &bssrdf_u);
/* do bssrdf scatter step if we picked a bssrdf closure */
if(sc) {
- uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb);
-
- float bssrdf_u, bssrdf_v;
- path_state_rng_2D(kg,
- rng,
- state,
- PRNG_BSDF_U,
- &bssrdf_u, &bssrdf_v);
+ uint lcg_state = lcg_state_init(state, 0x68bc21eb);
+
subsurface_scatter_step(kg,
sd,
state,
@@ -412,7 +509,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
int all = (kernel_data.integrator.sample_all_lights_indirect) ||
(state->flag & PATH_RAY_SHADOW_CATCHER);
kernel_branched_path_surface_connect_light(kg,
- rng,
sd,
emission_sd,
state,
@@ -423,38 +519,24 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
}
#endif /* defined(__EMISSION__) */
- if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray))
+ if(!kernel_path_surface_bounce(kg, sd, &throughput, state, &L->state, ray))
break;
}
}
+#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
-ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
- RNG *rng,
- int sample,
- Ray ray,
- ccl_global float *buffer,
- PathRadiance *L,
- bool *is_shadow_catcher)
+ccl_device_forceinline void kernel_path_integrate(
+ KernelGlobals *kg,
+ PathState *state,
+ float3 throughput,
+ Ray *ray,
+ PathRadiance *L,
+ ccl_global float *buffer,
+ ShaderData *emission_sd)
{
- /* initialize */
- float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
- float L_transparent = 0.0f;
-
- path_radiance_init(L, kernel_data.film.use_light_pass);
-
- /* shader data memory used for both volumes and surfaces, saves stack space */
+ /* Shader data memory used for both volumes and surfaces, saves stack space. */
ShaderData sd;
- /* shader data used by emission, shadows, volume stacks */
- ShaderData emission_sd;
-
- PathState state;
- path_state_init(kg, &emission_sd, &state, rng, sample, &ray);
-
-#ifdef __KERNEL_DEBUG__
- DebugData debug_data;
- debug_data_init(&debug_data);
-#endif /* __KERNEL_DEBUG__ */
#ifdef __SUBSURFACE__
SubsurfaceIndirectRays ss_indirect;
@@ -465,265 +547,82 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
/* path iteration */
for(;;) {
- /* intersect scene */
+ /* Find intersection with objects in scene. */
Intersection isect;
- uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
- float difl = 0.0f, extmax = 0.0f;
- uint lcg_state = 0;
-
- if(kernel_data.bvh.have_curves) {
- if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
- float3 pixdiff = ray.dD.dx + ray.dD.dy;
- /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
- difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
- }
-
- extmax = kernel_data.curve.maximum_width;
- lcg_state = lcg_state_init(rng, state.rng_offset, state.sample, 0x51633e2d);
- }
-
- if(state.bounce > kernel_data.integrator.ao_bounces) {
- visibility = PATH_RAY_SHADOW;
- ray.t = kernel_data.background.ao_distance;
- }
-
- bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
- bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif /* __HAIR__ */
+ bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L);
-#ifdef __KERNEL_DEBUG__
- if(state.flag & PATH_RAY_CAMERA) {
- debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes;
- debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
- debug_data.num_bvh_intersections += isect.num_intersections;
- }
- debug_data.num_ray_bounces++;
-#endif /* __KERNEL_DEBUG__ */
-
-#ifdef __LAMP_MIS__
- if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
- /* ray starting from previous non-transparent bounce */
- Ray light_ray;
-
- light_ray.P = ray.P - state.ray_t*ray.D;
- state.ray_t += isect.t;
- light_ray.D = ray.D;
- light_ray.t = state.ray_t;
- light_ray.time = ray.time;
- light_ray.dD = ray.dD;
- light_ray.dP = ray.dP;
-
- /* intersect with lamp */
- float3 emission;
-
- if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
- path_radiance_accum_emission(L, throughput, emission, state.bounce);
- }
-#endif /* __LAMP_MIS__ */
+ /* Find intersection with lamps and compute emission for MIS. */
+ kernel_path_lamp_emission(kg, state, ray, throughput, &isect, emission_sd, L);
#ifdef __VOLUME__
- /* Sanitize volume stack. */
- if(!hit) {
- kernel_volume_clean_stack(kg, state.volume_stack);
- }
- /* volume attenuation, emission, scatter */
- if(state.volume_stack[0].shader != SHADER_NONE) {
- Ray volume_ray = ray;
- volume_ray.t = (hit)? isect.t: FLT_MAX;
-
- bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-
-# ifdef __VOLUME_DECOUPLED__
- int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
- bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
-
- if(decoupled) {
- /* cache steps along volume for repeated sampling */
- VolumeSegment volume_segment;
-
- shader_setup_from_volume(kg, &sd, &volume_ray);
- kernel_volume_decoupled_record(kg, &state,
- &volume_ray, &sd, &volume_segment, heterogeneous);
-
- volume_segment.sampling_method = sampling_method;
-
- /* emission */
- if(volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
-
- /* scattering */
- VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
-
- if(volume_segment.closure_flag & SD_SCATTER) {
- int all = false;
-
- /* direct light sampling */
- kernel_branched_path_volume_connect_light(kg, rng, &sd,
- &emission_sd, throughput, &state, L, all,
- &volume_ray, &volume_segment);
-
- /* indirect sample. if we use distance sampling and take just
- * one sample for direct and indirect light, we could share
- * this computation, but makes code a bit complex */
- float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
- float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
-
- result = kernel_volume_decoupled_scatter(kg,
- &state, &volume_ray, &sd, &throughput,
- rphase, rscatter, &volume_segment, NULL, true);
- }
-
- /* free cached steps */
- kernel_volume_decoupled_free(kg, &volume_segment);
-
- if(result == VOLUME_PATH_SCATTERED) {
- if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
- continue;
- else
- break;
- }
- else {
- throughput *= volume_segment.accum_transmittance;
- }
- }
- else
-# endif /* __VOLUME_DECOUPLED__ */
- {
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous);
-
-# ifdef __VOLUME_SCATTER__
- if(result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
-
- /* indirect light bounce */
- if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
- continue;
- else
- break;
- }
-# endif /* __VOLUME_SCATTER__ */
- }
+ /* Volume integration. */
+ VolumeIntegrateResult result = kernel_path_volume(kg,
+ &sd,
+ state,
+ ray,
+ &throughput,
+ &isect,
+ hit,
+ emission_sd,
+ L);
+
+ if(result == VOLUME_PATH_SCATTERED) {
+ continue;
+ }
+ else if(result == VOLUME_PATH_MISSED) {
+ break;
}
-#endif /* __VOLUME__ */
+#endif /* __VOLUME__*/
+ /* Shade background. */
if(!hit) {
- /* eval background shader if nothing hit */
- if(kernel_data.background.transparent && (state.flag & PATH_RAY_CAMERA)) {
- L_transparent += average(throughput);
-
-#ifdef __PASSES__
- if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif /* __PASSES__ */
- break;
- }
-
-#ifdef __BACKGROUND__
- /* sample background shader */
- float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
- path_radiance_accum_background(L, &state, throughput, L_background);
-#endif /* __BACKGROUND__ */
-
+ kernel_path_background(kg, state, ray, throughput, emission_sd, L);
break;
}
- else if(state.bounce > kernel_data.integrator.ao_bounces) {
+ else if(path_state_ao_bounce(kg, state)) {
break;
}
- /* setup shading */
- shader_setup_from_ray(kg, &sd, &isect, &ray);
- float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF);
- shader_eval_surface(kg, &sd, rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
-
-#ifdef __SHADOW_TRICKS__
- if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
- if(state.flag & PATH_RAY_CAMERA) {
- state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
- state.catcher_object = sd.object;
- if(!kernel_data.background.transparent) {
- L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
- }
- }
- }
- else {
- state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
- }
-#endif /* __SHADOW_TRICKS__ */
-
- /* holdout */
-#ifdef __HOLDOUT__
- if(((sd.flag & SD_HOLDOUT) ||
- (sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
- (state.flag & PATH_RAY_CAMERA))
+ /* Setup and evaluate shader. */
+ shader_setup_from_ray(kg, &sd, &isect, ray);
+ shader_eval_surface(kg, &sd, state, state->flag);
+ shader_prepare_closures(&sd, state);
+
+ /* Apply shadow catcher, holdout, emission. */
+ if(!kernel_path_shader_apply(kg,
+ &sd,
+ state,
+ ray,
+ throughput,
+ emission_sd,
+ L,
+ buffer))
{
- if(kernel_data.background.transparent) {
- float3 holdout_weight;
- if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) {
- holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
- }
- else {
- holdout_weight = shader_holdout_eval(kg, &sd);
- }
- /* any throughput is ok, should all be identical here */
- L_transparent += average(holdout_weight*throughput);
- }
-
- if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) {
- break;
- }
- }
-#endif /* __HOLDOUT__ */
-
- /* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
-
- /* blurring of bsdf after bounces, for rays that have a small likelihood
- * of following this particular path (diffuse, rough glossy) */
- if(kernel_data.integrator.filter_glossy != FLT_MAX) {
- float blur_pdf = kernel_data.integrator.filter_glossy*state.min_ray_pdf;
-
- if(blur_pdf < 1.0f) {
- float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
- shader_bsdf_blur(kg, &sd, blur_roughness);
- }
- }
-
-#ifdef __EMISSION__
- /* emission */
- if(sd.flag & SD_EMISSION) {
- /* todo: is isect.t wrong here for transparent surfaces? */
- float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
- path_radiance_accum_emission(L, throughput, emission, state.bounce);
+ break;
}
-#endif /* __EMISSION__ */
/* path termination. this is a strange place to put the termination, it's
* mainly due to the mixed in MIS that we use. gives too many unneeded
* shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_terminate_probability(kg, &state, throughput);
+ float probability = path_state_continuation_probability(kg, state, throughput);
if(probability == 0.0f) {
break;
}
else if(probability != 1.0f) {
- float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
+ float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
if(terminate >= probability)
break;
throughput /= probability;
}
- kernel_update_denoising_features(kg, &sd, &state, L);
+ kernel_update_denoising_features(kg, &sd, state, L);
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
- kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
+ kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd));
}
#endif /* __AO__ */
@@ -733,11 +632,10 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
if(sd.flag & SD_BSSRDF) {
if(kernel_path_subsurface_scatter(kg,
&sd,
- &emission_sd,
+ emission_sd,
L,
- &state,
- rng,
- &ray,
+ state,
+ ray,
&throughput,
&ss_indirect))
{
@@ -747,24 +645,22 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
#endif /* __SUBSURFACE__ */
/* direct lighting */
- kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
+ kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L);
/* compute direct lighting and next bounce */
- if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
+ if(!kernel_path_surface_bounce(kg, &sd, &throughput, state, &L->state, ray))
break;
}
#ifdef __SUBSURFACE__
- kernel_path_subsurface_accum_indirect(&ss_indirect, L);
-
/* Trace indirect subsurface rays by restarting the loop. this uses less
* stack memory than invoking kernel_path_indirect.
*/
if(ss_indirect.num_rays) {
kernel_path_subsurface_setup_indirect(kg,
&ss_indirect,
- &state,
- &ray,
+ state,
+ ray,
L,
&throughput);
}
@@ -773,16 +669,6 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
}
}
#endif /* __SUBSURFACE__ */
-
-#ifdef __SHADOW_TRICKS__
- *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
-#endif /* __SHADOW_TRICKS__ */
-
-#ifdef __KERNEL_DEBUG__
- kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
-#endif /* __KERNEL_DEBUG__ */
-
- return 1.0f - L_transparent;
}
ccl_device void kernel_path_trace(KernelGlobals *kg,
@@ -796,25 +682,37 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
rng_state += index;
buffer += index*pass_stride;
- /* initialize random numbers and ray */
- RNG rng;
+ /* Initialize random numbers and sample ray. */
+ uint rng_hash;
Ray ray;
- kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
-
- /* integrate */
- PathRadiance L;
- bool is_shadow_catcher;
+ kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, &ray);
- if(ray.t != 0.0f) {
- float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
- kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
- }
- else {
- kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
+ if(ray.t == 0.0f) {
+ kernel_write_result(kg, buffer, sample, NULL);
+ return;
}
- path_rng_end(kg, rng_state, rng);
+ /* Initialize state. */
+ float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+
+ PathRadiance L;
+ path_radiance_init(&L, kernel_data.film.use_light_pass);
+
+ ShaderData emission_sd;
+ PathState state;
+ path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray);
+
+ /* Integrate. */
+ kernel_path_integrate(kg,
+ &state,
+ throughput,
+ &ray,
+ &L,
+ buffer,
+ &emission_sd);
+
+ kernel_write_result(kg, buffer, sample, &L);
}
#endif /* __SPLIT_KERNEL__ */
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index 10816d3e5d1..6e0ec22d581 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -23,7 +23,6 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
ShaderData *emission_sd,
PathRadiance *L,
ccl_addr_space PathState *state,
- RNG *rng,
float3 throughput)
{
int num_samples = kernel_data.integrator.ao_samples;
@@ -35,7 +34,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
for(int j = 0; j < num_samples; j++) {
float bsdf_u, bsdf_v;
- path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+ path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
float3 ao_D;
float ao_pdf;
@@ -49,13 +48,11 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
light_ray.P = ray_offset(sd->P, sd->Ng);
light_ray.D = ao_D;
light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
light_ray.time = sd->time;
-#endif /* __OBJECT_MOTION__ */
light_ray.dP = sd->dP;
light_ray.dD = differential3_zero();
- if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
+ if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
}
else {
@@ -69,7 +66,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
/* bounce off surface and integrate indirect light */
ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
- RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
+ ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
{
float sum_sample_weight = 0.0f;
@@ -113,35 +110,38 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
num_samples = ceil_to_int(num_samples_adjust*num_samples);
float num_samples_inv = num_samples_adjust/num_samples;
- RNG bsdf_rng = cmj_hash(*rng, i);
for(int j = 0; j < num_samples; j++) {
PathState ps = *state;
float3 tp = throughput;
Ray bsdf_ray;
+#ifdef __SHADOW_TRICKS__
+ float shadow_transparency = L->shadow_transparency;
+#endif
+
+ ps.rng_hash = cmj_hash(state->rng_hash, i);
if(!kernel_branched_path_surface_bounce(kg,
- &bsdf_rng,
sd,
sc,
j,
num_samples,
&tp,
&ps,
- L,
+ &L->state,
&bsdf_ray,
sum_sample_weight))
{
continue;
}
+ ps.rng_hash = state->rng_hash;
+
kernel_path_indirect(kg,
indirect_sd,
emission_sd,
- rng,
&bsdf_ray,
tp*num_samples_inv,
- num_samples,
&ps,
L);
@@ -149,6 +149,10 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
* for the next samples */
path_radiance_sum_indirect(L);
path_radiance_reset_indirect(L);
+
+#ifdef __SHADOW_TRICKS__
+ L->shadow_transparency = shadow_transparency;
+#endif
}
}
}
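The per-closure sampling above now derives its random streams from the path hash: each closure index i is mixed in with cmj_hash() for the bounce sample, and the parent hash is restored before recursing so the indirect path continues in the parent's stream. A hedged sketch of that pattern; num_closures and the elided calls are placeholders, not kernel code:

uint parent_hash = state->rng_hash;
for(int i = 0; i < num_closures; i++) {
	PathState ps = *state;
	/* Decorrelate the bounce sample of closure i from its siblings. */
	ps.rng_hash = cmj_hash(parent_hash, i);
	/* ... kernel_branched_path_surface_bounce(..., &ps, ...) ... */
	/* The indirect continuation reuses the parent stream. */
	ps.rng_hash = parent_hash;
	/* ... kernel_path_indirect(..., &ps, L) ... */
}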
@@ -160,7 +164,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
ShaderData *emission_sd,
PathRadiance *L,
PathState *state,
- RNG *rng,
Ray *ray,
float3 throughput)
{
@@ -171,17 +174,17 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
continue;
/* set up random number generator */
- uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb);
+ uint lcg_state = lcg_state_init(state, 0x68bc21eb);
int num_samples = kernel_data.integrator.subsurface_samples;
float num_samples_inv = 1.0f/num_samples;
- RNG bssrdf_rng = cmj_hash(*rng, i);
+ uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i);
/* do subsurface scatter step with copy of shader data, this will
* replace the BSSRDF with a diffuse BSDF closure */
for(int j = 0; j < num_samples; j++) {
SubsurfaceIntersection ss_isect;
float bssrdf_u, bssrdf_v;
- path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+ path_branched_rng_2D(kg, bssrdf_rng_hash, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
int num_hits = subsurface_scatter_multi_intersect(kg,
&ss_isect,
sd,
@@ -234,7 +237,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
(state->flag & PATH_RAY_SHADOW_CATCHER);
kernel_branched_path_surface_connect_light(
kg,
- rng,
&bssrdf_sd,
emission_sd,
&hit_state,
@@ -248,7 +250,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
/* indirect light */
kernel_branched_path_surface_indirect_light(
kg,
- rng,
&bssrdf_sd,
indirect_sd,
emission_sd,
@@ -262,17 +263,15 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
-ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
- RNG *rng,
- int sample,
- Ray ray,
- ccl_global float *buffer,
- PathRadiance *L,
- bool *is_shadow_catcher)
+ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
+ uint rng_hash,
+ int sample,
+ Ray ray,
+ ccl_global float *buffer,
+ PathRadiance *L)
{
/* initialize */
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
- float L_transparent = 0.0f;
path_radiance_init(L, kernel_data.film.use_light_pass);
@@ -282,48 +281,16 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
ShaderData emission_sd, indirect_sd;
PathState state;
- path_state_init(kg, &emission_sd, &state, rng, sample, &ray);
-
-#ifdef __KERNEL_DEBUG__
- DebugData debug_data;
- debug_data_init(&debug_data);
-#endif /* __KERNEL_DEBUG__ */
+ path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray);
/* Main Loop
* Here we only handle transparency intersections from the camera ray.
* Indirect bounces are handled in kernel_branched_path_surface_indirect_light().
*/
for(;;) {
- /* intersect scene */
+ /* Find intersection with objects in scene. */
Intersection isect;
- uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
- float difl = 0.0f, extmax = 0.0f;
- uint lcg_state = 0;
-
- if(kernel_data.bvh.have_curves) {
- if(kernel_data.cam.resolution == 1) {
- float3 pixdiff = ray.dD.dx + ray.dD.dy;
- /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
- difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
- }
-
- extmax = kernel_data.curve.maximum_width;
- lcg_state = lcg_state_init(rng, state.rng_offset, state.sample, 0x51633e2d);
- }
-
- bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
- bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif /* __HAIR__ */
-
-#ifdef __KERNEL_DEBUG__
- debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes;
- debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
- debug_data.num_bvh_intersections += isect.num_intersections;
- debug_data.num_ray_bounces++;
-#endif /* __KERNEL_DEBUG__ */
+ bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L);
#ifdef __VOLUME__
/* Sanitize volume stack. */
@@ -353,7 +320,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
int all = kernel_data.integrator.sample_all_lights_direct;
- kernel_branched_path_volume_connect_light(kg, rng, &sd,
+ kernel_branched_path_volume_connect_light(kg, &sd,
&emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
@@ -372,30 +339,25 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
/* scatter sample. if we use distance sampling and take just one
* sample for direct and indirect light, we could share this
* computation, but makes code a bit complex */
- float rphase = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_PHASE);
- float rscatter = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_SCATTER_DISTANCE);
+ float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL);
+ float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE);
VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
&ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
- (void)result;
- kernel_assert(result == VOLUME_PATH_SCATTERED);
-
- if(kernel_path_volume_bounce(kg,
- rng,
+ if(result == VOLUME_PATH_SCATTERED &&
+ kernel_path_volume_bounce(kg,
&sd,
&tp,
&ps,
- L,
+ &L->state,
&pray))
{
kernel_path_indirect(kg,
&indirect_sd,
&emission_sd,
- rng,
&pray,
tp*num_samples_inv,
- num_samples,
&ps,
L);
@@ -409,7 +371,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
/* emission and transmittance */
if(volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
+ path_radiance_accum_emission(L, &state, throughput, volume_segment.accum_emission);
throughput *= volume_segment.accum_transmittance;
/* free cached steps */
@@ -431,29 +393,26 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
path_state_branch(&ps, j, num_samples);
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous);
+ kg, &ps, &sd, &volume_ray, L, &tp, heterogeneous);
#ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* todo: support equiangular, MIS and all light sampling.
* alternatively get decoupled ray marching working on the GPU */
- kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L);
+ kernel_path_volume_connect_light(kg, &sd, &emission_sd, tp, &state, L);
if(kernel_path_volume_bounce(kg,
- rng,
&sd,
&tp,
&ps,
- L,
+ &L->state,
&pray))
{
kernel_path_indirect(kg,
&indirect_sd,
&emission_sd,
- rng,
&pray,
tp,
- num_samples,
&ps,
L);
@@ -472,89 +431,42 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
}
#endif /* __VOLUME__ */
+ /* Shade background. */
if(!hit) {
- /* eval background shader if nothing hit */
- if(kernel_data.background.transparent) {
- L_transparent += average(throughput);
-
-#ifdef __PASSES__
- if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif /* __PASSES__ */
- break;
- }
-
-#ifdef __BACKGROUND__
- /* sample background shader */
- float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
- path_radiance_accum_background(L, &state, throughput, L_background);
-#endif /* __BACKGROUND__ */
-
+ kernel_path_background(kg, &state, &ray, throughput, &emission_sd, L);
break;
}
- /* setup shading */
+ /* Setup and evaluate shader. */
shader_setup_from_ray(kg, &sd, &isect, &ray);
- shader_eval_surface(kg, &sd, rng, &state, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
+ shader_eval_surface(kg, &sd, &state, state.flag);
shader_merge_closures(&sd);
-#ifdef __SHADOW_TRICKS__
- if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
- if(state.flag & PATH_RAY_CAMERA) {
- state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
- state.catcher_object = sd.object;
- if(!kernel_data.background.transparent) {
- L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
- }
- }
- }
- else {
- state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
- }
-#endif /* __SHADOW_TRICKS__ */
-
- /* holdout */
-#ifdef __HOLDOUT__
- if((sd.flag & SD_HOLDOUT) || (sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) {
- if(kernel_data.background.transparent) {
- float3 holdout_weight;
- if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) {
- holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
- }
- else {
- holdout_weight = shader_holdout_eval(kg, &sd);
- }
- /* any throughput is ok, should all be identical here */
- L_transparent += average(holdout_weight*throughput);
- }
- if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) {
- break;
- }
- }
-#endif /* __HOLDOUT__ */
-
- /* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
-
-#ifdef __EMISSION__
- /* emission */
- if(sd.flag & SD_EMISSION) {
- float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
- path_radiance_accum_emission(L, throughput, emission, state.bounce);
+ /* Apply shadow catcher, holdout, emission. */
+ if(!kernel_path_shader_apply(kg,
+ &sd,
+ &state,
+ &ray,
+ throughput,
+ &emission_sd,
+ L,
+ buffer))
+ {
+ break;
}
-#endif /* __EMISSION__ */
/* transparency termination */
if(state.flag & PATH_RAY_TRANSPARENT) {
/* path termination. this is a strange place to put the termination, it's
* mainly due to the mixed in MIS that we use. gives too many unneeded
* shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_terminate_probability(kg, &state, throughput);
+ float probability = path_state_continuation_probability(kg, &state, throughput);
if(probability == 0.0f) {
break;
}
else if(probability != 1.0f) {
- float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
+ float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE);
if(terminate >= probability)
break;
@@ -568,7 +480,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
- kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput);
+ kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, throughput);
}
#endif /* __AO__ */
@@ -576,7 +488,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
/* bssrdf scatter to a different location on the same object */
if(sd.flag & SD_BSSRDF) {
kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
- L, &state, rng, &ray, throughput);
+ L, &state, &ray, throughput);
}
#endif /* __SUBSURFACE__ */
@@ -588,13 +500,13 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
if(kernel_data.integrator.use_direct_light) {
int all = (kernel_data.integrator.sample_all_lights_direct) ||
(state.flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(kg, rng,
+ kernel_branched_path_surface_connect_light(kg,
&sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
}
#endif /* __EMISSION__ */
/* indirect light */
- kernel_branched_path_surface_indirect_light(kg, rng,
+ kernel_branched_path_surface_indirect_light(kg,
&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
/* continue in case of transparency */
@@ -623,16 +535,6 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
#endif /* __VOLUME__ */
}
-
-#ifdef __SHADOW_TRICKS__
- *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
-#endif /* __SHADOW_TRICKS__ */
-
-#ifdef __KERNEL_DEBUG__
- kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
-#endif /* __KERNEL_DEBUG__ */
-
- return 1.0f - L_transparent;
}
ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
@@ -647,24 +549,21 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
buffer += index*pass_stride;
/* initialize random numbers and ray */
- RNG rng;
+ uint rng_hash;
Ray ray;
- kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
+ kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, &ray);
/* integrate */
PathRadiance L;
- bool is_shadow_catcher;
if(ray.t != 0.0f) {
- float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
- kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
+ kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L);
+ kernel_write_result(kg, buffer, sample, &L);
}
else {
- kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
+ kernel_write_result(kg, buffer, sample, NULL);
}
-
- path_rng_end(kg, rng_state, rng);
}
#endif /* __SPLIT_KERNEL__ */
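Both entry points above now follow the same pattern: path_rng_init() derives a per-pixel rng_hash once, the hash is stored in PathState, and there is no mutable RNG object to thread through calls or to finalize with path_rng_end(). Every decision is addressed by (rng_hash, sample, rng_offset + dimension), which is what lets the *_rng_* helpers in the files below drop the RNG pointer. A hedged, self-contained sketch of the idea; the hash below is illustrative and not the kernel's actual Sobol/CMJ code:

/* Illustrative only: a stateless sample is fully determined by the per-pixel
 * hash, the sample index and the dimension, so nothing needs to be carried
 * between bounces except an integer offset. */
static inline float stateless_rng_1D(unsigned int rng_hash,
                                     unsigned int sample,
                                     unsigned int dimension)
{
	unsigned int h = rng_hash ^ (dimension * 0x9e3779b9u) ^ (sample * 0x85ebca6bu);
	h ^= h >> 16; h *= 0x7feb352du;
	h ^= h >> 15; h *= 0x846ca68bu;
	h ^= h >> 16;
	return (float)h * (1.0f / 4294967296.0f);
}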
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
index 82f83deb595..54dd278a185 100644
--- a/intern/cycles/kernel/kernel_path_common.h
+++ b/intern/cycles/kernel/kernel_path_common.h
@@ -22,7 +22,7 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
ccl_global uint *rng_state,
int sample,
int x, int y,
- RNG *rng,
+ uint *rng_hash,
ccl_addr_space Ray *ray)
{
float filter_u;
@@ -34,20 +34,20 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
*rng_state = hash_int_2d(x, y);
}
- path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
+ path_rng_init(kg, rng_state, sample, num_samples, rng_hash, x, y, &filter_u, &filter_v);
/* sample camera ray */
float lens_u = 0.0f, lens_v = 0.0f;
if(kernel_data.cam.aperturesize > 0.0f)
- path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
+ path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
float time = 0.0f;
#ifdef __CAMERA_MOTION__
if(kernel_data.cam.shuttertime != -1.0f)
- time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
+ time = path_rng_1D(kg, *rng_hash, sample, num_samples, PRNG_TIME);
#endif
camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index 5d92fd12201..eccee54c0e3 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -19,15 +19,17 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline void path_state_init(KernelGlobals *kg,
ShaderData *stack_sd,
ccl_addr_space PathState *state,
- RNG *rng,
+ uint rng_hash,
int sample,
ccl_addr_space Ray *ray)
{
state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP;
+ state->rng_hash = rng_hash;
state->rng_offset = PRNG_BASE_NUM;
state->sample = sample;
state->num_samples = kernel_data.integrator.aa_samples;
+ state->branch_factor = 1.0f;
state->bounce = 0;
state->diffuse_bounce = 0;
@@ -58,16 +60,12 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
/* Initialize volume stack with volume we are inside of. */
kernel_volume_stack_init(kg, stack_sd, state, ray, state->volume_stack);
 /* Seed RNG for cases where we can't use stratified samples. */
- state->rng_congruential = lcg_init(*rng + sample*0x51633e2d);
+ state->rng_congruential = lcg_init(rng_hash + sample*0x51633e2d);
}
else {
state->volume_stack[0].shader = SHADER_NONE;
}
#endif
-
-#ifdef __SHADOW_TRICKS__
- state->catcher_object = OBJECT_NONE;
-#endif
}
ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathState *state, int label)
@@ -78,12 +76,12 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
state->flag |= PATH_RAY_TRANSPARENT;
state->transparent_bounce++;
- /* don't increase random number generator offset here, to avoid some
- * unwanted patterns, see path_state_rng_1D_for_decision */
-
if(!kernel_data.integrator.transparent_shadows)
state->flag |= PATH_RAY_MIS_SKIP;
+ /* random number generator next bounce */
+ state->rng_offset += PRNG_BOUNCE_NUM;
+
return;
}
@@ -146,7 +144,7 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
#endif
}
-ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
+ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, ccl_addr_space PathState *state)
{
uint flag = state->flag & PATH_RAY_ALL_VISIBILITY;
@@ -160,17 +158,28 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s
return flag;
}
-ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput)
+ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg,
+ ccl_addr_space PathState *state,
+ const float3 throughput)
{
if(state->flag & PATH_RAY_TRANSPARENT) {
- /* transparent rays treated separately */
- if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce)
+ /* Transparent rays are treated separately with own max bounces. */
+ if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
return 0.0f;
- else if(state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce)
+ }
+ /* Do at least one bounce without RR. */
+ else if(state->transparent_bounce <= 1) {
return 1.0f;
+ }
+#ifdef __SHADOW_TRICKS__
+ /* Exception for shadow catcher not working correctly with RR. */
+ else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) {
+ return 1.0f;
+ }
+#endif
}
else {
- /* other rays */
+ /* Test max bounces for various ray types. */
if((state->bounce >= kernel_data.integrator.max_bounce) ||
(state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) ||
(state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) ||
@@ -181,13 +190,21 @@ ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_
{
return 0.0f;
}
- else if(state->bounce <= kernel_data.integrator.min_bounce) {
+ /* Do at least one bounce without RR. */
+ else if(state->bounce <= 1) {
return 1.0f;
}
+#ifdef __SHADOW_TRICKS__
+ /* Exception for shadow catcher not working correctly with RR. */
+ else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) {
+ return 1.0f;
+ }
+#endif
}
- /* probalistic termination */
- return average(throughput); /* todo: try using max here */
+ /* Probabilistic termination: use sqrt() to roughly match typical view
+ * transform and do path termination a bit later on average. */
+ return min(sqrtf(max3(fabs(throughput)) * state->branch_factor), 1.0f);
}
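The new return value changes the heuristic in two ways: it uses the brightest throughput channel instead of the average, and it applies a square root so termination happens a bit later on average, as the comment notes; branch_factor compensates for throughput that was divided among branched samples. A short worked example with values picked for illustration:

/* Worked example (illustrative values; max3/fabs/average as used in the kernel). */
float3 throughput = make_float3(0.25f, 0.10f, 0.05f);
float p_old = average(throughput);                             /* ~0.133 */
float p_new = min(sqrtf(max3(fabs(throughput)) * 1.0f), 1.0f); /* branch_factor == 1 -> sqrt(0.25) = 0.5 */
/* Survivors are rescaled by 1/p, so the estimator stays unbiased while dim
 * paths are killed less aggressively than before. */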
/* TODO(DingTo): Find more meaningful name for this */
@@ -200,5 +217,30 @@ ccl_device_inline void path_state_modify_bounce(ccl_addr_space PathState *state,
state->bounce -= 1;
}
+ccl_device_inline bool path_state_ao_bounce(KernelGlobals *kg, ccl_addr_space PathState *state)
+{
+ if(state->bounce <= kernel_data.integrator.ao_bounces) {
+ return false;
+ }
+
+ int bounce = state->bounce - state->transmission_bounce - (state->glossy_bounce > 0);
+ return (bounce > kernel_data.integrator.ao_bounces);
+}
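For example, with ao_bounces = 2 the early return above skips the test for the first two bounces; a camera path that went glossy, then diffuse, then diffuse has state->bounce == 3, no transmission bounces and glossy_bounce > 0, so the adjusted count is 3 - 0 - 1 = 2 and the path is not yet cut short, while one more diffuse bounce makes it 3 > 2 and the path falls back to the simplified AO treatment.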
+
+ccl_device_inline void path_state_branch(ccl_addr_space PathState *state,
+ int branch,
+ int num_branches)
+{
+ state->rng_offset += PRNG_BOUNCE_NUM;
+
+ if(num_branches > 1) {
+ /* Path is splitting into a branch, adjust so that each branch
+ * still gets a unique sample from the same sequence. */
+ state->sample = state->sample*num_branches + branch;
+ state->num_samples = state->num_samples*num_branches;
+ state->branch_factor *= num_branches;
+ }
+}
+
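A quick worked example of the bookkeeping in path_state_branch(): splitting sample 5 of 16 into 4 branches and taking branch 2 gives sample 5*4 + 2 = 22 out of 16*4 = 64, so each branch keeps drawing distinct stratified values from the same sequence, and branch_factor records the split so the continuation probability above can compensate for the per-branch throughput reduction. In code, with the starting values assumed for illustration:

PathState ps = *state;   /* assume ps.sample == 5, ps.num_samples == 16 */
path_state_branch(&ps, 2, 4);
/* ps.sample == 22, ps.num_samples == 64, ps.branch_factor scaled by 4,
 * ps.rng_offset advanced by PRNG_BOUNCE_NUM. */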
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h
index 10b568ac3dd..1436e8e5a5b 100644
--- a/intern/cycles/kernel/kernel_path_subsurface.h
+++ b/intern/cycles/kernel/kernel_path_subsurface.h
@@ -28,16 +28,14 @@ bool kernel_path_subsurface_scatter(
ShaderData *emission_sd,
PathRadiance *L,
ccl_addr_space PathState *state,
- RNG *rng,
ccl_addr_space Ray *ray,
ccl_addr_space float3 *throughput,
ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
{
- float bssrdf_probability;
- ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+ float bssrdf_u, bssrdf_v;
+ path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
- /* modify throughput for picking bssrdf or bsdf */
- *throughput *= bssrdf_probability;
+ const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
/* do bssrdf scatter step if we picked a bssrdf closure */
if(sc) {
@@ -45,13 +43,11 @@ bool kernel_path_subsurface_scatter(
* the second one should be converted to a diffuse BSDF to
* avoid this.
*/
- kernel_assert(!ss_indirect->tracing);
+ kernel_assert(!(state->flag & PATH_RAY_DIFFUSE_ANCESTOR));
- uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb);
+ uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
SubsurfaceIntersection ss_isect;
- float bssrdf_u, bssrdf_v;
- path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
int num_hits = subsurface_scatter_multi_intersect(kg,
&ss_isect,
sd,
@@ -60,7 +56,7 @@ bool kernel_path_subsurface_scatter(
bssrdf_u, bssrdf_v,
false);
# ifdef __VOLUME__
- ss_indirect->need_update_volume_stack =
+ bool need_update_volume_stack =
kernel_data.integrator.use_volumes &&
sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
# endif /* __VOLUME__ */
@@ -79,29 +75,25 @@ bool kernel_path_subsurface_scatter(
sc,
false);
+ kernel_path_surface_connect_light(kg, sd, emission_sd, *throughput, state, L);
+
ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays];
ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
- PathRadiance *hit_L = &ss_indirect->L[ss_indirect->num_rays];
+ PathRadianceState *hit_L_state = &ss_indirect->L_state[ss_indirect->num_rays];
*hit_state = *state;
*hit_ray = *ray;
*hit_tp = *throughput;
+ *hit_L_state = L->state;
hit_state->rng_offset += PRNG_BOUNCE_NUM;
- path_radiance_init(hit_L, kernel_data.film.use_light_pass);
- hit_L->direct_throughput = L->direct_throughput;
- path_radiance_copy_indirect(hit_L, L);
-
- kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L);
-
if(kernel_path_surface_bounce(kg,
- rng,
sd,
hit_tp,
hit_state,
- hit_L,
+ hit_L_state,
hit_ray))
{
# ifdef __LAMP_MIS__
@@ -109,7 +101,7 @@ bool kernel_path_subsurface_scatter(
# endif /* __LAMP_MIS__ */
# ifdef __VOLUME__
- if(ss_indirect->need_update_volume_stack) {
+ if(need_update_volume_stack) {
Ray volume_ray = *ray;
/* Setup ray from previous surface point to the new one. */
volume_ray.D = normalize_len(hit_ray->P - volume_ray.P,
@@ -122,12 +114,8 @@ bool kernel_path_subsurface_scatter(
hit_state->volume_stack);
}
# endif /* __VOLUME__ */
- path_radiance_reset_indirect(L);
ss_indirect->num_rays++;
}
- else {
- path_radiance_accum_sample(L, hit_L, 1);
- }
}
return true;
}
@@ -137,23 +125,9 @@ bool kernel_path_subsurface_scatter(
ccl_device_inline void kernel_path_subsurface_init_indirect(
ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
{
- ss_indirect->tracing = false;
ss_indirect->num_rays = 0;
}
-ccl_device void kernel_path_subsurface_accum_indirect(
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect,
- PathRadiance *L)
-{
- if(ss_indirect->tracing) {
- path_radiance_sum_indirect(L);
- path_radiance_accum_sample(&ss_indirect->direct_L, L, 1);
- if(ss_indirect->num_rays == 0) {
- *L = ss_indirect->direct_L;
- }
- }
-}
-
ccl_device void kernel_path_subsurface_setup_indirect(
KernelGlobals *kg,
ccl_addr_space SubsurfaceIndirectRays *ss_indirect,
@@ -162,20 +136,15 @@ ccl_device void kernel_path_subsurface_setup_indirect(
PathRadiance *L,
ccl_addr_space float3 *throughput)
{
- if(!ss_indirect->tracing) {
- ss_indirect->direct_L = *L;
- }
- ss_indirect->tracing = true;
-
/* Setup state, ray and throughput for indirect SSS rays. */
ss_indirect->num_rays--;
- ccl_addr_space Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays];
- PathRadiance *indirect_L = &ss_indirect->L[ss_indirect->num_rays];
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
*state = ss_indirect->state[ss_indirect->num_rays];
- *ray = *indirect_ray;
- *L = *indirect_L;
+ *ray = ss_indirect->rays[ss_indirect->num_rays];
+ L->state = ss_indirect->L_state[ss_indirect->num_rays];
*throughput = ss_indirect->throughputs[ss_indirect->num_rays];
state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM;
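With the per-ray PathRadiance copies removed, the queued indirect SSS rays behave like a small stack of (state, ray, throughput, L->state) entries: kernel_path_subsurface_scatter() pushes them, and kernel_path_subsurface_setup_indirect() pops one per outer iteration of the main path loop. A hedged sketch of the driver pattern, mirroring the call sites in kernel_path_integrate() above:

SubsurfaceIndirectRays ss_indirect;
kernel_path_subsurface_init_indirect(&ss_indirect);

for(;;) {
	/* ... trace one path; SD_BSSRDF hits may push entries via
	 *     kernel_path_subsurface_scatter() ... */

	if(ss_indirect.num_rays) {
		/* Pop the next stored SSS ray and restart the path loop with it. */
		kernel_path_subsurface_setup_indirect(kg, &ss_indirect, state, ray, L, &throughput);
	}
	else {
		break;
	}
}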
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index dcb577e176f..7b566b01b04 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -20,7 +20,6 @@ CCL_NAMESPACE_BEGIN
/* branched path tracing: connect path directly to position on one or more lights and add it to L */
ccl_device_noinline void kernel_branched_path_surface_connect_light(
KernelGlobals *kg,
- RNG *rng,
ShaderData *sd,
ShaderData *emission_sd,
ccl_addr_space PathState *state,
@@ -50,12 +49,12 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
- RNG lamp_rng = cmj_hash(*rng, i);
+ uint lamp_rng_hash = cmj_hash(state->rng_hash, i);
for(int j = 0; j < num_samples; j++) {
float light_u, light_v;
- path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
- float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples);
+ path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+ float terminate = path_branched_rng_light_termination(kg, lamp_rng_hash, state, j, num_samples);
LightSample ls;
if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
@@ -68,7 +67,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
/* trace shadow ray */
float3 shadow;
- if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+ if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
@@ -86,17 +85,16 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
float num_samples_inv = num_samples_adjust/num_samples;
for(int j = 0; j < num_samples; j++) {
- float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
float light_u, light_v;
- path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
- float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+ path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+ float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples);
/* only sample triangle lights */
if(kernel_data.integrator.num_all_lights)
- light_t = 0.5f*light_t;
+ light_u = 0.5f*light_u;
LightSample ls;
- if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+ if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
/* Same as above, probability needs to be corrected since the sampling was forced to select a mesh light. */
if(kernel_data.integrator.num_all_lights)
ls.pdf *= 2.0f;
@@ -105,7 +103,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
/* trace shadow ray */
float3 shadow;
- if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+ if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
@@ -119,19 +117,18 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
}
else {
/* sample one light at random */
- float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
float light_u, light_v;
- path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
- float terminate = path_state_rng_light_termination(kg, rng, state);
+ path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
+ float terminate = path_state_rng_light_termination(kg, state);
LightSample ls;
- if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+ if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
/* sample random light */
if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
/* trace shadow ray */
float3 shadow;
- if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+ if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp);
}
@@ -147,14 +144,13 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
/* branched path tracing: bounce off or through surface with new direction stored in ray */
ccl_device bool kernel_branched_path_surface_bounce(
KernelGlobals *kg,
- RNG *rng,
ShaderData *sd,
const ShaderClosure *sc,
int sample,
int num_samples,
ccl_addr_space float3 *throughput,
ccl_addr_space PathState *state,
- PathRadiance *L,
+ PathRadianceState *L_state,
ccl_addr_space Ray *ray,
float sum_sample_weight)
{
@@ -164,7 +160,7 @@ ccl_device bool kernel_branched_path_surface_bounce(
float3 bsdf_omega_in;
differential3 bsdf_domega_in;
float bsdf_u, bsdf_v;
- path_branched_rng_2D(kg, rng, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+ path_branched_rng_2D(kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
int label;
label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval,
@@ -174,7 +170,7 @@ ccl_device bool kernel_branched_path_surface_bounce(
return false;
/* modify throughput */
- path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+ path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
#ifdef __DENOISING_FEATURES__
state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
@@ -217,7 +213,7 @@ ccl_device bool kernel_branched_path_surface_bounce(
#endif
/* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
+ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg,
ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state,
PathRadiance *L)
{
@@ -228,7 +224,6 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
#ifdef __SHADOW_TRICKS__
if(state->flag & PATH_RAY_SHADOW_CATCHER) {
kernel_branched_path_surface_connect_light(kg,
- rng,
sd,
emission_sd,
state,
@@ -241,9 +236,8 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
#endif
/* sample illumination from lights to find path contribution */
- float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
float light_u, light_v;
- path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+ path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
Ray light_ray;
BsdfEval L_light;
@@ -254,13 +248,13 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
#endif
LightSample ls;
- if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- float terminate = path_state_rng_light_termination(kg, rng, state);
+ if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+ float terminate = path_state_rng_light_termination(kg, state);
if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
/* trace shadow ray */
float3 shadow;
- if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+ if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
@@ -274,11 +268,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
/* path tracing: bounce off or through surface with new direction stored in ray */
ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
- RNG *rng,
ShaderData *sd,
ccl_addr_space float3 *throughput,
ccl_addr_space PathState *state,
- PathRadiance *L,
+ PathRadianceState *L_state,
ccl_addr_space Ray *ray)
{
/* no BSDF? we can stop here */
@@ -289,7 +282,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
float3 bsdf_omega_in;
differential3 bsdf_domega_in;
float bsdf_u, bsdf_v;
- path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+ path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
int label;
label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval,
@@ -299,7 +292,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
return false;
/* modify throughput */
- path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+ path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
/* set labels */
if(!(label & LABEL_TRANSPARENT)) {
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index dcedf51e479..b6a856baf24 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -20,7 +20,6 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_path_volume_connect_light(
KernelGlobals *kg,
- RNG *rng,
ShaderData *sd,
ShaderData *emission_sd,
float3 throughput,
@@ -32,9 +31,8 @@ ccl_device_inline void kernel_path_volume_connect_light(
return;
/* sample illumination from lights to find path contribution */
- float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
float light_u, light_v;
- path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+ path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
Ray light_ray;
BsdfEval L_light;
@@ -42,18 +40,16 @@ ccl_device_inline void kernel_path_volume_connect_light(
bool is_lamp;
/* connect to light from given point where shader has been evaluated */
-# ifdef __OBJECT_MOTION__
light_ray.time = sd->time;
-# endif
- if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls))
+ if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls))
{
- float terminate = path_state_rng_light_termination(kg, rng, state);
+ float terminate = path_state_rng_light_termination(kg, state);
if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
/* trace shadow ray */
float3 shadow;
- if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+ if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
@@ -69,11 +65,10 @@ ccl_device
#endif
bool kernel_path_volume_bounce(
KernelGlobals *kg,
- RNG *rng,
ShaderData *sd,
ccl_addr_space float3 *throughput,
ccl_addr_space PathState *state,
- PathRadiance *L,
+ PathRadianceState *L_state,
ccl_addr_space Ray *ray)
{
/* sample phase function */
@@ -82,7 +77,7 @@ bool kernel_path_volume_bounce(
float3 phase_omega_in;
differential3 phase_domega_in;
float phase_u, phase_v;
- path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v);
+ path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v);
int label;
label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval,
@@ -92,7 +87,7 @@ bool kernel_path_volume_bounce(
return false;
/* modify throughput */
- path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label);
+ path_radiance_bsdf_bounce(kg, L_state, throughput, &phase_eval, phase_pdf, state->bounce, label);
/* set labels */
state->ray_pdf = phase_pdf;
@@ -120,7 +115,6 @@ bool kernel_path_volume_bounce(
#ifndef __SPLIT_KERNEL__
ccl_device void kernel_branched_path_volume_connect_light(
KernelGlobals *kg,
- RNG *rng,
ShaderData *sd,
ShaderData *emission_sd,
float3 throughput,
@@ -138,9 +132,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
BsdfEval L_light;
bool is_lamp;
-# ifdef __OBJECT_MOTION__
light_ray.time = sd->time;
-# endif
if(sample_all_lights) {
/* lamp sampling */
@@ -150,12 +142,12 @@ ccl_device void kernel_branched_path_volume_connect_light(
int num_samples = light_select_num_samples(kg, i);
float num_samples_inv = 1.0f/(num_samples*kernel_data.integrator.num_all_lights);
- RNG lamp_rng = cmj_hash(*rng, i);
+ uint lamp_rng_hash = cmj_hash(state->rng_hash, i);
for(int j = 0; j < num_samples; j++) {
/* sample random position on given light */
float light_u, light_v;
- path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+ path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
LightSample ls;
lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls);
@@ -163,26 +155,24 @@ ccl_device void kernel_branched_path_volume_connect_light(
float3 tp = throughput;
/* sample position on volume segment */
- float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
- float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+ float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
+ float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
- (void)result;
- kernel_assert(result == VOLUME_PATH_SCATTERED);
-
/* todo: split up light_sample so we don't have to call it again with new position */
- if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
+ if(result == VOLUME_PATH_SCATTERED &&
+ lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
if(kernel_data.integrator.pdf_triangles != 0.0f)
ls.pdf *= 2.0f;
- float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+ float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples);
if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
/* trace shadow ray */
float3 shadow;
- if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+ if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
@@ -198,40 +188,37 @@ ccl_device void kernel_branched_path_volume_connect_light(
for(int j = 0; j < num_samples; j++) {
/* sample random position on random triangle */
- float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_LIGHT);
float light_u, light_v;
- path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+ path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
/* only sample triangle lights */
if(kernel_data.integrator.num_all_lights)
- light_t = 0.5f*light_t;
+ light_u = 0.5f*light_u;
LightSample ls;
- light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
+ light_sample(kg, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
float3 tp = throughput;
/* sample position on volume segment */
- float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
- float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+ float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
+ float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
- (void)result;
- kernel_assert(result == VOLUME_PATH_SCATTERED);
-
/* todo: split up light_sample so we don't have to call it again with new position */
- if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+ if(result == VOLUME_PATH_SCATTERED &&
+ light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
if(kernel_data.integrator.num_all_lights)
ls.pdf *= 2.0f;
- float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+ float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples);
if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
/* trace shadow ray */
float3 shadow;
- if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+ if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
@@ -242,34 +229,31 @@ ccl_device void kernel_branched_path_volume_connect_light(
}
else {
/* sample random position on random light */
- float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
float light_u, light_v;
- path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+ path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
LightSample ls;
- light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
+ light_sample(kg, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
float3 tp = throughput;
/* sample position on volume segment */
- float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
- float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+ float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
+ float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
- (void)result;
- kernel_assert(result == VOLUME_PATH_SCATTERED);
-
/* todo: split up light_sample so we don't have to call it again with new position */
- if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+ if(result == VOLUME_PATH_SCATTERED &&
+ light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
/* sample random light */
- float terminate = path_state_rng_light_termination(kg, rng, state);
+ float terminate = path_state_rng_light_termination(kg, state);
if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
/* trace shadow ray */
float3 shadow;
- if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+ if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
}
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index e8a912ccc0b..11798d87cb5 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -18,55 +18,18 @@
CCL_NAMESPACE_BEGIN
-#ifdef __SOBOL__
-
-/* Skip initial numbers that are not as well distributed, especially the
- * first sequence is just 0 everywhere, which can be problematic for e.g.
- * path termination.
- */
-#define SOBOL_SKIP 64
+/* Pseudo random numbers, uncomment this for debugging correlations. Only run
+ * this single-threaded on a CPU for repeatable results. */
+//#define __DEBUG_CORRELATION__
-/* High Dimensional Sobol. */
-/* Van der Corput radical inverse. */
-ccl_device uint van_der_corput(uint bits)
-{
- bits = (bits << 16) | (bits >> 16);
- bits = ((bits & 0x00ff00ff) << 8) | ((bits & 0xff00ff00) >> 8);
- bits = ((bits & 0x0f0f0f0f) << 4) | ((bits & 0xf0f0f0f0) >> 4);
- bits = ((bits & 0x33333333) << 2) | ((bits & 0xcccccccc) >> 2);
- bits = ((bits & 0x55555555) << 1) | ((bits & 0xaaaaaaaa) >> 1);
- return bits;
-}
-
-/* Sobol radical inverse. */
-ccl_device uint sobol(uint i)
-{
- uint r = 0;
- for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1) {
- if(i & 1) {
- r ^= v;
- }
- }
- return r;
-}
+/* High Dimensional Sobol.
+ *
+ * Multidimensional Sobol with generator matrices. Dimensions 0 and 1 are equal
+ * to the classic Van der Corput and Sobol sequences. */
-/* Inverse of sobol radical inverse. */
-ccl_device uint sobol_inverse(uint i)
-{
- const uint msb = 1U << 31;
- uint r = 0;
- for(uint v = 1; i; i <<= 1, v ^= v << 1) {
- if(i & msb) {
- r ^= v;
- }
- }
- return r;
-}
+#ifdef __SOBOL__
-/* Multidimensional sobol with generator matrices
- * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively.
- */
ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
{
uint result = 0;
@@ -79,51 +42,32 @@ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
return result;
}
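
The generator-matrix evaluation elided by the hunk above amounts to XOR-ing one direction vector per set bit of the sample index. A minimal standalone sketch, assuming a flat 32-entries-per-dimension direction table (the kernel reads these from the __sobol_directions texture):

/* Illustrative sketch only: evaluate one Sobol dimension by XOR-ing a
 * direction vector for every set bit of the sample index. The table layout
 * (32 uints per dimension) is an assumption for this sketch. */
static unsigned int sobol_dimension_sketch(const unsigned int *directions,
                                           unsigned int index,
                                           int dimension)
{
	unsigned int result = 0;
	for(int bit = 0; index != 0; index >>= 1, bit++) {
		if(index & 1) {
			result ^= directions[dimension*32 + bit];
		}
	}
	return result;
}
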
-/* Lookup index and x/y coordinate, assumes m is a power of two. */
-ccl_device uint sobol_lookup(const uint m,
- const uint frame,
- const uint ex,
- const uint ey,
- uint *x, uint *y)
-{
- /* Shift is constant per frame. */
- const uint shift = frame << (m << 1);
- const uint sobol_shift = sobol(shift);
- /* Van der Corput is its own inverse. */
- const uint lower = van_der_corput(ex << (32 - m));
- /* Need to compensate for ey difference and shift. */
- const uint sobol_lower = sobol(lower);
- const uint mask = ~-(1 << m) << (32 - m); /* Only m upper bits. */
- const uint delta = ((ey << (32 - m)) ^ sobol_lower ^ sobol_shift) & mask;
- /* Only use m upper bits for the index (m is a power of two). */
- const uint sobol_result = delta | (delta >> m);
- const uint upper = sobol_inverse(sobol_result);
- const uint index = shift | upper | lower;
- *x = van_der_corput(index);
- *y = sobol_shift ^ sobol_result ^ sobol_lower;
- return index;
-}
+#endif /* __SOBOL__ */
+
ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
- RNG *rng,
+ uint rng_hash,
int sample, int num_samples,
int dimension)
{
+#ifdef __DEBUG_CORRELATION__
+ return (float)drand48();
+#endif
+
#ifdef __CMJ__
- if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
+# ifdef __SOBOL__
+ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
+# endif
+ {
/* Correlated multi-jitter. */
- int p = *rng + dimension;
+ int p = rng_hash + dimension;
return cmj_sample_1D(sample, num_samples, p);
}
#endif
-#ifdef __SOBOL_FULL_SCREEN__
- uint result = sobol_dimension(kg, *rng, dimension);
- float r = (float)result * (1.0f/(float)0xFFFFFFFF);
- return r;
-#else
- /* Compute sobol sequence value using direction vectors. */
- uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension);
+#ifdef __SOBOL__
+ /* Sobol sequence value using direction vectors. */
+ uint result = sobol_dimension(kg, sample, dimension);
float r = (float)result * (1.0f/(float)0xFFFFFFFF);
	/* Cranley-Patterson rotation using rng seed */
@@ -132,7 +76,7 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
/* Hash rng with dimension to solve correlation issues.
* See T38710, T50116.
*/
- RNG tmp_rng = cmj_hash_simple(dimension, *rng);
+ uint tmp_rng = cmj_hash_simple(dimension, rng_hash);
shift = tmp_rng * (1.0f/(float)0xFFFFFFFF);
return r + shift - floorf(r + shift);
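
Read in isolation, the Cranley-Patterson rotation above hashes the per-pixel seed with the dimension, turns the hash into an offset in [0, 1), and wraps the shifted sample back into [0, 1). A small self-contained sketch; the integer hash here is a stand-in for cmj_hash_simple(), not the kernel's exact function:

#include <math.h>

/* Stand-in hash; any reasonable integer hash illustrates the idea. */
static unsigned int hash_dim_seed(unsigned int dimension, unsigned int seed)
{
	unsigned int h = dimension ^ seed;
	h ^= h >> 16;
	h *= 0x45d9f3bu;
	h ^= h >> 16;
	return h;
}

/* Cranley-Patterson rotation: shift a low-discrepancy sample by a
 * per-pixel, per-dimension offset and wrap it back into [0, 1). */
static float cranley_patterson_rotate(float r, unsigned int rng_hash, int dimension)
{
	float shift = (float)hash_dim_seed((unsigned int)dimension, rng_hash) * (1.0f/(float)0xFFFFFFFF);
	return r + shift - floorf(r + shift);
}
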
@@ -140,128 +84,60 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
}
ccl_device_forceinline void path_rng_2D(KernelGlobals *kg,
- RNG *rng,
+ uint rng_hash,
int sample, int num_samples,
int dimension,
float *fx, float *fy)
{
+#ifdef __DEBUG_CORRELATION__
+ *fx = (float)drand48();
+ *fy = (float)drand48();
+ return;
+#endif
+
#ifdef __CMJ__
- if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
+# ifdef __SOBOL__
+ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
+# endif
+ {
/* Correlated multi-jitter. */
- int p = *rng + dimension;
+ int p = rng_hash + dimension;
cmj_sample_2D(sample, num_samples, p, fx, fy);
+ return;
}
- else
#endif
- {
- /* Sobol. */
- *fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
- *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
- }
+
+#ifdef __SOBOL__
+ /* Sobol. */
+ *fx = path_rng_1D(kg, rng_hash, sample, num_samples, dimension);
+ *fy = path_rng_1D(kg, rng_hash, sample, num_samples, dimension + 1);
+#endif
}
ccl_device_inline void path_rng_init(KernelGlobals *kg,
ccl_global uint *rng_state,
int sample, int num_samples,
- RNG *rng,
+ uint *rng_hash,
int x, int y,
float *fx, float *fy)
{
-#ifdef __SOBOL_FULL_SCREEN__
- uint px, py;
- uint bits = 16; /* limits us to 65536x65536 and 65536 samples */
- uint size = 1 << bits;
- uint frame = sample;
-
- *rng = sobol_lookup(bits, frame, x, y, &px, &py);
-
- *rng ^= kernel_data.integrator.seed;
-
- if(sample == 0) {
- *fx = 0.5f;
- *fy = 0.5f;
- }
- else {
- *fx = size * (float)px * (1.0f/(float)0xFFFFFFFF) - x;
- *fy = size * (float)py * (1.0f/(float)0xFFFFFFFF) - y;
- }
-#else
- *rng = *rng_state;
-
- *rng ^= kernel_data.integrator.seed;
-
- if(sample == 0) {
- *fx = 0.5f;
- *fy = 0.5f;
- }
- else {
- path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy);
- }
-#endif
-}
-
-ccl_device void path_rng_end(KernelGlobals *kg,
- ccl_global uint *rng_state,
- RNG rng)
-{
- /* nothing to do */
-}
-
-#else /* __SOBOL__ */
-
-/* Linear Congruential Generator */
-
-ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
- RNG *rng,
- int sample, int num_samples,
- int dimension)
-{
- /* implicit mod 2^32 */
- *rng = (1103515245*(*rng) + 12345);
- return (float)*rng * (1.0f/(float)0xFFFFFFFF);
-}
-
-ccl_device_inline void path_rng_2D(KernelGlobals *kg,
- RNG *rng,
- int sample, int num_samples,
- int dimension,
- float *fx, float *fy)
-{
- *fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
- *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
-}
-
-ccl_device void path_rng_init(KernelGlobals *kg,
- ccl_global uint *rng_state,
- int sample, int num_samples,
- RNG *rng,
- int x, int y,
- float *fx, float *fy)
-{
/* load state */
- *rng = *rng_state;
+ *rng_hash = *rng_state;
+ *rng_hash ^= kernel_data.integrator.seed;
- *rng ^= kernel_data.integrator.seed;
+#ifdef __DEBUG_CORRELATION__
+ srand48(*rng_hash + sample);
+#endif
if(sample == 0) {
*fx = 0.5f;
*fy = 0.5f;
}
else {
- path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy);
+ path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_FILTER_U, fx, fy);
}
}
-ccl_device void path_rng_end(KernelGlobals *kg,
- ccl_global uint *rng_state,
- RNG rng)
-{
- /* store state for next sample */
- *rng_state = rng;
-}
-
-#endif /* __SOBOL__ */
-
/* Linear Congruential Generator */
ccl_device uint lcg_step_uint(uint *rng)
@@ -295,44 +171,22 @@ ccl_device uint lcg_init(uint seed)
*/
ccl_device_inline float path_state_rng_1D(KernelGlobals *kg,
- RNG *rng,
const ccl_addr_space PathState *state,
int dimension)
{
return path_rng_1D(kg,
- rng,
+ state->rng_hash,
state->sample, state->num_samples,
state->rng_offset + dimension);
}
-ccl_device_inline float path_state_rng_1D_for_decision(
- KernelGlobals *kg,
- RNG *rng,
- const ccl_addr_space PathState *state,
- int dimension)
-{
- /* The rng_offset is not increased for transparent bounces. if we do then
- * fully transparent objects can become subtly visible by the different
- * sampling patterns used where the transparent object is.
- *
- * however for some random numbers that will determine if we next bounce
- * is transparent we do need to increase the offset to avoid always making
- * the same decision. */
- const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM;
- return path_rng_1D(kg,
- rng,
- state->sample, state->num_samples,
- rng_offset + dimension);
-}
-
ccl_device_inline void path_state_rng_2D(KernelGlobals *kg,
- RNG *rng,
const ccl_addr_space PathState *state,
int dimension,
float *fx, float *fy)
{
path_rng_2D(kg,
- rng,
+ state->rng_hash,
state->sample, state->num_samples,
state->rng_offset + dimension,
fx, fy);
@@ -340,38 +194,22 @@ ccl_device_inline void path_state_rng_2D(KernelGlobals *kg,
ccl_device_inline float path_branched_rng_1D(
KernelGlobals *kg,
- RNG *rng,
+ uint rng_hash,
const ccl_addr_space PathState *state,
int branch,
int num_branches,
int dimension)
{
return path_rng_1D(kg,
- rng,
+ rng_hash,
state->sample * num_branches + branch,
state->num_samples * num_branches,
state->rng_offset + dimension);
}
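
The branched variant above just remaps (sample, branch) to a single global sample index, so every branch draws a distinct point from the same low-discrepancy sequence. A trivial sketch of the index math:

/* Branch j of AA sample s becomes one global sample index, with the total
 * sample count scaled by the branch count to keep stratification intact. */
static int branched_sample_index(int sample, int branch, int num_branches)
{
	return sample*num_branches + branch;
}

/* Example: 4 AA samples x 3 light branches -> sample 2, branch 1 maps to
 * global sample 2*3 + 1 = 7 out of 4*3 = 12. */
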
-ccl_device_inline float path_branched_rng_1D_for_decision(
- KernelGlobals *kg,
- RNG *rng,
- const ccl_addr_space PathState *state,
- int branch,
- int num_branches,
- int dimension)
-{
- const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM;
- return path_rng_1D(kg,
- rng,
- state->sample * num_branches + branch,
- state->num_samples * num_branches,
- rng_offset + dimension);
-}
-
ccl_device_inline void path_branched_rng_2D(
KernelGlobals *kg,
- RNG *rng,
+ uint rng_hash,
const ccl_addr_space PathState *state,
int branch,
int num_branches,
@@ -379,7 +217,7 @@ ccl_device_inline void path_branched_rng_2D(
float *fx, float *fy)
{
path_rng_2D(kg,
- rng,
+ rng_hash,
state->sample * num_branches + branch,
state->num_samples * num_branches,
state->rng_offset + dimension,
@@ -391,52 +229,45 @@ ccl_device_inline void path_branched_rng_2D(
*/
ccl_device_inline float path_state_rng_light_termination(
KernelGlobals *kg,
- RNG *rng,
const ccl_addr_space PathState *state)
{
if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
- return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE);
+ return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE);
}
return 0.0f;
}
ccl_device_inline float path_branched_rng_light_termination(
KernelGlobals *kg,
- RNG *rng,
+ uint rng_hash,
const ccl_addr_space PathState *state,
int branch,
int num_branches)
{
if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
- return path_branched_rng_1D_for_decision(kg,
- rng,
- state,
- branch,
- num_branches,
- PRNG_LIGHT_TERMINATE);
+ return path_branched_rng_1D(kg,
+ rng_hash,
+ state,
+ branch,
+ num_branches,
+ PRNG_LIGHT_TERMINATE);
}
return 0.0f;
}
-ccl_device_inline void path_state_branch(ccl_addr_space PathState *state,
- int branch,
- int num_branches)
+ccl_device_inline uint lcg_state_init(PathState *state,
+ uint scramble)
{
- /* path is splitting into a branch, adjust so that each branch
- * still gets a unique sample from the same sequence */
- state->rng_offset += PRNG_BOUNCE_NUM;
- state->sample = state->sample*num_branches + branch;
- state->num_samples = state->num_samples*num_branches;
+ return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble);
}
-ccl_device_inline uint lcg_state_init(RNG *rng,
- int rng_offset,
- int sample,
- uint scramble)
+ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space PathState *state,
+ uint scramble)
{
- return lcg_init(*rng + rng_offset + sample*scramble);
+ return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble);
}
+
ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng)
{
/* Implicit mod 2^32 */
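
For reference, the LCG behind these helpers is the classic 32-bit linear congruential recurrence; a sketch using the same constants as the Sobol-less fallback removed earlier in this file (unsigned overflow supplies the implicit mod 2^32):

/* 32-bit LCG step and its float mapping to [0, 1). */
static unsigned int lcg_step_sketch(unsigned int *rng)
{
	*rng = 1103515245u*(*rng) + 12345u;
	return *rng;
}

static float lcg_step_float_sketch(unsigned int *rng)
{
	return (float)lcg_step_sketch(rng) * (1.0f/(float)0xFFFFFFFF);
}
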
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index c66f52255f0..eeb4eb0097f 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -66,8 +66,8 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
/* matrices and time */
#ifdef __OBJECT_MOTION__
shader_setup_object_transforms(kg, sd, ray->time);
- sd->time = ray->time;
#endif
+ sd->time = ray->time;
sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
sd->ray_length = isect->t;
@@ -83,7 +83,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
sd->shader = __float_as_int(curvedata.z);
- sd->P = bvh_curve_refine(kg, sd, isect, ray);
+ sd->P = curve_refine(kg, sd, isect, ray);
}
else
#endif
@@ -271,17 +271,17 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
sd->u = u;
sd->v = v;
#endif
+ sd->time = time;
sd->ray_length = t;
sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
sd->object_flag = 0;
if(sd->object != OBJECT_NONE) {
sd->object_flag |= kernel_tex_fetch(__object_flag,
- sd->object);
+ sd->object);
#ifdef __OBJECT_MOTION__
shader_setup_object_transforms(kg, sd, time);
- sd->time = time;
}
else if(lamp != LAMP_NONE) {
sd->ob_tfm = lamp_fetch_transform(kg, lamp, false);
@@ -385,9 +385,7 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat
sd->shader = kernel_data.background.surface_shader;
sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
sd->object_flag = 0;
-#ifdef __OBJECT_MOTION__
sd->time = ray->time;
-#endif
sd->ray_length = 0.0f;
#ifdef __INSTANCING__
@@ -427,9 +425,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
sd->shader = SHADER_NONE;
sd->flag = 0;
sd->object_flag = 0;
-#ifdef __OBJECT_MOTION__
sd->time = ray->time;
-#endif
sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
#ifdef __INSTANCING__
@@ -498,20 +494,45 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd)
}
#endif
+/* Defensive sampling. */
+
+ccl_device_inline void shader_prepare_closures(ShaderData *sd,
+ ccl_addr_space PathState *state)
+{
+ /* We can likely also do defensive sampling at deeper bounces, particularly
+ * for cases like a perfect mirror but possibly also others. This will need
+ * a good heuristic. */
+ if(state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) {
+ float sum = 0.0f;
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+ if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+ sum += sc->sample_weight;
+ }
+ }
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+ if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+ sc->sample_weight = max(sc->sample_weight, 0.125f * sum);
+ }
+ }
+ }
+}
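
As a rough numeric illustration of the clamp above (example values, not kernel code): with two camera-ray closures weighted 0.99 and 0.01, the floor of 0.125*sum lifts the dim closure to 0.125, so it is picked roughly 11% of the time instead of 1%, trading a little weight variance for far fewer fireflies. A minimal sketch of the same clamp on a plain array:

/* Clamp each weight to at least 1/8 of the total so low-weight entries
 * still receive a usable share of samples. */
static void defensive_clamp(float *weight, int n)
{
	float sum = 0.0f;
	for(int i = 0; i < n; i++)
		sum += weight[i];
	for(int i = 0; i < n; i++)
		if(weight[i] < 0.125f*sum)
			weight[i] = 0.125f*sum;
}
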
+
+
/* BSDF */
ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd, const float3 omega_in, float *pdf,
- int skip_bsdf, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
+ const ShaderClosure *skip_sc, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
{
/* this is the veach one-sample model with balance heuristic, some pdf
* factors drop out when using balance heuristic weighting */
for(int i = 0; i < sd->num_closure; i++) {
- if(i == skip_bsdf)
- continue;
-
const ShaderClosure *sc = &sd->closure[i];
- if(CLOSURE_IS_BSDF(sc->type)) {
+ if(sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) {
float bsdf_pdf = 0.0f;
float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
@@ -574,7 +595,7 @@ void shader_bsdf_eval(KernelGlobals *kg,
#endif
{
float pdf;
- _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, -1, eval, 0.0f, 0.0f);
+ _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f);
if(use_mis) {
float weight = power_heuristic(light_pdf, pdf);
bsdf_eval_mis(eval, weight);
@@ -582,48 +603,120 @@ void shader_bsdf_eval(KernelGlobals *kg,
}
}
-ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
- ShaderData *sd,
- float randu, float randv,
- BsdfEval *bsdf_eval,
- float3 *omega_in,
- differential3 *domega_in,
- float *pdf)
+ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd,
+ float *randu)
{
int sampled = 0;
if(sd->num_closure > 1) {
- /* pick a BSDF closure based on sample weights */
+		/* Pick a BSDF based on sample weights. */
float sum = 0.0f;
- for(sampled = 0; sampled < sd->num_closure; sampled++) {
- const ShaderClosure *sc = &sd->closure[sampled];
-
- if(CLOSURE_IS_BSDF(sc->type))
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ if(CLOSURE_IS_BSDF(sc->type)) {
sum += sc->sample_weight;
+ }
}
- float r = sd->randb_closure*sum;
- sum = 0.0f;
+ float r = (*randu)*sum;
+ float partial_sum = 0.0f;
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
- for(sampled = 0; sampled < sd->num_closure; sampled++) {
- const ShaderClosure *sc = &sd->closure[sampled];
-
if(CLOSURE_IS_BSDF(sc->type)) {
- sum += sc->sample_weight;
+ float next_sum = partial_sum + sc->sample_weight;
+
+ if(r < next_sum) {
+ sampled = i;
- if(r <= sum)
+ /* Rescale to reuse for direction sample, to better
+				 * preserve stratification. */
+ *randu = (r - partial_sum) / sc->sample_weight;
break;
+ }
+
+ partial_sum = next_sum;
}
}
+ }
- if(sampled == sd->num_closure) {
- *pdf = 0.0f;
- return LABEL_NONE;
+ return &sd->closure[sampled];
+}
+
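
The randu rescale in shader_bsdf_pick() is the usual trick for reusing one random number: after the discrete pick, the offset of r inside the chosen bin divided by the bin width is again uniform in [0, 1) and can drive the direction sample. A self-contained sketch of the same pick-and-rescale (hypothetical helper, not kernel API):

/* Pick index i with probability weight[i]/sum and rescale *r for reuse as a
 * fresh uniform number in the next sampling decision. */
static int pick_and_rescale(const float *weight, int n, float *r)
{
	float sum = 0.0f;
	for(int i = 0; i < n; i++)
		sum += weight[i];

	float target = (*r)*sum;
	float partial = 0.0f;
	for(int i = 0; i < n; i++) {
		float next = partial + weight[i];
		if(target < next) {
			*r = (target - partial)/weight[i];
			return i;
		}
		partial = next;
	}
	return n - 1;  /* guard against float round-off */
}
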
+ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd,
+ ccl_addr_space float3 *throughput,
+ float *randu)
+{
+ int sampled = 0;
+
+ if(sd->num_closure > 1) {
+		/* Pick a BSDF or BSSRDF based on sample weights. */
+ float sum_bsdf = 0.0f;
+ float sum_bssrdf = 0.0f;
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ if(CLOSURE_IS_BSDF(sc->type)) {
+ sum_bsdf += sc->sample_weight;
+ }
+ else if(CLOSURE_IS_BSSRDF(sc->type)) {
+ sum_bssrdf += sc->sample_weight;
+ }
+ }
+
+ float r = (*randu)*(sum_bsdf + sum_bssrdf);
+ float partial_sum = 0.0f;
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+ float next_sum = partial_sum + sc->sample_weight;
+
+ if(r < next_sum) {
+ if(CLOSURE_IS_BSDF(sc->type)) {
+ *throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf;
+ return NULL;
+ }
+ else {
+ *throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf;
+ sampled = i;
+
+ /* Rescale to reuse for direction sample, to better
+					 * preserve stratification. */
+ *randu = (r - partial_sum) / sc->sample_weight;
+ break;
+ }
+ }
+
+ partial_sum = next_sum;
+ }
}
}
- const ShaderClosure *sc = &sd->closure[sampled];
+ return &sd->closure[sampled];
+}
+
+ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
+ ShaderData *sd,
+ float randu, float randv,
+ BsdfEval *bsdf_eval,
+ float3 *omega_in,
+ differential3 *domega_in,
+ float *pdf)
+{
+ const ShaderClosure *sc = shader_bsdf_pick(sd, &randu);
+ if(sc == NULL) {
+ *pdf = 0.0f;
+ return LABEL_NONE;
+ }
+
+ /* BSSRDF should already have been handled elsewhere. */
+ kernel_assert(CLOSURE_IS_BSDF(sc->type));
int label;
float3 eval;
@@ -636,7 +729,7 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
if(sd->num_closure > 1) {
float sweight = sc->sample_weight;
- _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight);
+ _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf*sweight, sweight);
}
}
@@ -669,7 +762,7 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn
}
}
-ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd)
{
if(sd->flag & SD_HAS_ONLY_VOLUME)
return make_float3(1.0f, 1.0f, 1.0f);
@@ -677,7 +770,7 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
for(int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl
eval += sc->weight;
@@ -764,6 +857,19 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
return eval;
}
+ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd)
+{
+ float3 N = make_float3(0.0f, 0.0f, 0.0f);
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+ if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+ N += sc->N*average(sc->weight);
+ }
+
+ return (is_zero(N))? sd->N : normalize(N);
+}
+
ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -783,12 +889,7 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
}
}
- if(is_zero(N))
- N = sd->N;
- else
- N = normalize(N);
-
- *N_ = N;
+ *N_ = (is_zero(N))? sd->N : normalize(N);
return eval;
}
@@ -863,16 +964,15 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
/* Surface Evaluation */
-ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, RNG *rng,
- ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx)
+ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
+ ccl_addr_space PathState *state, int path_flag)
{
sd->num_closure = 0;
sd->num_closure_extra = 0;
- sd->randb_closure = randb;
#ifdef __OSL__
if(kg->osl)
- OSLShader::eval_surface(kg, sd, state, path_flag, ctx);
+ OSLShader::eval_surface(kg, sd, state, path_flag);
else
#endif
{
@@ -887,24 +987,23 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, RNG *rng,
#endif
}
- if(rng && (sd->flag & SD_BSDF_NEEDS_LCG)) {
- sd->lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0xb4bc3953);
+ if(sd->flag & SD_BSDF_NEEDS_LCG) {
+ sd->lcg_state = lcg_state_init_addrspace(state, 0xb4bc3953);
}
}
/* Background Evaluation */
ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
- ccl_addr_space PathState *state, int path_flag, ShaderContext ctx)
+ ccl_addr_space PathState *state, int path_flag)
{
sd->num_closure = 0;
sd->num_closure_extra = 0;
- sd->randb_closure = 0.0f;
#ifdef __SVM__
#ifdef __OSL__
if(kg->osl) {
- OSLShader::eval_background(kg, sd, state, path_flag, ctx);
+ OSLShader::eval_background(kg, sd, state, path_flag);
}
else
#endif
@@ -981,17 +1080,22 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, const ShaderData *s
sum += sc->sample_weight;
}
- float r = sd->randb_closure*sum;
- sum = 0.0f;
+ float r = randu*sum;
+ float partial_sum = 0.0f;
for(sampled = 0; sampled < sd->num_closure; sampled++) {
const ShaderClosure *sc = &sd->closure[sampled];
if(CLOSURE_IS_PHASE(sc->type)) {
- sum += sc->sample_weight;
+ float next_sum = partial_sum + sc->sample_weight;
- if(r <= sum)
+ if(r <= next_sum) {
+				/* Rescale to reuse for phase direction sample. */
+ randu = (r - partial_sum) / sc->sample_weight;
break;
+ }
+
+ partial_sum = next_sum;
}
}
@@ -1039,8 +1143,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
ShaderData *sd,
ccl_addr_space PathState *state,
ccl_addr_space VolumeStack *stack,
- int path_flag,
- ShaderContext ctx)
+ int path_flag)
{
/* reset closures once at the start, we will be accumulating the closures
* for all volumes in the stack into a single array of closures */
@@ -1073,7 +1176,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
#ifdef __SVM__
# ifdef __OSL__
if(kg->osl) {
- OSLShader::eval_volume(kg, sd, state, path_flag, ctx);
+ OSLShader::eval_volume(kg, sd, state, path_flag);
}
else
# endif
@@ -1092,17 +1195,16 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
/* Displacement Evaluation */
-ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx)
+ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state)
{
sd->num_closure = 0;
sd->num_closure_extra = 0;
- sd->randb_closure = 0.0f;
/* this will modify sd->P */
#ifdef __SVM__
# ifdef __OSL__
if(kg->osl)
- OSLShader::eval_displacement(kg, sd, ctx);
+ OSLShader::eval_displacement(kg, sd);
else
# endif
{
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index fab5946970d..8a0da6c3b13 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -16,6 +16,42 @@
CCL_NAMESPACE_BEGIN
+#ifdef __VOLUME__
+typedef struct VolumeState {
+# ifdef __SPLIT_KERNEL__
+# else
+ PathState ps;
+# endif
+} VolumeState;
+
+/* Get PathState ready for use for volume stack evaluation. */
+# ifdef __SPLIT_KERNEL__
+ccl_addr_space
+# endif
+ccl_device_inline PathState *shadow_blocked_volume_path_state(
+ KernelGlobals *kg,
+ VolumeState *volume_state,
+ ccl_addr_space PathState *state,
+ ShaderData *sd,
+ Ray *ray)
+{
+# ifdef __SPLIT_KERNEL__
+ ccl_addr_space PathState *ps =
+ &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
+# else
+ PathState *ps = &volume_state->ps;
+# endif
+ *ps = *state;
+	/* We are checking for shadow on the "other" side of the surface, so we need
+	 * to discard the volume we are currently in.
+ */
+ if(dot(sd->Ng, ray->D) < 0.0f) {
+ kernel_volume_stack_enter_exit(kg, sd, ps->volume_stack);
+ }
+ return ps;
+}
+#endif /* __VOLUME__ */
+
/* Attenuate throughput accordingly to the given intersection event.
* Returns true if the throughput is zero and traversal can be aborted.
*/
@@ -49,11 +85,8 @@ ccl_device_forceinline bool shadow_handle_transparent_isect(
path_state_modify_bounce(state, true);
shader_eval_surface(kg,
shadow_sd,
- NULL,
state,
- 0.0f,
- PATH_RAY_SHADOW,
- SHADER_CONTEXT_SHADOW);
+ PATH_RAY_SHADOW);
path_state_modify_bounce(state, false);
*throughput *= shader_bsdf_transparency(kg, shadow_sd);
}
@@ -72,13 +105,14 @@ ccl_device_forceinline bool shadow_handle_transparent_isect(
ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
ShaderData *shadow_sd,
ccl_addr_space PathState *state,
+ const uint visibility,
Ray *ray,
Intersection *isect,
float3 *shadow)
{
const bool blocked = scene_intersect(kg,
*ray,
- PATH_RAY_SHADOW_OPAQUE,
+ visibility & PATH_RAY_SHADOW_OPAQUE,
isect,
NULL,
0.0f, 0.0f);
@@ -126,9 +160,10 @@ ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
* Note that hits array should be as big as max_hits+1.
*/
ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
+ ShaderData *sd,
ShaderData *shadow_sd,
ccl_addr_space PathState *state,
- const int skip_object,
+ const uint visibility,
Ray *ray,
Intersection *hits,
uint max_hits,
@@ -141,9 +176,12 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
const bool blocked = scene_intersect_shadow_all(kg,
ray,
hits,
- skip_object,
+ visibility,
max_hits,
&num_hits);
+# ifdef __VOLUME__
+ VolumeState volume_state;
+# endif
/* If no opaque surface found but we did find transparent hits,
* shade them.
*/
@@ -155,12 +193,13 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
Intersection *isect = hits;
# ifdef __VOLUME__
# ifdef __SPLIT_KERNEL__
- ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
-# else
- PathState ps_object;
- PathState *ps = &ps_object;
+ ccl_addr_space
# endif
- *ps = *state;
+ PathState *ps = shadow_blocked_volume_path_state(kg,
+ &volume_state,
+ state,
+ sd,
+ ray);
# endif
sort_intersections(hits, num_hits);
for(int hit = 0; hit < num_hits; hit++, isect++) {
@@ -205,8 +244,16 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
}
# ifdef __VOLUME__
if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
- /* Apply attenuation from current volume shader/ */
- kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
+ /* Apply attenuation from current volume shader. */
+# ifdef __SPLIT_KERNEL__
+ ccl_addr_space
+# endif
+ PathState *ps = shadow_blocked_volume_path_state(kg,
+ &volume_state,
+ state,
+ sd,
+ ray);
+ kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
}
# endif
return blocked;
@@ -216,9 +263,10 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
* loop to help readability of the actual logic.
*/
ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
+ ShaderData *sd,
ShaderData *shadow_sd,
ccl_addr_space PathState *state,
- const int skip_object,
+ const uint visibility,
Ray *ray,
uint max_hits,
float3 *shadow)
@@ -251,9 +299,10 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
# endif /* __KERNEL_GPU__ */
/* Invoke actual traversal. */
return shadow_blocked_transparent_all_loop(kg,
+ sd,
shadow_sd,
state,
- skip_object,
+ visibility,
ray,
hits,
max_hits,
@@ -276,27 +325,32 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
*/
ccl_device bool shadow_blocked_transparent_stepped_loop(
KernelGlobals *kg,
+ ShaderData *sd,
ShaderData *shadow_sd,
ccl_addr_space PathState *state,
- const int skip_object,
+ const uint visibility,
Ray *ray,
Intersection *isect,
const bool blocked,
const bool is_transparent_isect,
float3 *shadow)
{
- if((blocked && is_transparent_isect) || skip_object != OBJECT_NONE) {
+# ifdef __VOLUME__
+ VolumeState volume_state;
+# endif
+ if(blocked && is_transparent_isect) {
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float3 Pend = ray->P + ray->D*ray->t;
int bounce = state->transparent_bounce;
# ifdef __VOLUME__
# ifdef __SPLIT_KERNEL__
- ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
-# else
- PathState ps_object;
- PathState *ps = &ps_object;
+ ccl_addr_space
# endif
- *ps = *state;
+ PathState *ps = shadow_blocked_volume_path_state(kg,
+ &volume_state,
+ state,
+ sd,
+ ray);
# endif
for(;;) {
if(bounce >= kernel_data.integrator.transparent_max_bounce) {
@@ -304,30 +358,13 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(
}
if(!scene_intersect(kg,
*ray,
- PATH_RAY_SHADOW_TRANSPARENT,
+ visibility & PATH_RAY_SHADOW_TRANSPARENT,
isect,
NULL,
0.0f, 0.0f))
{
break;
}
-#ifdef __SHADOW_TRICKS__
- if(skip_object != OBJECT_NONE) {
- const int isect_object = (isect->object == PRIM_NONE)
- ? kernel_tex_fetch(__prim_object, isect->prim)
- : isect->object;
- if(isect_object == skip_object) {
- shader_setup_from_ray(kg, shadow_sd, isect, ray);
- /* Move ray forward. */
- ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
- if(ray->t != FLT_MAX) {
- ray->D = normalize_len(Pend - ray->P, &ray->t);
- }
- bounce++;
- continue;
- }
- }
-#endif
if(!shader_transparent_shadow(kg, isect)) {
return true;
}
@@ -363,7 +400,15 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(
# ifdef __VOLUME__
if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
/* Apply attenuation from current volume shader. */
- kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
+# ifdef __SPLIT_KERNEL__
+ ccl_addr_space
+# endif
+ PathState *ps = shadow_blocked_volume_path_state(kg,
+ &volume_state,
+ state,
+ sd,
+ ray);
+ kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
}
# endif
return blocked;
@@ -371,33 +416,28 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(
ccl_device bool shadow_blocked_transparent_stepped(
KernelGlobals *kg,
+ ShaderData *sd,
ShaderData *shadow_sd,
ccl_addr_space PathState *state,
- const int skip_object,
+ const uint visibility,
Ray *ray,
Intersection *isect,
float3 *shadow)
{
- bool blocked, is_transparent_isect;
- if(skip_object == OBJECT_NONE) {
- blocked = scene_intersect(kg,
- *ray,
- PATH_RAY_SHADOW_OPAQUE,
- isect,
- NULL,
- 0.0f, 0.0f);
- is_transparent_isect = blocked
- ? shader_transparent_shadow(kg, isect)
- : false;
- }
- else {
- blocked = false;
- is_transparent_isect = false;
- }
+ bool blocked = scene_intersect(kg,
+ *ray,
+ visibility & PATH_RAY_SHADOW_OPAQUE,
+ isect,
+ NULL,
+ 0.0f, 0.0f);
+ bool is_transparent_isect = blocked
+ ? shader_transparent_shadow(kg, isect)
+ : false;
return shadow_blocked_transparent_stepped_loop(kg,
+ sd,
shadow_sd,
state,
- skip_object,
+ visibility,
ray,
isect,
blocked,
@@ -409,6 +449,7 @@ ccl_device bool shadow_blocked_transparent_stepped(
#endif /* __TRANSPARENT_SHADOWS__ */
ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
+ ShaderData *sd,
ShaderData *shadow_sd,
ccl_addr_space PathState *state,
Ray *ray_input,
@@ -422,25 +463,24 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
return false;
}
#ifdef __SHADOW_TRICKS__
- const int skip_object = state->catcher_object;
+ const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER)
+ ? PATH_RAY_SHADOW_NON_CATCHER
+ : PATH_RAY_SHADOW;
#else
- const int skip_object = OBJECT_NONE;
+ const uint visibility = PATH_RAY_SHADOW;
#endif
/* Do actual shadow shading. */
	/* First of all, we check if the integrator requires transparent shadows.
	 * If not, we use the simplest and fastest way to calculate occlusion.
- *
- * NOTE: We can't do quick opaque test here if we are on shadow-catcher
- * path because we don't want catcher object to be casting shadow here.
*/
#ifdef __TRANSPARENT_SHADOWS__
- if(!kernel_data.integrator.transparent_shadows &&
- skip_object == OBJECT_NONE)
+ if(!kernel_data.integrator.transparent_shadows)
#endif
{
return shadow_blocked_opaque(kg,
shadow_sd,
state,
+ visibility,
ray,
&isect,
shadow);
@@ -467,7 +507,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
*/
const bool blocked = scene_intersect(kg,
*ray,
- PATH_RAY_SHADOW_OPAQUE,
+ visibility & PATH_RAY_SHADOW_OPAQUE,
&isect,
NULL,
0.0f, 0.0f);
@@ -478,9 +518,10 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
max_hits + 1 >= SHADOW_STACK_MAX_HITS)
{
return shadow_blocked_transparent_stepped_loop(kg,
+ sd,
shadow_sd,
state,
- skip_object,
+ visibility,
ray,
&isect,
blocked,
@@ -489,18 +530,20 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
}
# endif /* __KERNEL_GPU__ */
return shadow_blocked_transparent_all(kg,
+ sd,
shadow_sd,
state,
- skip_object,
+ visibility,
ray,
max_hits,
shadow);
# else /* __SHADOW_RECORD_ALL__ */
/* Fallback to a slowest version which works on all devices. */
return shadow_blocked_transparent_stepped(kg,
+ sd,
shadow_sd,
state,
- skip_object,
+ visibility,
ray,
&isect,
shadow);
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index 6475d4b66fd..23a09e5e2ca 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -28,87 +28,31 @@ CCL_NAMESPACE_BEGIN
* - try to reduce one sample model variance
*/
-#define BSSRDF_MULTI_EVAL
-
-ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, ShaderData *sd, float *probability)
-{
- /* sum sample weights of bssrdf and bsdf */
- float bsdf_sum = 0.0f;
- float bssrdf_sum = 0.0f;
-
- for(int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if(CLOSURE_IS_BSDF(sc->type))
- bsdf_sum += sc->sample_weight;
- else if(CLOSURE_IS_BSSRDF(sc->type))
- bssrdf_sum += sc->sample_weight;
- }
-
- /* use bsdf or bssrdf? */
- float r = sd->randb_closure*(bsdf_sum + bssrdf_sum);
-
- if(r < bsdf_sum) {
- /* use bsdf, and adjust randb so we can reuse it for picking a bsdf */
- sd->randb_closure = r/bsdf_sum;
- *probability = (bsdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bsdf_sum: 1.0f;
- return NULL;
- }
-
- /* use bssrdf */
- r -= bsdf_sum;
-
- float sum = 0.0f;
-
- for(int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if(CLOSURE_IS_BSSRDF(sc->type)) {
- sum += sc->sample_weight;
-
- if(r <= sum) {
- sd->randb_closure = (r - (sum - sc->sample_weight))/sc->sample_weight;
-
-#ifdef BSSRDF_MULTI_EVAL
- *probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bssrdf_sum: 1.0f;
-#else
- *probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/sc->sample_weight: 1.0f;
-#endif
- return sc;
- }
- }
- }
-
- /* should never happen */
- sd->randb_closure = 0.0f;
- *probability = 1.0f;
- return NULL;
-}
-
ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
- ShaderClosure *sc,
+ const ShaderClosure *sc,
float disk_r,
float r,
bool all)
{
-#ifdef BSSRDF_MULTI_EVAL
/* this is the veach one-sample model with balance heuristic, some pdf
* factors drop out when using balance heuristic weighting */
float3 eval_sum = make_float3(0.0f, 0.0f, 0.0f);
float pdf_sum = 0.0f;
- float sample_weight_sum = 0.0f;
- int num_bssrdf = 0;
+ float sample_weight_inv = 0.0f;
- for(int i = 0; i < sd->num_closure; i++) {
- sc = &sd->closure[i];
-
- if(CLOSURE_IS_BSSRDF(sc->type)) {
- float sample_weight = (all)? 1.0f: sc->sample_weight;
- sample_weight_sum += sample_weight;
+ if(!all) {
+ float sample_weight_sum = 0.0f;
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ sc = &sd->closure[i];
+
+ if(CLOSURE_IS_BSSRDF(sc->type)) {
+ sample_weight_sum += sc->sample_weight;
+ }
}
- }
- float sample_weight_inv = 1.0f/sample_weight_sum;
+ sample_weight_inv = 1.0f/sample_weight_sum;
+ }
for(int i = 0; i < sd->num_closure; i++) {
sc = &sd->closure[i];
@@ -125,25 +69,16 @@ ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
		/* TODO power heuristic is not working correctly here */
eval_sum += sc->weight*pdf; //*sample_weight*disk_pdf;
pdf_sum += sample_weight*disk_pdf; //*sample_weight*disk_pdf;
-
- num_bssrdf++;
}
}
return (pdf_sum > 0.0f)? eval_sum / pdf_sum : make_float3(0.0f, 0.0f, 0.0f);
-#else
- float pdf = bssrdf_pdf(pick_sc, r);
- float disk_pdf = bssrdf_pdf(pick_sc, disk_r);
-
- return pick_sc->weight * pdf / disk_pdf;
-#endif
}
/* replace closures with a single diffuse bsdf closure after scatter step */
-ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, ShaderClosure *sc, float3 weight, bool hit, float3 N)
+ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, const ShaderClosure *sc, float3 weight, bool hit, float3 N)
{
sd->flag &= ~SD_CLOSURE_FLAGS;
- sd->randb_closure = 0.0f;
sd->num_closure = 0;
sd->num_closure_extra = 0;
@@ -219,7 +154,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
if(bump || texture_blur > 0.0f) {
/* average color and normal at incoming point */
- shader_eval_surface(kg, sd, NULL, state, 0.0f, state_flag, SHADER_CONTEXT_SSS);
+ shader_eval_surface(kg, sd, state, state_flag);
float3 in_color = shader_bssrdf_sum(sd, (bump)? N: NULL, NULL);
/* we simply divide out the average color and multiply with the average
@@ -242,8 +177,8 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
KernelGlobals *kg,
SubsurfaceIntersection *ss_isect,
ShaderData *sd,
- ShaderClosure *sc,
- RNG *lcg_state,
+ const ShaderClosure *sc,
+ uint *lcg_state,
float disk_u,
float disk_v,
bool all)
@@ -255,26 +190,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
disk_N = sd->Ng;
make_orthonormals(disk_N, &disk_T, &disk_B);
- /* reusing variable for picking the closure gives a bit nicer stratification
- * for path tracer, for branched we do all closures so it doesn't help */
- float axisu = (all)? disk_u: sd->randb_closure;
-
- if(axisu < 0.5f) {
+ if(disk_u < 0.5f) {
pick_pdf_N = 0.5f;
pick_pdf_T = 0.25f;
pick_pdf_B = 0.25f;
- if(all)
- disk_u *= 2.0f;
+ disk_u *= 2.0f;
}
- else if(axisu < 0.75f) {
+ else if(disk_u < 0.75f) {
float3 tmp = disk_N;
disk_N = disk_T;
disk_T = tmp;
pick_pdf_N = 0.25f;
pick_pdf_T = 0.5f;
pick_pdf_B = 0.25f;
- if(all)
- disk_u = (disk_u - 0.5f)*4.0f;
+ disk_u = (disk_u - 0.5f)*4.0f;
}
else {
float3 tmp = disk_N;
@@ -283,8 +212,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
pick_pdf_N = 0.25f;
pick_pdf_T = 0.25f;
pick_pdf_B = 0.5f;
- if(all)
- disk_u = (disk_u - 0.75f)*4.0f;
+ disk_u = (disk_u - 0.75f)*4.0f;
}
/* sample point on disk */
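
The disk_u handling above picks one of three disk orientations (N, T, B) with probabilities 0.5/0.25/0.25 and then rescales disk_u back to [0, 1) so the same number can place the point on the disk. A compact sketch of just that pick-and-rescale step, mirroring the branches above:

/* Choose a projection axis with probabilities 0.5 / 0.25 / 0.25 and rescale
 * disk_u for reuse as the disk coordinate. Returns 0 for N, 1 for T, 2 for B. */
static int pick_disk_axis(float *disk_u, float *pick_pdf)
{
	if(*disk_u < 0.5f) {
		*pick_pdf = 0.5f;
		*disk_u *= 2.0f;
		return 0;
	}
	else if(*disk_u < 0.75f) {
		*pick_pdf = 0.25f;
		*disk_u = (*disk_u - 0.5f)*4.0f;
		return 1;
	}
	else {
		*pick_pdf = 0.25f;
		*disk_u = (*disk_u - 0.75f)*4.0f;
		return 2;
	}
}
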
@@ -390,7 +318,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
ShaderData *sd,
ccl_addr_space PathState *state,
int state_flag,
- ShaderClosure *sc,
+ const ShaderClosure *sc,
bool all)
{
#ifdef __SPLIT_KERNEL__
@@ -419,7 +347,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
/* subsurface scattering step, from a point on the surface to another nearby point on the same object */
ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state,
- int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
+ int state_flag, const ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -430,18 +358,20 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_a
disk_N = sd->Ng;
make_orthonormals(disk_N, &disk_T, &disk_B);
- if(sd->randb_closure < 0.5f) {
+ if(disk_u < 0.5f) {
pick_pdf_N = 0.5f;
pick_pdf_T = 0.25f;
pick_pdf_B = 0.25f;
+ disk_u *= 2.0f;
}
- else if(sd->randb_closure < 0.75f) {
+ else if(disk_u < 0.75f) {
float3 tmp = disk_N;
disk_N = disk_T;
disk_T = tmp;
pick_pdf_N = 0.25f;
pick_pdf_T = 0.5f;
pick_pdf_B = 0.25f;
+ disk_u = (disk_u - 0.5f)*4.0f;
}
else {
float3 tmp = disk_N;
@@ -450,6 +380,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_a
pick_pdf_N = 0.25f;
pick_pdf_T = 0.25f;
pick_pdf_B = 0.5f;
+ disk_u = (disk_u - 0.75f)*4.0f;
}
/* sample point on disk */
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index cb1a3f40dee..5eab28a2953 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -82,115 +82,110 @@ KERNEL_TEX(uint, texture_uint, __sobol_directions)
# if __CUDA_ARCH__ < 300
/* full-float image */
KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_002)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_003)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_004)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_008)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_016)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_024)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_032)
KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_000)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_001)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_002)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_003)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_004)
-
-/* image */
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_005)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_006)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_007)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_008)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_008)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_016)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_024)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_032)
+
+/* image
+ * These texture names encode the flattened slot indices returned by
+ * ImageManager::type_index_to_flattened_slot(). */
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_001)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_009)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_010)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_011)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_012)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_013)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_014)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_015)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_016)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_017)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_018)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_019)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_020)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_021)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_022)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_023)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_024)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_025)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_026)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_027)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_028)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_029)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_030)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_031)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_032)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_033)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_034)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_035)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_036)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_037)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_038)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_039)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_040)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_041)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_042)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_043)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_044)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_045)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_046)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_047)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_048)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_049)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_050)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_051)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_052)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_053)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_054)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_055)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_056)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_057)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_058)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_059)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_060)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_061)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_062)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_063)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_064)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_065)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_066)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_067)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_068)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_069)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_070)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_071)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_072)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_073)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_074)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_075)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_076)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_077)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_078)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_079)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_080)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_081)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_082)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_083)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_084)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_153)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_161)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_169)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_177)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_185)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_193)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_201)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_209)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_217)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_225)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_233)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_241)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_249)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_257)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_265)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_273)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_281)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_289)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_297)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_305)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_313)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_321)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_329)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_337)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_345)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_353)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_361)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_369)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_377)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_385)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_393)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_401)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_409)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_417)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_425)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_433)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_441)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_449)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_457)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_465)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_473)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_481)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_489)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_497)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_505)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_513)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_521)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_529)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_537)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_545)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_553)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_561)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_569)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_577)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_585)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_593)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_601)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_609)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_617)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_625)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_633)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_641)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_649)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_657)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665)
# else
/* bindless textures */
KERNEL_TEX(uint, texture_uint, __bindless_mapping)
-# endif
-#endif
-
-/* packed image (opencl) */
-KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed)
-KERNEL_TEX(float4, texture_float4, __tex_image_float4_packed)
-KERNEL_TEX(uchar, texture_uchar, __tex_image_byte_packed)
-KERNEL_TEX(float, texture_float, __tex_image_float_packed)
-KERNEL_TEX(uint4, texture_uint4, __tex_image_packed_info)
+# endif /* __CUDA_ARCH__ */
+#endif /* __KERNEL_CUDA__ */
#undef KERNEL_TEX
#undef KERNEL_IMAGE_TEX
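
The sparse numbering in the CUDA texture declarations above (byte4 slots 001, 009, 017, ... and float4 3D slots 008, 016, 024, 032) is consistent with a flattened slot scheme that interleaves the image data types. A minimal sketch of such a mapping, assuming eight image data types with FLOAT4 = 0 and BYTE4 = 1 (the enum values and helper names below are illustrative; the authoritative version is ImageManager::type_index_to_flattened_slot(), which is not part of this diff):

    /* Sketch only: reproduces the slot numbering implied by the texture
     * names above, not the actual ImageManager implementation. */
    enum { IMG_FLOAT4 = 0, IMG_BYTE4 = 1, IMG_NUM_TYPES = 8 };

    static inline int flattened_slot(int type_index, int type)
    {
        /* byte4: 1, 9, 17, ...  float4: 0, 8, 16, ... */
        return type_index * IMG_NUM_TYPES + type;
    }

    static inline int type_index_from_slot(int slot, int *type)
    {
        *type = slot % IMG_NUM_TYPES;
        return slot / IMG_NUM_TYPES;
    }
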
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 34affab1b9d..6c5b6ca3b2d 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -130,6 +130,7 @@ CCL_NAMESPACE_BEGIN
# ifdef __KERNEL_OPENCL_APPLE__
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
+# define __PRINCIPLED__
# define __CMJ__
/* TODO(sergey): Currently experimental section is ignored here,
* this is because megakernel in device_opencl does not support
@@ -154,6 +155,7 @@ CCL_NAMESPACE_BEGIN
# define __CL_USE_NATIVE__
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
+# define __PRINCIPLED__
# define __CMJ__
# endif /* __KERNEL_OPENCL_INTEL_CPU__ */
@@ -240,10 +242,6 @@ CCL_NAMESPACE_BEGIN
# undef __DENOISING_FEATURES__
#endif
-/* Random Numbers */
-
-typedef uint RNG;
-
/* Shader Evaluation */
typedef enum ShaderEvalType {
@@ -283,31 +281,21 @@ enum PathTraceDimension {
PRNG_FILTER_V = 1,
PRNG_LENS_U = 2,
PRNG_LENS_V = 3,
-#ifdef __CAMERA_MOTION__
PRNG_TIME = 4,
PRNG_UNUSED_0 = 5,
PRNG_UNUSED_1 = 6, /* for some reason (6, 7) is a bad sobol pattern */
PRNG_UNUSED_2 = 7, /* with a low number of samples (< 64) */
-#endif
- PRNG_BASE_NUM = 8,
+ PRNG_BASE_NUM = 10,
PRNG_BSDF_U = 0,
PRNG_BSDF_V = 1,
- PRNG_BSDF = 2,
- PRNG_LIGHT = 3,
- PRNG_LIGHT_U = 4,
- PRNG_LIGHT_V = 5,
- PRNG_LIGHT_TERMINATE = 6,
- PRNG_TERMINATE = 7,
-
-#ifdef __VOLUME__
- PRNG_PHASE_U = 8,
- PRNG_PHASE_V = 9,
- PRNG_PHASE = 10,
- PRNG_SCATTER_DISTANCE = 11,
-#endif
-
- PRNG_BOUNCE_NUM = 12,
+ PRNG_LIGHT_U = 2,
+ PRNG_LIGHT_V = 3,
+ PRNG_LIGHT_TERMINATE = 4,
+ PRNG_TERMINATE = 5,
+ PRNG_PHASE_CHANNEL = 6,
+ PRNG_SCATTER_DISTANCE = 7,
+ PRNG_BOUNCE_NUM = 8,
};
enum SamplingPattern {
@@ -328,24 +316,28 @@ enum PathRayFlag {
PATH_RAY_SINGULAR = (1 << 5),
PATH_RAY_TRANSPARENT = (1 << 6),
- PATH_RAY_SHADOW_OPAQUE = (1 << 7),
- PATH_RAY_SHADOW_TRANSPARENT = (1 << 8),
- PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
+ PATH_RAY_SHADOW_OPAQUE_NON_CATCHER = (1 << 7),
+ PATH_RAY_SHADOW_OPAQUE_CATCHER = (1 << 8),
+ PATH_RAY_SHADOW_OPAQUE = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_OPAQUE_CATCHER),
+ PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER = (1 << 9),
+ PATH_RAY_SHADOW_TRANSPARENT_CATCHER = (1 << 10),
+ PATH_RAY_SHADOW_TRANSPARENT = (PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_CATCHER),
+ PATH_RAY_SHADOW_NON_CATCHER = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER),
+ PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
- PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */
- PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */
+ PATH_RAY_CURVE = (1 << 11), /* visibility flag to define curve segments */
+ PATH_RAY_VOLUME_SCATTER = (1 << 12), /* volume scattering */
/* Special flag to tag unaligned BVH nodes. */
- PATH_RAY_NODE_UNALIGNED = (1 << 11),
+ PATH_RAY_NODE_UNALIGNED = (1 << 13),
- PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1),
+ PATH_RAY_ALL_VISIBILITY = ((1 << 14)-1),
- PATH_RAY_MIS_SKIP = (1 << 12),
- PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13),
- PATH_RAY_SINGLE_PASS_DONE = (1 << 14),
- PATH_RAY_SHADOW_CATCHER = (1 << 15),
- PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16),
- PATH_RAY_STORE_SHADOW_INFO = (1 << 17),
+ PATH_RAY_MIS_SKIP = (1 << 15),
+ PATH_RAY_DIFFUSE_ANCESTOR = (1 << 16),
+ PATH_RAY_SINGLE_PASS_DONE = (1 << 17),
+ PATH_RAY_SHADOW_CATCHER = (1 << 18),
+ PATH_RAY_STORE_SHADOW_INFO = (1 << 19),
};
/* Closure Label */
@@ -462,18 +454,42 @@ typedef enum DenoiseFlag {
DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1,
} DenoiseFlag;
+#ifdef __KERNEL_DEBUG__
+/* NOTE: This is a runtime-only struct, alignment is not
+ * really important here.
+ */
+typedef struct DebugData {
+ int num_bvh_traversed_nodes;
+ int num_bvh_traversed_instances;
+ int num_bvh_intersections;
+ int num_ray_bounces;
+} DebugData;
+#endif
+
+typedef ccl_addr_space struct PathRadianceState {
+#ifdef __PASSES__
+ float3 diffuse;
+ float3 glossy;
+ float3 transmission;
+ float3 subsurface;
+ float3 scatter;
+
+ float3 direct;
+#endif
+} PathRadianceState;
+
typedef ccl_addr_space struct PathRadiance {
#ifdef __PASSES__
int use_light_pass;
#endif
+ float transparent;
float3 emission;
#ifdef __PASSES__
float3 background;
float3 ao;
float3 indirect;
- float3 direct_throughput;
float3 direct_emission;
float3 color_diffuse;
@@ -494,16 +510,12 @@ typedef ccl_addr_space struct PathRadiance {
float3 indirect_subsurface;
float3 indirect_scatter;
- float3 path_diffuse;
- float3 path_glossy;
- float3 path_transmission;
- float3 path_subsurface;
- float3 path_scatter;
-
float4 shadow;
float mist;
#endif
+ struct PathRadianceState state;
+
#ifdef __SHADOW_TRICKS__
/* Total light reachable across the path, ignoring shadow blocked queries. */
float3 path_total;
@@ -515,7 +527,18 @@ typedef ccl_addr_space struct PathRadiance {
float3 path_total_shaded;
/* Color of the background on which shadow is alpha-overed. */
- float3 shadow_color;
+ float3 shadow_background_color;
+
+ /* Path radiance sum and throughput at the moment the ray hits the
+ * shadow catcher object.
+ */
+ float shadow_throughput;
+
+ /* Accumulated transparency along the path after shadow catcher bounce. */
+ float shadow_transparency;
+
+ /* Indicate if any shadow catcher data is set. */
+ int has_shadow_catcher;
#endif
#ifdef __DENOISING_FEATURES__
@@ -523,6 +546,10 @@ typedef ccl_addr_space struct PathRadiance {
float3 denoising_albedo;
float denoising_depth;
#endif /* __DENOISING_FEATURES__ */
+
+#ifdef __KERNEL_DEBUG__
+ DebugData debug_data;
+#endif /* __KERNEL_DEBUG__ */
} PathRadiance;
typedef struct BsdfEval {
@@ -774,20 +801,6 @@ typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
float data[10]; /* pad to 80 bytes */
} ShaderClosure;
-/* Shader Context
- *
- * For OSL we recycle a fixed number of contexts for speed */
-
-typedef enum ShaderContext {
- SHADER_CONTEXT_MAIN = 0,
- SHADER_CONTEXT_INDIRECT = 1,
- SHADER_CONTEXT_EMISSION = 2,
- SHADER_CONTEXT_SHADOW = 3,
- SHADER_CONTEXT_SSS = 4,
- SHADER_CONTEXT_VOLUME = 5,
- SHADER_CONTEXT_NUM = 6
-} ShaderContext;
-
/* Shader Data
*
* Main shader state at a point on the surface or in a volume. All coordinates
@@ -850,7 +863,7 @@ enum ShaderDataFlag {
SD_VOLUME_MIS = (1 << 23),
/* Use cubic interpolation for voxels. */
SD_VOLUME_CUBIC = (1 << 24),
- /* Has data connected to the displacement input. */
+ /* Has data connected to the displacement input or uses bump map. */
SD_HAS_BUMP = (1 << 25),
/* Has true displacement. */
SD_HAS_DISPLACEMENT = (1 << 26),
@@ -991,9 +1004,11 @@ typedef struct PathState {
int flag;
/* random number generator state */
- int rng_offset; /* dimension offset */
- int sample; /* path sample number */
- int num_samples; /* total number of times this path will be sampled */
+ uint rng_hash; /* per pixel hash */
+ int rng_offset; /* dimension offset */
+ int sample; /* path sample number */
+ int num_samples; /* total number of times this path will be sampled */
+ float branch_factor; /* number of branches in indirect paths */
/* bounce counting */
int bounce;
@@ -1016,20 +1031,15 @@ typedef struct PathState {
/* volume rendering */
#ifdef __VOLUME__
int volume_bounce;
- RNG rng_congruential;
+ uint rng_congruential;
VolumeStack volume_stack[VOLUME_STACK_SIZE];
#endif
-
-#ifdef __SHADOW_TRICKS__
- int catcher_object;
-#endif
} PathState;
/* Subsurface */
/* Struct to gather multiple SSS hits. */
-typedef struct SubsurfaceIntersection
-{
+typedef struct SubsurfaceIntersection {
Ray ray;
float3 weight[BSSRDF_MAX_HITS];
@@ -1039,17 +1049,14 @@ typedef struct SubsurfaceIntersection
} SubsurfaceIntersection;
/* Struct to gather SSS indirect rays and delay tracing them. */
-typedef struct SubsurfaceIndirectRays
-{
- bool need_update_volume_stack;
- bool tracing;
+typedef struct SubsurfaceIndirectRays {
PathState state[BSSRDF_MAX_HITS];
- struct PathRadiance direct_L;
int num_rays;
+
struct Ray rays[BSSRDF_MAX_HITS];
float3 throughputs[BSSRDF_MAX_HITS];
- struct PathRadiance L[BSSRDF_MAX_HITS];
+ struct PathRadianceState L_state[BSSRDF_MAX_HITS];
} SubsurfaceIndirectRays;
/* Constant Kernel Data
@@ -1228,7 +1235,6 @@ typedef struct KernelIntegrator {
int portal_offset;
/* bounces */
- int min_bounce;
int max_bounce;
int max_diffuse_bounce;
@@ -1239,7 +1245,6 @@ typedef struct KernelIntegrator {
int ao_bounces;
/* transparent */
- int transparent_min_bounce;
int transparent_max_bounce;
int transparent_shadows;
@@ -1282,7 +1287,7 @@ typedef struct KernelIntegrator {
float light_inv_rr_threshold;
int start_sample;
- int pad1, pad2, pad3;
+ int pad1;
} KernelIntegrator;
static_assert_align(KernelIntegrator, 16);
@@ -1336,18 +1341,6 @@ typedef struct KernelData {
} KernelData;
static_assert_align(KernelData, 16);
-#ifdef __KERNEL_DEBUG__
-/* NOTE: This is a runtime-only struct, alignment is not
- * really important here.
- */
-typedef ccl_addr_space struct DebugData {
- int num_bvh_traversed_nodes;
- int num_bvh_traversed_instances;
- int num_bvh_intersections;
- int num_ray_bounces;
-} DebugData;
-#endif
-
/* Declarations required for split kernel */
/* Macro for queues */
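
The PathTraceDimension change above flattens the dimension layout: the camera sample always reserves PRNG_BASE_NUM = 10 dimensions (PRNG_TIME is no longer guarded by __CAMERA_MOTION__), and each bounce consumes a fixed PRNG_BOUNCE_NUM = 8 dimensions, with the volume dimensions (PRNG_PHASE_CHANNEL, PRNG_SCATTER_DISTANCE) folded into the same per-bounce block instead of living behind __VOLUME__. A minimal sketch of how a flat dimension index follows from this layout; the real helpers live in kernel_random.h and track the offset in PathState::rng_offset rather than taking an explicit bounce argument:

    /* Sketch, not the kernel_random.h code: dimension index for a given
     * bounce and per-bounce dimension such as PRNG_LIGHT_U. */
    static inline int prng_dimension(int bounce, int prng)
    {
        /* Dimensions [0, PRNG_BASE_NUM) belong to the camera sample;
         * every bounce then advances the offset by PRNG_BOUNCE_NUM. */
        return PRNG_BASE_NUM + bounce * PRNG_BOUNCE_NUM + prng;
    }
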
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index 1e472aaf51a..d9c310a893e 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -43,7 +43,7 @@ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
float3 *extinction)
{
sd->P = P;
- shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
+ shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW);
if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER)))
return false;
@@ -69,7 +69,7 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
VolumeShaderCoefficients *coeff)
{
sd->P = P;
- shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, SHADER_CONTEXT_VOLUME);
+ shader_eval_volume(kg, sd, state, state->volume_stack, state->flag);
if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER|SD_EMISSION)))
return false;
@@ -360,7 +360,6 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(
ShaderData *sd,
PathRadiance *L,
ccl_addr_space float3 *throughput,
- RNG *rng,
bool probalistic_scatter)
{
VolumeShaderCoefficients coeff;
@@ -380,13 +379,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(
/* pick random color channel, we use the Veach one-sample
* model with balance heuristic for the channels */
- float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+ float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
int channel = (int)(rphase*3.0f);
- sd->randb_closure = rphase*3.0f - channel;
/* decide if we will hit or miss */
bool scatter = true;
- float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+ float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
if(probalistic_scatter) {
float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
@@ -439,7 +437,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(
float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
float3 transmittance = volume_color_transmittance(sigma_t, ray->t);
float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t);
- path_radiance_accum_emission(L, *throughput, emission, state->bounce);
+ path_radiance_accum_emission(L, state, *throughput, emission);
}
/* modify throughput */
@@ -468,8 +466,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
Ray *ray,
ShaderData *sd,
PathRadiance *L,
- ccl_addr_space float3 *throughput,
- RNG *rng)
+ ccl_addr_space float3 *throughput)
{
float3 tp = *throughput;
const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
@@ -485,10 +482,9 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
/* pick random color channel, we use the Veach one-sample
* model with balance heuristic for the channels */
- float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
- float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+ float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
+ float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
int channel = (int)(rphase*3.0f);
- sd->randb_closure = rphase*3.0f - channel;
bool has_scatter = false;
for(int i = 0; i < max_steps; i++) {
@@ -560,7 +556,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
/* integrate emission attenuated by absorption */
if(L && (closure_flag & SD_EMISSION)) {
float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt);
- path_radiance_accum_emission(L, tp, emission, state->bounce);
+ path_radiance_accum_emission(L, state, tp, emission);
}
/* modify throughput */
@@ -610,15 +606,14 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(
Ray *ray,
PathRadiance *L,
ccl_addr_space float3 *throughput,
- RNG *rng,
bool heterogeneous)
{
shader_setup_from_volume(kg, sd, ray);
if(heterogeneous)
- return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, rng);
+ return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput);
else
- return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, rng, true);
+ return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true);
}
#ifndef __SPLIT_KERNEL__
@@ -846,7 +841,6 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
/* pick random color channel, we use the Veach one-sample
* model with balance heuristic for the channels */
int channel = (int)(rphase*3.0f);
- sd->randb_closure = rphase*3.0f - channel;
float xi = rscatter;
/* probabilistic scattering decision based on transmittance */
@@ -1000,8 +994,8 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
mis_weight = 2.0f*power_heuristic(pdf, distance_pdf);
}
}
- if(sample_t < 1e-6f || pdf == 0.0f) {
- return VOLUME_PATH_SCATTERED;
+ if(sample_t < 0.0f || pdf == 0.0f) {
+ return VOLUME_PATH_MISSED;
}
/* compute transmittance up to this step */
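
The channel selection in the hunks above, channel = (int)(rphase*3.0f), is the Veach one-sample model: a single random number picks one of the three RGB extinction channels and the per-channel pdfs are combined with the balance heuristic, i.e. averaged. A minimal sketch of the selection step, assuming rphase lies in [0, 1) (the extra clamp is only defensive and not present in the kernel code):

    /* One-sample channel pick used for volume distance sampling. */
    static inline int volume_sample_channel(float rphase)
    {
        int channel = (int)(rphase * 3.0f);
        return channel > 2 ? 2 : channel;
    }

    /* Balance heuristic for the one-sample model: the effective pdf is the
     * average of the per-channel pdfs, pdf = (pdf_r + pdf_g + pdf_b) / 3. */

Since the scatter and phase decisions now come straight from path_state_rng_1D(), the fractional part that used to be stored in sd->randb_closure is no longer needed, which is presumably why those assignments are removed.
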
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 28fc5ce1c30..0c11158e8da 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -27,90 +27,54 @@ CCL_NAMESPACE_BEGIN
# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif
-ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg)
-{
- return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples;
-}
-
-ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg)
-{
- return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE;
-}
-
-ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index)
-{
- return ray_index / WORK_POOL_SIZE;
-}
-
-ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool)
-{
- uint total_work_size = kernel_total_work_size(kg);
- uint num_pools = kernel_num_work_pools(kg);
-
- if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) {
- return 0;
- }
-
- uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE;
-
- uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE));
- if(work_pool < remainder / WORK_POOL_SIZE) {
- work_size += WORK_POOL_SIZE;
- }
- else if(work_pool == remainder / WORK_POOL_SIZE) {
- work_size += remainder % WORK_POOL_SIZE;
- }
-
- return work_size;
-}
-
-ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index)
-{
- uint num_pools = kernel_num_work_pools(kg);
- uint pool = work_pool_from_ray_index(kg, ray_index);
-
- return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE)
- + (pool * WORK_POOL_SIZE)
- + (work_index % WORK_POOL_SIZE);
-}
-
/* Returns true if there is work */
-ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index)
+ccl_device bool get_next_work(KernelGlobals *kg,
+ uint thread_index,
+ ccl_private uint *global_work_index)
{
- uint work_pool = work_pool_from_ray_index(kg, ray_index);
- uint pool_size = work_pool_work_size(kg, work_pool);
+ uint total_work_size = kernel_split_params.w
+ * kernel_split_params.h
+ * kernel_split_params.num_samples;
- if(pool_size == 0) {
+ /* With a small amount of work there may be more threads than work, due to
+ * rounding up of the global size; stop such threads immediately. */
+ if(thread_index >= total_work_size) {
return false;
}
- *work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]);
- return (*work_index < pool_size);
-}
+ /* Increase atomic work index counter in pool. */
+ uint pool = thread_index / WORK_POOL_SIZE;
+ uint work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[pool]);
-/* This function assumes that the passed `work` is valid. */
-/* Decode sample number w.r.t. assigned `work`. */
-ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index)
-{
- return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h);
-}
+ /* Map per-pool work index to a global work index. */
+ uint global_size = ccl_global_size(0) * ccl_global_size(1);
+ kernel_assert(global_size % WORK_POOL_SIZE == 0);
+ kernel_assert(thread_index < global_size);
-/* Decode pixel and tile position w.r.t. assigned `work`. */
-ccl_device void get_work_pixel_tile_position(KernelGlobals *kg,
- ccl_private uint *pixel_x,
- ccl_private uint *pixel_y,
- ccl_private uint *tile_x,
- ccl_private uint *tile_y,
- uint work_index,
- uint ray_index)
-{
- uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h);
+ *global_work_index = (work_index / WORK_POOL_SIZE) * global_size
+ + (pool * WORK_POOL_SIZE)
+ + (work_index % WORK_POOL_SIZE);
- *tile_x = pixel_index % kernel_split_params.w;
- *tile_y = pixel_index / kernel_split_params.w;
+ /* Test if all work for this pool is done. */
+ return (*global_work_index < total_work_size);
+}
- *pixel_x = *tile_x + kernel_split_params.x;
- *pixel_y = *tile_y + kernel_split_params.y;
+/* Map global work index to pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(KernelGlobals *kg,
+ uint global_work_index,
+ ccl_private uint *x,
+ ccl_private uint *y,
+ ccl_private uint *sample)
+{
+ uint tile_pixels = kernel_split_params.w * kernel_split_params.h;
+ uint sample_offset = global_work_index / tile_pixels;
+ uint pixel_offset = global_work_index - sample_offset * tile_pixels;
+ uint y_offset = pixel_offset / kernel_split_params.w;
+ uint x_offset = pixel_offset - y_offset * kernel_split_params.w;
+
+ *x = kernel_split_params.x + x_offset;
+ *y = kernel_split_params.y + y_offset;
+ *sample = kernel_split_params.start_sample + sample_offset;
}
CCL_NAMESPACE_END
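
In the rewritten work stealing above, threads are grouped into pools of WORK_POOL_SIZE, each pool increments its own atomic counter, and the per-pool counter value is remapped to a global work index that get_work_pixel() then decodes into tile-relative pixel coordinates and a sample number. The index arithmetic can be reproduced on the host with a small self-contained sketch (pool size, tile width/height and global size are plain parameters here):

    /* Host-side sketch of the index arithmetic in get_next_work() and
     * get_work_pixel(). */
    static unsigned int global_work_index(unsigned int work_index,
                                          unsigned int pool,
                                          unsigned int pool_size,
                                          unsigned int global_size)
    {
        return (work_index / pool_size) * global_size
             + pool * pool_size
             + (work_index % pool_size);
    }

    static void work_pixel(unsigned int global_index,
                           unsigned int w, unsigned int h,
                           unsigned int *x, unsigned int *y, unsigned int *sample)
    {
        unsigned int tile_pixels = w * h;
        *sample = global_index / tile_pixels;
        unsigned int pixel = global_index - *sample * tile_pixels;
        *y = pixel / w;
        *x = pixel - *y * w;
    }

Each pool first fills its own contiguous WORK_POOL_SIZE slots and then jumps ahead by the full global size, so no two pools ever hand out the same work item.
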
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
index 1a7b2040da1..254025be4e2 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
@@ -25,6 +25,7 @@
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
index 9fa39dc9ebb..7ae205b7e14 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_config.h
+++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h
@@ -81,8 +81,13 @@
# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
#endif
-/* compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread */
+/* For the split kernel, using all registers seems fastest for now, but this
+ * is unlikely to be optimal once we resolve other bottlenecks. */
+
+#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */
#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
__launch_bounds__( \
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
index 628891b1458..e97e87285a5 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -90,7 +90,7 @@ kernel_cuda_path_trace_data_init(
#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
extern "C" __global__ void \
- CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
kernel_cuda_##name() \
{ \
kernel_##name(NULL); \
@@ -98,7 +98,7 @@ kernel_cuda_path_trace_data_init(
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
extern "C" __global__ void \
- CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
kernel_cuda_##name() \
{ \
ccl_local type locals; \
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
index ba53ba4b26f..f015ac47d8a 100644
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -235,7 +235,7 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
}
__kernel void kernel_ocl_filter_finalize(int w,
- int h,
+ int h,
ccl_global float *buffer,
ccl_global int *rank,
ccl_global float *XtWX,
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index 078acc1631e..b7108f3d0f8 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -52,9 +52,7 @@ __kernel void kernel_ocl_path_trace(
ccl_global float *buffer,
ccl_global uint *rng_state,
-#define KERNEL_TEX(type, ttype, name) \
- ccl_global type *name,
-#include "kernel/kernel_textures.h"
+ KERNEL_BUFFER_PARAMS,
int sample,
int sx, int sy, int sw, int sh, int offset, int stride)
@@ -63,9 +61,8 @@ __kernel void kernel_ocl_path_trace(
kg->data = data;
-#define KERNEL_TEX(type, ttype, name) \
- kg->name = name;
-#include "kernel/kernel_textures.h"
+ kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+ kernel_set_buffer_info(kg);
int x = sx + ccl_global_id(0);
int y = sy + ccl_global_id(1);
@@ -82,9 +79,7 @@ __kernel void kernel_ocl_shader(
ccl_global float4 *output,
ccl_global float *output_luma,
-#define KERNEL_TEX(type, ttype, name) \
- ccl_global type *name,
-#include "kernel/kernel_textures.h"
+ KERNEL_BUFFER_PARAMS,
int type, int sx, int sw, int offset, int sample)
{
@@ -92,9 +87,8 @@ __kernel void kernel_ocl_shader(
kg->data = data;
-#define KERNEL_TEX(type, ttype, name) \
- kg->name = name;
-#include "kernel/kernel_textures.h"
+ kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+ kernel_set_buffer_info(kg);
int x = sx + ccl_global_id(0);
@@ -114,9 +108,7 @@ __kernel void kernel_ocl_bake(
ccl_global uint4 *input,
ccl_global float4 *output,
-#define KERNEL_TEX(type, ttype, name) \
- ccl_global type *name,
-#include "kernel/kernel_textures.h"
+ KERNEL_BUFFER_PARAMS,
int type, int filter, int sx, int sw, int offset, int sample)
{
@@ -124,9 +116,8 @@ __kernel void kernel_ocl_bake(
kg->data = data;
-#define KERNEL_TEX(type, ttype, name) \
- kg->name = name;
-#include "kernel/kernel_textures.h"
+ kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+ kernel_set_buffer_info(kg);
int x = sx + ccl_global_id(0);
@@ -144,9 +135,7 @@ __kernel void kernel_ocl_convert_to_byte(
ccl_global uchar4 *rgba,
ccl_global float *buffer,
-#define KERNEL_TEX(type, ttype, name) \
- ccl_global type *name,
-#include "kernel/kernel_textures.h"
+ KERNEL_BUFFER_PARAMS,
float sample_scale,
int sx, int sy, int sw, int sh, int offset, int stride)
@@ -155,9 +144,8 @@ __kernel void kernel_ocl_convert_to_byte(
kg->data = data;
-#define KERNEL_TEX(type, ttype, name) \
- kg->name = name;
-#include "kernel/kernel_textures.h"
+ kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+ kernel_set_buffer_info(kg);
int x = sx + ccl_global_id(0);
int y = sy + ccl_global_id(1);
@@ -171,9 +159,7 @@ __kernel void kernel_ocl_convert_to_half_float(
ccl_global uchar4 *rgba,
ccl_global float *buffer,
-#define KERNEL_TEX(type, ttype, name) \
- ccl_global type *name,
-#include "kernel/kernel_textures.h"
+ KERNEL_BUFFER_PARAMS,
float sample_scale,
int sx, int sy, int sw, int sh, int offset, int stride)
@@ -182,9 +168,8 @@ __kernel void kernel_ocl_convert_to_half_float(
kg->data = data;
-#define KERNEL_TEX(type, ttype, name) \
- kg->name = name;
-#include "kernel/kernel_textures.h"
+ kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+ kernel_set_buffer_info(kg);
int x = sx + ccl_global_id(0);
int y = sy + ccl_global_id(1);
@@ -193,7 +178,7 @@ __kernel void kernel_ocl_convert_to_half_float(
kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
}
-__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, ulong size, ulong offset)
+__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset)
{
size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
index 8b85d362f8a..95b35e40a45 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -25,11 +25,7 @@ __kernel void kernel_ocl_path_trace_data_init(
int num_elements,
ccl_global char *ray_state,
ccl_global uint *rng_state,
-
-#define KERNEL_TEX(type, ttype, name) \
- ccl_global type *name,
-#include "kernel/kernel_textures.h"
-
+ KERNEL_BUFFER_PARAMS,
int start_sample,
int end_sample,
int sx, int sy, int sw, int sh, int offset, int stride,
@@ -46,10 +42,7 @@ __kernel void kernel_ocl_path_trace_data_init(
num_elements,
ray_state,
rng_state,
-
-#define KERNEL_TEX(type, ttype, name) name,
-#include "kernel/kernel_textures.h"
-
+ KERNEL_BUFFER_ARGS,
start_sample,
end_sample,
sx, sy, sw, sh, offset, stride,
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
index 651addb02f4..4cbda1bc2e7 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
@@ -14,6 +14,9 @@
* limitations under the License.
*/
+#include "kernel/kernel_compat_opencl.h" // PRECOMPILED
+#include "kernel/split/kernel_split_common.h" // PRECOMPILED
+
#include "kernel/kernels/opencl/kernel_state_buffer_size.cl"
#include "kernel/kernels/opencl/kernel_data_init.cl"
#include "kernel/kernels/opencl/kernel_path_init.cl"
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
index f1e914a70d4..591c3846ef2 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
@@ -25,9 +25,7 @@ __kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
ccl_global char *ray_state,
ccl_global uint *rng_state,
-#define KERNEL_TEX(type, ttype, name) \
- ccl_global type *name,
-#include "kernel/kernel_textures.h"
+ KERNEL_BUFFER_PARAMS,
ccl_global int *queue_index,
ccl_global char *use_queues_flag,
@@ -52,12 +50,9 @@ __kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state);
-#define KERNEL_TEX(type, ttype, name) \
- kg->name = name;
-#include "kernel/kernel_textures.h"
}
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
+ kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
KERNEL_NAME_EVAL(kernel, KERNEL_NAME)(
kg
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 02c083a83f8..9585d9f4825 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -86,7 +86,7 @@ struct OSLThreadData {
OSL::ShaderGlobals globals;
OSL::PerThreadInfo *osl_thread_info;
OSLTraceData tracedata;
- OSL::ShadingContext *context[SHADER_CONTEXT_NUM];
+ OSL::ShadingContext *context;
OIIO::TextureSystem::Perthread *oiio_thread_info;
};
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 1535496c73d..8ad2e12b067 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -1197,8 +1197,9 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
tracedata->init = true;
tracedata->sd.osl_globals = sd->osl_globals;
- /* raytrace */
- return scene_intersect(sd->osl_globals, ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
+ /* Raytrace, leaving out shadow opaque to avoid early exit. */
+ uint visibility = PATH_RAY_ALL_VISIBILITY - PATH_RAY_SHADOW_OPAQUE;
+ return scene_intersect(sd->osl_globals, ray, visibility, &tracedata->isect, NULL, 0.0f, 0.0f);
}
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 13b19d86eca..9a37e0987aa 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -57,9 +57,7 @@ void OSLShader::thread_init(KernelGlobals *kg, KernelGlobals *kernel_globals, OS
tdata->globals.tracedata = &tdata->tracedata;
tdata->globals.flipHandedness = false;
tdata->osl_thread_info = ss->create_thread_info();
-
- for(int i = 0; i < SHADER_CONTEXT_NUM; i++)
- tdata->context[i] = ss->get_context(tdata->osl_thread_info);
+ tdata->context = ss->get_context(tdata->osl_thread_info);
tdata->oiio_thread_info = osl_globals->ts->get_perthread_info();
@@ -74,9 +72,7 @@ void OSLShader::thread_free(KernelGlobals *kg)
OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
OSLThreadData *tdata = kg->osl_tdata;
-
- for(int i = 0; i < SHADER_CONTEXT_NUM; i++)
- ss->release_context(tdata->context[i]);
+ ss->release_context(tdata->context);
ss->destroy_thread_info(tdata->osl_thread_info);
@@ -173,7 +169,7 @@ static void flatten_surface_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx)
+void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -182,7 +178,7 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state
/* execute shader for this point */
OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
OSL::ShaderGlobals *globals = &tdata->globals;
- OSL::ShadingContext *octx = tdata->context[(int)ctx];
+ OSL::ShadingContext *octx = tdata->context;
int shader = sd->shader & SHADER_MASK;
/* automatic bump shader */
@@ -274,7 +270,7 @@ static void flatten_background_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx)
+void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -283,7 +279,7 @@ void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *st
/* execute shader for this point */
OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
OSL::ShaderGlobals *globals = &tdata->globals;
- OSL::ShadingContext *octx = tdata->context[(int)ctx];
+ OSL::ShadingContext *octx = tdata->context;
if(kg->osl->background_state) {
ss->execute(octx, *(kg->osl->background_state), *globals);
@@ -329,7 +325,7 @@ static void flatten_volume_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx)
+void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -338,7 +334,7 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state,
/* execute shader */
OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
OSL::ShaderGlobals *globals = &tdata->globals;
- OSL::ShadingContext *octx = tdata->context[(int)ctx];
+ OSL::ShadingContext *octx = tdata->context;
int shader = sd->shader & SHADER_MASK;
if(kg->osl->volume_state[shader]) {
@@ -352,7 +348,7 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state,
/* Displacement */
-void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx)
+void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -364,7 +360,7 @@ void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderConte
/* execute shader */
OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
OSL::ShaderGlobals *globals = &tdata->globals;
- OSL::ShadingContext *octx = tdata->context[(int)ctx];
+ OSL::ShadingContext *octx = tdata->context;
int shader = sd->shader & SHADER_MASK;
if(kg->osl->displacement_state[shader]) {
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index 32121e940b4..f7020d1223d 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -53,10 +53,10 @@ public:
static void thread_free(KernelGlobals *kg);
/* eval */
- static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
- static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
- static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
- static void eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx);
+ static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
+ static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
+ static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
+ static void eval_displacement(KernelGlobals *kg, ShaderData *sd);
/* attributes */
static int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc);
diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
index 2bb981c3918..6870d479af3 100644
--- a/intern/cycles/kernel/shaders/node_principled_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
@@ -76,8 +76,8 @@ shader node_principled_bsdf(
float aspect = sqrt(1.0 - Anisotropic * 0.9);
float r2 = Roughness * Roughness;
- float alpha_x = max(0.001, r2 / aspect);
- float alpha_y = max(0.001, r2 * aspect);
+ float alpha_x = r2 / aspect;
+ float alpha_y = r2 * aspect;
color tmp_col = color(1.0, 1.0, 1.0) * (1.0 - SpecularTint) + m_ctint * SpecularTint;
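
The principled BSDF hunk above drops the 0.001 clamps on alpha_x and alpha_y, presumably because near-zero roughness is now handled inside the microfacet closures; the anisotropy mapping itself is unchanged. A worked example of that mapping at full anisotropy:

    /* Worked example with Anisotropic = 1.0 and Roughness = 0.5. */
    float aspect  = sqrt(1.0 - 1.0 * 0.9);  /* = sqrt(0.1) ~= 0.3162 */
    float r2      = 0.5 * 0.5;              /* = 0.25 */
    float alpha_x = r2 / aspect;            /* ~= 0.7906 */
    float alpha_y = r2 * aspect;            /* ~= 0.0791, a 10:1 aspect ratio */
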
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
index e2762a85fc8..2313feac089 100644
--- a/intern/cycles/kernel/split/kernel_branched.h
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -87,7 +87,6 @@ ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals
PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
path_radiance_init(inactive_L, kernel_data.film.use_light_pass);
- inactive_L->direct_throughput = L->direct_throughput;
path_radiance_copy_indirect(inactive_L, L);
ray_state[inactive_ray] = RAY_REGENERATED;
@@ -110,7 +109,6 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
ShaderData *sd = saved_sd;
- RNG rng = kernel_split_state.rng[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
float3 throughput = branched_state->throughput;
ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
@@ -157,37 +155,38 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
num_samples = ceil_to_int(num_samples_adjust*num_samples);
float num_samples_inv = num_samples_adjust/num_samples;
- RNG bsdf_rng = cmj_hash(rng, i);
for(int j = branched_state->next_sample; j < num_samples; j++) {
if(reset_path_state) {
*ps = branched_state->path_state;
}
+ ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
+
ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
*tp = throughput;
ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
if(!kernel_branched_path_surface_bounce(kg,
- &bsdf_rng,
sd,
sc,
j,
num_samples,
tp,
ps,
- L,
+ &L->state,
bsdf_ray,
sum_sample_weight))
{
continue;
}
+ ps->rng_hash = branched_state->path_state.rng_hash;
+
/* update state for next iteration */
branched_state->next_closure = i;
branched_state->next_sample = j+1;
- branched_state->num_samples = num_samples;
/* start the indirect path */
*tp *= num_samples_inv;
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index 4c1fdd2d69c..c9e7deddafa 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -75,92 +75,59 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
if(ray_index != QUEUE_EMPTY_SLOT) {
#endif
- ccl_global uint *rng_state = kernel_split_params.rng_state;
int stride = kernel_split_params.stride;
ccl_global char *ray_state = kernel_split_state.ray_state;
-#ifdef __KERNEL_DEBUG__
- DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
-#endif
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index];
- RNG rng = kernel_split_state.rng[ray_index];
- ccl_global float *buffer = kernel_split_params.buffer;
-
- unsigned int work_index;
- ccl_global uint *initial_rng;
-
- unsigned int sample;
- unsigned int tile_x;
- unsigned int tile_y;
- unsigned int pixel_x;
- unsigned int pixel_y;
-
- work_index = kernel_split_state.work_array[ray_index];
- sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
- get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
- &tile_x, &tile_y,
- work_index,
- ray_index);
- initial_rng = rng_state;
-
- rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride;
- buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
-#ifdef __KERNEL_DEBUG__
- kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
-#endif
+ uint sample = state->sample;
+ uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+ ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
/* accumulate result in output buffer */
- bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER);
- kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher);
-
- path_rng_end(kg, rng_state, rng);
+ kernel_write_result(kg, buffer, sample, L);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
}
if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
/* We have completed the current work; get the next work. */
- int valid_work = get_next_work(kg, &work_index, ray_index);
- if(!valid_work) {
+ uint work_index;
+ if(!get_next_work(kg, ray_index, &work_index)) {
/* If work is invalid, this means no more work is available and the thread may exit */
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
}
if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
- kernel_split_state.work_array[ray_index] = work_index;
- /* Get the sample associated with the current work */
- sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
- /* Get pixel and tile position associated with current work */
- get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
+ uint x, y, sample;
+ get_work_pixel(kg, work_index, &x, &y, &sample);
- /* Remap rng_state according to the current work */
- rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride;
- /* Remap buffer according to the current work */
- buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
+ /* Remap rng_state to current pixel. */
+ ccl_global uint *rng_state = kernel_split_params.rng_state;
+ rng_state += kernel_split_params.offset + x + y*stride;
+
+ /* Store buffer offset for writing to passes. */
+ uint buffer_offset = (kernel_split_params.offset + x + y*stride) * kernel_data.film.pass_stride;
+ kernel_split_state.buffer_offset[ray_index] = buffer_offset;
/* Initialize random numbers and ray. */
- kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, &rng, ray);
+ uint rng_hash;
+ kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, ray);
if(ray->t != 0.0f) {
- /* Initialize throughput, L_transparent, Ray, PathState;
+ /* Initialize throughput, path radiance, Ray, PathState;
* These rays proceed with path-iteration.
*/
*throughput = make_float3(1.0f, 1.0f, 1.0f);
- *L_transparent = 0.0f;
path_radiance_init(L, kernel_data.film.use_light_pass);
- path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &rng, sample, ray);
+ path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, rng_hash, sample, ray);
#ifdef __SUBSURFACE__
kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
#endif
-#ifdef __KERNEL_DEBUG__
- debug_data_init(debug_data);
-#endif
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
enqueue_flag = 1;
}
@@ -168,14 +135,13 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
/* These rays do not participate in path-iteration. */
float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
/* Accumulate result in output buffer. */
+ ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
kernel_write_pass_float4(buffer, sample, L_rad);
- path_rng_end(kg, rng_state, rng);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
}
}
}
- kernel_split_state.rng[ray_index] = rng;
#ifndef __COMPUTE_DEVICE_GPU__
}
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index e4545d66eff..2c042dfde6f 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -52,9 +52,7 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
ccl_global uint *rng_state,
#ifdef __KERNEL_OPENCL__
-#define KERNEL_TEX(type, ttype, name) \
- ccl_global type *name,
-#include "kernel/kernel_textures.h"
+ KERNEL_BUFFER_PARAMS,
#endif
int start_sample,
@@ -100,9 +98,8 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
#ifdef __KERNEL_OPENCL__
-#define KERNEL_TEX(type, ttype, name) \
- kg->name = name;
-#include "kernel/kernel_textures.h"
+ kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+ kernel_set_buffer_info(kg);
#endif
int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
@@ -127,14 +124,25 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
/* zero the tiles pixels and initialize rng_state if this is the first sample */
if(start_sample == 0) {
- parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) {
- int pixel = i / kernel_data.film.pass_stride;
- int pass = i % kernel_data.film.pass_stride;
+ int pass_stride = kernel_data.film.pass_stride;
+
+#ifdef __KERNEL_CPU__
+ for(int y = sy; y < sy + sh; y++) {
+ int index = offset + y * stride;
+ memset(buffer + (sx + index) * pass_stride, 0, sizeof(float) * pass_stride * sw);
+ for(int x = sx; x < sx + sw; x++) {
+ rng_state[index + x] = hash_int_2d(x, y);
+ }
+ }
+#else
+ parallel_for(kg, i, sw * sh * pass_stride) {
+ int pixel = i / pass_stride;
+ int pass = i % pass_stride;
int x = sx + pixel % sw;
int y = sy + pixel / sw;
- int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass;
+ int index = (offset + x + y*stride) * pass_stride + pass;
*(buffer + index) = 0.0f;
}
@@ -146,6 +154,7 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
int index = (offset + x + y*stride);
*(rng_state + index) = hash_int_2d(x, y);
}
+#endif
}
#endif /* KERNEL_STUB */
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index 3336c968a44..2aac66ecb84 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -62,8 +62,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
/* direct lighting */
#ifdef __EMISSION__
- RNG rng = kernel_split_state.rng[ray_index];
-
bool flag = (kernel_data.integrator.use_direct_light &&
(sd->flag & SD_BSDF_HAS_EVAL));
@@ -83,23 +81,20 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
if(flag) {
/* Sample illumination from lights to find path contribution. */
- float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT);
float light_u, light_v;
- path_state_rng_2D(kg, &rng, state, PRNG_LIGHT_U, &light_u, &light_v);
- float terminate = path_state_rng_light_termination(kg, &rng, state);
+ path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
+ float terminate = path_state_rng_light_termination(kg, state);
LightSample ls;
if(light_sample(kg,
- light_t, light_u, light_v,
+ light_u, light_v,
sd->time,
sd->P,
state->bounce,
&ls)) {
Ray light_ray;
-# ifdef __OBJECT_MOTION__
light_ray.time = sd->time;
-# endif
BsdfEval L_light;
bool is_lamp;
@@ -115,7 +110,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
}
}
}
- kernel_split_state.rng[ray_index] = rng;
#endif /* __EMISSION__ */
}
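
Note that light_sample() above no longer takes a separate light_t value: the PRNG_LIGHT dimension was dropped from the enum, so the discrete light pick is presumably derived from the 2D (light_u, light_v) sample inside light_sample() itself. The standard trick for folding a discrete choice into a continuous sample looks like this (the helper name and its placement are hypothetical, shown only as an illustration):

    /* Hypothetical helper, not the Cycles implementation: pick a light index
     * from light_u and rescale light_u back into [0, 1) for reuse. */
    static inline int pick_light_index(float *light_u, int num_lights)
    {
        float scaled = *light_u * (float)num_lights;
        int index = (int)scaled;
        if(index > num_lights - 1)
            index = num_lights - 1;
        *light_u = scaled - (float)index;
        return index;
    }
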
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
index 9f8dd2392d9..491487f1230 100644
--- a/intern/cycles/kernel/split/kernel_do_volume.h
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -30,7 +30,6 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K
SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
- RNG rng = kernel_split_state.rng[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
@@ -58,22 +57,21 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous);
+ kg, ps, sd, &volume_ray, L, tp, heterogeneous);
# ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
- kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L);
+ kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L);
/* indirect light bounce */
- if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) {
+ if(!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) {
continue;
}
/* start the indirect path */
branched_state->next_closure = 0;
branched_state->next_sample = j+1;
- branched_state->num_samples = num_samples;
/* Attempting to share too many samples is slow for volumes as it causes us to
* loop here more and have many calls to kernel_volume_integrate which evaluates
@@ -141,7 +139,6 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- RNG rng = kernel_split_state.rng[ray_index];
ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
@@ -165,15 +162,15 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
{
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
+ kg, state, sd, &volume_ray, L, throughput, heterogeneous);
# ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
- kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L);
+ kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
/* indirect light bounce */
- if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+ if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
}
else {
@@ -194,8 +191,6 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
}
# endif /* __BRANCHED_PATH__ */
}
-
- kernel_split_state.rng[ray_index] = rng;
}
# ifdef __BRANCHED_PATH__
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 670a557f084..dffd291012d 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -90,163 +90,58 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
if(ray_index != QUEUE_EMPTY_SLOT) {
#endif
- int stride = kernel_split_params.stride;
-
- unsigned int work_index;
- unsigned int pixel_x;
- unsigned int pixel_y;
-
- unsigned int tile_x;
- unsigned int tile_y;
- unsigned int sample;
-
- RNG rng = kernel_split_state.rng[ray_index];
ccl_global PathState *state = 0x0;
float3 throughput;
ccl_global char *ray_state = kernel_split_state.ray_state;
ShaderData *sd = &kernel_split_state.sd[ray_index];
- ccl_global float *buffer = kernel_split_params.buffer;
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+ ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
+
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
throughput = kernel_split_state.throughput[ray_index];
state = &kernel_split_state.path_state[ray_index];
- work_index = kernel_split_state.work_array[ray_index];
- sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
- get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
- &tile_x, &tile_y,
- work_index,
- ray_index);
-
- buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride;
-
-#ifdef __SHADOW_TRICKS__
- if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
- if(state->flag & PATH_RAY_CAMERA) {
- state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
- state->catcher_object = sd->object;
- if(!kernel_data.background.transparent) {
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- L->shadow_color = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
- }
- }
- }
- else {
- state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
- }
-#endif /* __SHADOW_TRICKS__ */
-
- /* holdout */
-#ifdef __HOLDOUT__
- if(((sd->flag & SD_HOLDOUT) ||
- (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
- (state->flag & PATH_RAY_CAMERA))
+ if(!kernel_path_shader_apply(kg,
+ sd,
+ state,
+ ray,
+ throughput,
+ emission_sd,
+ L,
+ buffer))
{
- if(kernel_data.background.transparent) {
- float3 holdout_weight;
- if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
- holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
- }
- else {
- holdout_weight = shader_holdout_eval(kg, sd);
- }
- /* any throughput is ok, should all be identical here */
- kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput);
- }
- if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
- kernel_split_path_end(kg, ray_index);
- }
+ kernel_split_path_end(kg, ray_index);
}
-#endif /* __HOLDOUT__ */
}
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
-#ifdef __BRANCHED_PATH__
- if(!IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))
-#endif /* __BRANCHED_PATH__ */
- {
- /* Holdout mask objects do not write data passes. */
- kernel_write_data_passes(kg,
- buffer,
- L,
- sd,
- sample,
- state,
- throughput);
- }
-
- /* Blurring of bsdf after bounces, for rays that have a small likelihood
- * of following this particular path (diffuse, rough glossy.
- */
-#ifndef __BRANCHED_PATH__
- if(kernel_data.integrator.filter_glossy != FLT_MAX)
-#else
- if(kernel_data.integrator.filter_glossy != FLT_MAX &&
- (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)))
-#endif /* __BRANCHED_PATH__ */
- {
- float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
- if(blur_pdf < 1.0f) {
- float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
- shader_bsdf_blur(kg, sd, blur_roughness);
- }
- }
-
-#ifdef __EMISSION__
- /* emission */
- if(sd->flag & SD_EMISSION) {
- /* TODO(sergey): is isect.t wrong here for transparent surfaces? */
- float3 emission = indirect_primitive_emission(
- kg,
- sd,
- kernel_split_state.isect[ray_index].t,
- state->flag,
- state->ray_pdf);
- path_radiance_accum_emission(L, throughput, emission, state->bounce);
- }
-#endif /* __EMISSION__ */
-
		/* Path termination. This is a strange place to put the termination; it's
		 * mainly due to the mixed-in MIS that we use. It gives too many unneeded
		 * shader evaluations; we only need emission if we are going to terminate.
		 */
-#ifndef __BRANCHED_PATH__
- float probability = path_state_terminate_probability(kg, state, throughput);
-#else
- float probability = 1.0f;
-
- if(!kernel_data.integrator.branched) {
- probability = path_state_terminate_probability(kg, state, throughput);
- }
- else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
- int num_samples = kernel_split_state.branched_state[ray_index].num_samples;
- probability = path_state_terminate_probability(kg, state, throughput*num_samples);
- }
- else if(state->flag & PATH_RAY_TRANSPARENT) {
- probability = path_state_terminate_probability(kg, state, throughput);
- }
-#endif
+ float probability = path_state_continuation_probability(kg, state, throughput);
if(probability == 0.0f) {
kernel_split_path_end(kg, ray_index);
}
-
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- if(probability != 1.0f) {
- float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE);
- if(terminate >= probability) {
- kernel_split_path_end(kg, ray_index);
- }
- else {
- kernel_split_state.throughput[ray_index] = throughput/probability;
- }
+ else if(probability < 1.0f) {
+ float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
+ if(terminate >= probability) {
+ kernel_split_path_end(kg, ray_index);
}
+ else {
+ kernel_split_state.throughput[ray_index] = throughput/probability;
+ }
+ }
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
kernel_update_denoising_features(kg, sd, state, L);
}
}
@@ -260,8 +155,6 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
}
#endif /* __AO__ */
- kernel_split_state.rng[ray_index] = rng;
-
#ifndef __COMPUTE_DEVICE_GPU__
}
#endif
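The termination logic above is plain Russian roulette: a path continues with probability p and, if it survives, its throughput is divided by p so the estimator stays unbiased. A minimal sketch (russian_roulette, its arguments and return convention are illustrative names, not Cycles API):

static int russian_roulette(float probability, float rand01, float *throughput)
{
	if(probability == 0.0f)
		return 0;                    /* terminate immediately */
	if(probability < 1.0f) {
		if(rand01 >= probability)
			return 0;                /* terminated by the roulette */
		*throughput /= probability;  /* re-weight surviving paths */
	}
	return 1;                        /* keep tracing */
}

In the split kernel, rand01 corresponds to path_state_rng_1D(kg, state, PRNG_TERMINATE) and terminating maps to kernel_split_path_end().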
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
index f0ebb90f60a..437043a5971 100644
--- a/intern/cycles/kernel/split/kernel_indirect_background.h
+++ b/intern/cycles/kernel/split/kernel_indirect_background.h
@@ -33,7 +33,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
if(ray_index != QUEUE_EMPTY_SLOT) {
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- if(state->bounce > kernel_data.integrator.ao_bounces) {
+ if(path_state_ao_bounce(kg, state)) {
kernel_split_path_end(kg, ray_index);
}
}
@@ -50,33 +50,16 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
return;
}
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index];
-
if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- /* eval background shader if nothing hit */
- if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
- *L_transparent = (*L_transparent) + average((*throughput));
-#ifdef __PASSES__
- if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
- kernel_split_path_end(kg, ray_index);
- }
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ float3 throughput = kernel_split_state.throughput[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
- if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-#ifdef __BACKGROUND__
- /* sample background shader */
- float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
- path_radiance_accum_background(L, state, (*throughput), L_background);
-#endif
- kernel_split_path_end(kg, ray_index);
- }
+ kernel_path_background(kg, state, ray, throughput, emission_sd, L);
+ kernel_split_path_end(kg, ray_index);
}
-
-
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
index 82bc2f01fd7..e9fe5552e8c 100644
--- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h
+++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
@@ -54,7 +54,6 @@ ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
#endif
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
- kernel_path_subsurface_accum_indirect(ss_indirect, L);
		/* Trace indirect subsurface rays by restarting the loop. This uses less
		 * stack memory than invoking kernel_path_indirect.
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
index c669d79ddcd..448456d167d 100644
--- a/intern/cycles/kernel/split/kernel_lamp_emission.h
+++ b/intern/cycles/kernel/split/kernel_lamp_emission.h
@@ -57,27 +57,10 @@ ccl_device void kernel_lamp_emission(KernelGlobals *kg)
float3 throughput = kernel_split_state.throughput[ray_index];
Ray ray = kernel_split_state.ray[ray_index];
+ ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
+ ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
-#ifdef __LAMP_MIS__
- if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
- /* ray starting from previous non-transparent bounce */
- Ray light_ray;
-
- light_ray.P = ray.P - state->ray_t*ray.D;
- state->ray_t += kernel_split_state.isect[ray_index].t;
- light_ray.D = ray.D;
- light_ray.t = state->ray_t;
- light_ray.time = ray.time;
- light_ray.dD = ray.dD;
- light_ray.dP = ray.dP;
- /* intersect with lamp */
- float3 emission;
-
- if(indirect_lamp_emission(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &light_ray, &emission)) {
- path_radiance_accum_emission(L, throughput, emission, state->bounce);
- }
- }
-#endif /* __LAMP_MIS__ */
+ kernel_path_lamp_emission(kg, state, &ray, throughput, isect, emission_sd, L);
}
}
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 7758e35fd32..c3373174582 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -126,7 +126,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
if(active) {
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- RNG rng = kernel_split_state.rng[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
@@ -135,7 +134,7 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
#endif
/* Compute direct lighting and next bounce. */
- if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+ if(!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) {
kernel_split_path_end(kg, ray_index);
}
#ifdef __BRANCHED_PATH__
@@ -157,8 +156,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
}
}
#endif /* __BRANCHED_PATH__ */
-
- kernel_split_state.rng[ray_index] = rng;
}
/* Enqueue RAY_UPDATE_BUFFER rays. */
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
index a7ecde7c80d..0ab2289348b 100644
--- a/intern/cycles/kernel/split/kernel_path_init.h
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@@ -29,77 +29,59 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
*/
kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
- unsigned int my_sample;
- unsigned int pixel_x;
- unsigned int pixel_y;
- unsigned int tile_x;
- unsigned int tile_y;
-
- unsigned int work_index = 0;
/* Get work. */
- if(!get_next_work(kg, &work_index, ray_index)) {
+ uint work_index;
+ if(!get_next_work(kg, ray_index, &work_index)) {
/* No more work, mark ray as inactive */
kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
return;
}
- /* Get the sample associated with the work. */
- my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-
- /* Get pixel and tile position associated with the work. */
- get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
- &tile_x, &tile_y,
- work_index,
- ray_index);
- kernel_split_state.work_array[ray_index] = work_index;
+ uint x, y, sample;
+ get_work_pixel(kg, work_index, &x, &y, &sample);
+ /* Remap rng_state and buffer to current pixel. */
ccl_global uint *rng_state = kernel_split_params.rng_state;
- rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
-
- ccl_global float *buffer = kernel_split_params.buffer;
- buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride;
+ rng_state += kernel_split_params.offset + x + y*kernel_split_params.stride;
- RNG rng = kernel_split_state.rng[ray_index];
+ /* Store buffer offset for writing to passes. */
+ uint buffer_offset = (kernel_split_params.offset + x + y*kernel_split_params.stride) * kernel_data.film.pass_stride;
+ kernel_split_state.buffer_offset[ray_index] = buffer_offset;
/* Initialize random numbers and ray. */
+ uint rng_hash;
kernel_path_trace_setup(kg,
rng_state,
- my_sample,
- pixel_x, pixel_y,
- &rng,
+ sample,
+ x, y,
+ &rng_hash,
&kernel_split_state.ray[ray_index]);
if(kernel_split_state.ray[ray_index].t != 0.0f) {
- /* Initialize throughput, L_transparent, Ray, PathState;
+ /* Initialize throughput, path radiance, Ray, PathState;
* These rays proceed with path-iteration.
*/
kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
- kernel_split_state.L_transparent[ray_index] = 0.0f;
path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass);
path_state_init(kg,
&kernel_split_state.sd_DL_shadow[ray_index],
&kernel_split_state.path_state[ray_index],
- &rng,
- my_sample,
+ rng_hash,
+ sample,
&kernel_split_state.ray[ray_index]);
#ifdef __SUBSURFACE__
kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
#endif
-
-#ifdef __KERNEL_DEBUG__
- debug_data_init(&kernel_split_state.debug_data[ray_index]);
-#endif
}
else {
/* These rays do not participate in path-iteration. */
float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
/* Accumulate result in output buffer. */
- kernel_write_pass_float4(buffer, my_sample, L_rad);
- path_rng_end(kg, rng_state, kernel_split_state.rng[ray_index]);
+ ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
+ kernel_write_pass_float4(buffer, sample, L_rad);
ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
}
- kernel_split_state.rng[ray_index] = rng;
}
CCL_NAMESPACE_END
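The buffer_offset stored above follows the usual tile addressing: one rng_state entry per pixel and kernel_data.film.pass_stride floats of render-buffer space per pixel. A minimal sketch of the arithmetic (variable names are illustrative):

	int pixel_index   = offset + x + y*stride;      /* index into rng_state         */
	int buffer_offset = pixel_index * pass_stride;  /* float offset into the buffer */

Storing buffer_offset per ray lets later kernels address the pixel's passes directly, without re-deriving (x, y) from the work item as the removed get_work_pixel_tile_position() path did.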
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index 45984ca509b..f5378bc172b 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -59,52 +59,14 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg)
return;
}
-#ifdef __KERNEL_DEBUG__
- DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
-#endif
- Intersection isect;
- PathState state = kernel_split_state.path_state[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
Ray ray = kernel_split_state.ray[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- /* intersect scene */
- uint visibility = path_state_ray_visibility(kg, &state);
-
- if(state.bounce > kernel_data.integrator.ao_bounces) {
- visibility = PATH_RAY_SHADOW;
- ray.t = kernel_data.background.ao_distance;
- }
-
-#ifdef __HAIR__
- float difl = 0.0f, extmax = 0.0f;
- uint lcg_state = 0;
- RNG rng = kernel_split_state.rng[ray_index];
-
- if(kernel_data.bvh.have_curves) {
- if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
- float3 pixdiff = ray.dD.dx + ray.dD.dy;
- /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
- difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
- }
-
- extmax = kernel_data.curve.maximum_width;
- lcg_state = lcg_state_init(&rng, state.rng_offset, state.sample, 0x51633e2d);
- }
-
- bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
- bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif
+ Intersection isect;
+ bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L);
kernel_split_state.isect[ray_index] = isect;
-#ifdef __KERNEL_DEBUG__
- if(state.flag & PATH_RAY_CAMERA) {
- debug_data->num_bvh_traversed_nodes += isect.num_traversed_nodes;
- debug_data->num_bvh_traversed_instances += isect.num_traversed_instances;
- debug_data->num_bvh_intersections += isect.num_intersections;
- }
- debug_data->num_ray_bounces++;
-#endif
-
if(!hit) {
/* Change the state of rays that hit the background;
* These rays undergo special processing in the
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index 2801b32f285..7032461b04a 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -48,30 +48,18 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg)
ccl_global char *ray_state = kernel_split_state.ray_state;
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- RNG rng = kernel_split_state.rng[ray_index];
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-#ifndef __BRANCHED_PATH__
- float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
- shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
-#else
- ShaderContext ctx = SHADER_CONTEXT_MAIN;
- float rbsdf = 0.0f;
-
- if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
- rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
-
+ shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, state->flag);
+#ifdef __BRANCHED_PATH__
+ if(kernel_data.integrator.branched) {
+ shader_merge_closures(&kernel_split_state.sd[ray_index]);
}
-
- if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
- ctx = SHADER_CONTEXT_INDIRECT;
+ else
+#endif
+ {
+ shader_prepare_closures(&kernel_split_state.sd[ray_index], state);
}
-
- shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx);
- shader_merge_closures(&kernel_split_state.sd[ray_index]);
-#endif /* __BRANCHED_PATH__ */
-
- kernel_split_state.rng[ray_index] = rng;
}
}
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
index 297decb0bc2..5a55b680695 100644
--- a/intern/cycles/kernel/split/kernel_shader_sort.h
+++ b/intern/cycles/kernel/split/kernel_shader_sort.h
@@ -39,7 +39,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
ccl_local ushort *local_index = &locals->local_index[0];
/* copy to local memory */
- for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+ for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
uint idx = offset + i + lid;
uint add = input + idx;
uint value = (~0);
@@ -59,9 +59,9 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
# ifdef __KERNEL_OPENCL__
/* bitonic sort */
- for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
- for (uint inc = length; inc > 0; inc >>= 1) {
- for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
+ for(uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
+ for(uint inc = length; inc > 0; inc >>= 1) {
+ for(uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
uint i = lid + ii;
bool direction = ((i & (length << 1)) != 0);
uint j = i ^ inc;
@@ -81,7 +81,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
# endif /* __KERNEL_OPENCL__ */
/* copy to destination */
- for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+ for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
uint idx = offset + i + lid;
uint lidx = local_index[i + lid];
uint outi = output + idx;
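The loops above implement a bitonic sorting network over SHADER_SORT_BLOCK_SIZE keys held in local memory, split across SHADER_SORT_LOCAL_SIZE work items. A serial plain-C sketch of the same network (n must be a power of two; bitonic_sort is an illustrative name):

static void bitonic_sort(unsigned int *key, int n)
{
	for(int length = 1; length < n; length <<= 1) {
		for(int inc = length; inc > 0; inc >>= 1) {
			for(int i = 0; i < n; i++) {
				int j = i ^ inc;
				if(j <= i)
					continue;
				/* Direction alternates per 2*length block, building bitonic runs. */
				int descending = (i & (length << 1)) != 0;
				int do_swap = descending ? (key[i] < key[j]) : (key[i] > key[j]);
				if(do_swap) {
					unsigned int t = key[i];
					key[i] = key[j];
					key[j] = t;
				}
			}
		}
	}
}

In the kernel the inner i-loop is distributed over the work group (hence the local-memory synchronization between passes), and local_index is carried along so the sorted order can be written back in the copy-to-destination loop; the sketch only sorts the keys serially.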
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
index 474286285a9..79aa2c9435b 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
@@ -37,21 +37,18 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- RNG rng = kernel_split_state.rng[ray_index];
float3 throughput = kernel_split_state.throughput[ray_index];
#ifdef __BRANCHED_PATH__
if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
#endif
- kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd));
+ kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd));
#ifdef __BRANCHED_PATH__
}
else {
- kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput);
+ kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput);
}
#endif
-
- kernel_split_state.rng[ray_index] = rng;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
index 78e61709b01..b52f9a5eb81 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -45,7 +45,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ShaderData *sd = &kernel_split_state.sd[ray_index];
float3 throughput = kernel_split_state.throughput[ray_index];
- RNG rng = kernel_split_state.rng[ray_index];
BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
@@ -75,7 +74,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
if(use_branched) {
kernel_branched_path_surface_connect_light(kg,
- &rng,
sd,
emission_sd,
state,
@@ -91,10 +89,11 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
float3 shadow;
if(!shadow_blocked(kg,
- emission_sd,
- state,
- &ray,
- &shadow))
+ sd,
+ emission_sd,
+ state,
+ &ray,
+ &shadow))
{
/* accumulate */
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
@@ -103,8 +102,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
path_radiance_accum_total_light(L, state, throughput, &L_light);
}
}
-
- kernel_split_state.rng[ray_index] = rng;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 08f0124b529..558d327bc76 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -63,7 +63,7 @@ ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray];
path_radiance_sum_indirect(L);
- path_radiance_accum_sample(orig_ray_L, L, 1);
+ path_radiance_accum_sample(orig_ray_L, L);
atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count);
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
index 4bb2f0d3d80..c58c8463f5c 100644
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -56,14 +56,6 @@ typedef struct SplitParams {
/* SPLIT_DATA_ENTRY(type, name, num) */
-#if defined(WITH_CYCLES_DEBUG) || defined(__KERNEL_DEBUG__)
-/* DebugData memory */
-# define SPLIT_DATA_DEBUG_ENTRIES \
- SPLIT_DATA_ENTRY(DebugData, debug_data, 1)
-#else
-# define SPLIT_DATA_DEBUG_ENTRIES
-#endif /* DEBUG */
-
#ifdef __BRANCHED_PATH__
typedef ccl_global struct SplitBranchedState {
@@ -80,7 +72,6 @@ typedef ccl_global struct SplitBranchedState {
/* indirect loop state */
int next_closure;
int next_sample;
- int num_samples;
#ifdef __SUBSURFACE__
int ss_next_closure;
@@ -122,9 +113,7 @@ typedef ccl_global struct SplitBranchedState {
#endif /* __VOLUME__ */
#define SPLIT_DATA_ENTRIES \
- SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
- SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \
SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
@@ -133,19 +122,16 @@ typedef ccl_global struct SplitBranchedState {
SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \
- SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
+ SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
SPLIT_DATA_SUBSURFACE_ENTRIES \
SPLIT_DATA_VOLUME_ENTRIES \
SPLIT_DATA_BRANCHED_ENTRIES \
- SPLIT_DATA_DEBUG_ENTRIES \
/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) */
#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
- SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
- SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \
SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
@@ -158,7 +144,6 @@ typedef ccl_global struct SplitBranchedState {
SPLIT_DATA_SUBSURFACE_ENTRIES \
SPLIT_DATA_VOLUME_ENTRIES \
SPLIT_DATA_BRANCHED_ENTRIES \
- SPLIT_DATA_DEBUG_ENTRIES \
/* struct that holds pointers to data in the shared state buffer */
typedef struct SplitData {
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
index d5083b23f80..3b957856aea 100644
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -38,7 +38,6 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
ShaderData *sd = &branched_state->sd;
- RNG rng = kernel_split_state.rng[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
@@ -52,14 +51,12 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
branched_state->next_closure == 0 && branched_state->next_sample == 0)
{
- branched_state->lcg_state = lcg_state_init(&rng,
- branched_state->path_state.rng_offset,
- branched_state->path_state.sample,
- 0x68bc21eb);
+ branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state,
+ 0x68bc21eb);
}
int num_samples = kernel_data.integrator.subsurface_samples;
float num_samples_inv = 1.0f/num_samples;
- RNG bssrdf_rng = cmj_hash(rng, i);
+ uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
/* do subsurface scatter step with copy of shader data, this will
* replace the BSSRDF with a diffuse BSDF closure */
@@ -67,7 +64,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect;
float bssrdf_u, bssrdf_v;
path_branched_rng_2D(kg,
- &bssrdf_rng,
+ bssrdf_rng_hash,
&branched_state->path_state,
j,
num_samples,
@@ -77,7 +74,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
/* intersection is expensive so avoid doing multiple times for the same input */
if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) {
- RNG lcg_state = branched_state->lcg_state;
+ uint lcg_state = branched_state->lcg_state;
SubsurfaceIntersection ss_isect_private;
branched_state->num_hits = subsurface_scatter_multi_intersect(kg,
@@ -152,7 +149,6 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
int all = (kernel_data.integrator.sample_all_lights_direct) ||
(branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER);
kernel_branched_path_surface_connect_light(kg,
- &rng,
bssrdf_sd,
emission_sd,
hit_state,
@@ -229,7 +225,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- RNG rng = kernel_split_state.rng[ray_index];
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
@@ -246,7 +241,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
emission_sd,
L,
state,
- &rng,
ray,
throughput,
ss_indirect))
@@ -256,21 +250,17 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
#ifdef __BRANCHED_PATH__
}
else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
- float bssrdf_probability;
- ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+ float bssrdf_u, bssrdf_v;
+ path_state_rng_2D(kg,
+ state,
+ PRNG_BSDF_U,
+ &bssrdf_u, &bssrdf_v);
- /* modify throughput for picking bssrdf or bsdf */
- *throughput *= bssrdf_probability;
+ const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
/* do bssrdf scatter step if we picked a bssrdf closure */
if(sc) {
- uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb);
- float bssrdf_u, bssrdf_v;
- path_state_rng_2D(kg,
- &rng,
- state,
- PRNG_BSDF_U,
- &bssrdf_u, &bssrdf_v);
+ uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
subsurface_scatter_step(kg,
sd,
state,
@@ -290,7 +280,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
}
#endif
}
- kernel_split_state.rng[ray_index] = rng;
}
# ifdef __BRANCHED_PATH__
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 7704aa545c8..4268813b263 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -280,8 +280,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
float aspect = safe_sqrtf(1.0f - anisotropic * 0.9f);
float r2 = roughness * roughness;
- bsdf->alpha_x = fmaxf(0.001f, r2 / aspect);
- bsdf->alpha_y = fmaxf(0.001f, r2 * aspect);
+ bsdf->alpha_x = r2 / aspect;
+ bsdf->alpha_y = r2 * aspect;
float m_cdlum = 0.3f * base_color.x + 0.6f * base_color.y + 0.1f * base_color.z; // luminance approx.
float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(0.0f, 0.0f, 0.0f); // normalize lum. to isolate hue+sat
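The two lines changed above are the anisotropic roughness remap (aspect = sqrt(1 - 0.9*anisotropic), as in the Disney BRDF notes); the change drops the 0.001 lower clamp on alpha. A worked restatement of the formula already shown, with illustrative end-point values:

	float aspect = safe_sqrtf(1.0f - anisotropic * 0.9f);
	float r2 = roughness * roughness;
	float alpha_x = r2 / aspect;   /* anisotropic = 0: aspect = 1, so alpha_x == alpha_y == r2  */
	float alpha_y = r2 * aspect;   /* anisotropic = 1: aspect ~= 0.316, so alpha_x ~= 3.16*r2
	                                * and alpha_y ~= 0.316*r2                                   */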
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 8e45dbfa5ff..6d6e92e73f6 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -16,19 +16,6 @@
CCL_NAMESPACE_BEGIN
-/* Float4 textures on various devices. */
-#if defined(__KERNEL_CPU__)
-# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CPU
-#elif defined(__KERNEL_CUDA__)
-# if __CUDA_ARCH__ < 300
-# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA
-# else
-# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA_KEPLER
-# endif
-#else
-# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_OPENCL
-#endif
-
ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
{
#ifdef __KERNEL_CPU__
@@ -50,94 +37,94 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
switch(id) {
case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break;
- case 1: r = kernel_tex_image_interp(__tex_image_float4_001, x, y); break;
- case 2: r = kernel_tex_image_interp(__tex_image_float4_002, x, y); break;
- case 3: r = kernel_tex_image_interp(__tex_image_float4_003, x, y); break;
- case 4: r = kernel_tex_image_interp(__tex_image_float4_004, x, y); break;
- case 5: r = kernel_tex_image_interp(__tex_image_byte4_005, x, y); break;
- case 6: r = kernel_tex_image_interp(__tex_image_byte4_006, x, y); break;
- case 7: r = kernel_tex_image_interp(__tex_image_byte4_007, x, y); break;
- case 8: r = kernel_tex_image_interp(__tex_image_byte4_008, x, y); break;
+ case 8: r = kernel_tex_image_interp(__tex_image_float4_008, x, y); break;
+ case 16: r = kernel_tex_image_interp(__tex_image_float4_016, x, y); break;
+ case 24: r = kernel_tex_image_interp(__tex_image_float4_024, x, y); break;
+ case 32: r = kernel_tex_image_interp(__tex_image_float4_032, x, y); break;
+ case 1: r = kernel_tex_image_interp(__tex_image_byte4_001, x, y); break;
case 9: r = kernel_tex_image_interp(__tex_image_byte4_009, x, y); break;
- case 10: r = kernel_tex_image_interp(__tex_image_byte4_010, x, y); break;
- case 11: r = kernel_tex_image_interp(__tex_image_byte4_011, x, y); break;
- case 12: r = kernel_tex_image_interp(__tex_image_byte4_012, x, y); break;
- case 13: r = kernel_tex_image_interp(__tex_image_byte4_013, x, y); break;
- case 14: r = kernel_tex_image_interp(__tex_image_byte4_014, x, y); break;
- case 15: r = kernel_tex_image_interp(__tex_image_byte4_015, x, y); break;
- case 16: r = kernel_tex_image_interp(__tex_image_byte4_016, x, y); break;
case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break;
- case 18: r = kernel_tex_image_interp(__tex_image_byte4_018, x, y); break;
- case 19: r = kernel_tex_image_interp(__tex_image_byte4_019, x, y); break;
- case 20: r = kernel_tex_image_interp(__tex_image_byte4_020, x, y); break;
- case 21: r = kernel_tex_image_interp(__tex_image_byte4_021, x, y); break;
- case 22: r = kernel_tex_image_interp(__tex_image_byte4_022, x, y); break;
- case 23: r = kernel_tex_image_interp(__tex_image_byte4_023, x, y); break;
- case 24: r = kernel_tex_image_interp(__tex_image_byte4_024, x, y); break;
case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break;
- case 26: r = kernel_tex_image_interp(__tex_image_byte4_026, x, y); break;
- case 27: r = kernel_tex_image_interp(__tex_image_byte4_027, x, y); break;
- case 28: r = kernel_tex_image_interp(__tex_image_byte4_028, x, y); break;
- case 29: r = kernel_tex_image_interp(__tex_image_byte4_029, x, y); break;
- case 30: r = kernel_tex_image_interp(__tex_image_byte4_030, x, y); break;
- case 31: r = kernel_tex_image_interp(__tex_image_byte4_031, x, y); break;
- case 32: r = kernel_tex_image_interp(__tex_image_byte4_032, x, y); break;
case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break;
- case 34: r = kernel_tex_image_interp(__tex_image_byte4_034, x, y); break;
- case 35: r = kernel_tex_image_interp(__tex_image_byte4_035, x, y); break;
- case 36: r = kernel_tex_image_interp(__tex_image_byte4_036, x, y); break;
- case 37: r = kernel_tex_image_interp(__tex_image_byte4_037, x, y); break;
- case 38: r = kernel_tex_image_interp(__tex_image_byte4_038, x, y); break;
- case 39: r = kernel_tex_image_interp(__tex_image_byte4_039, x, y); break;
- case 40: r = kernel_tex_image_interp(__tex_image_byte4_040, x, y); break;
case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break;
- case 42: r = kernel_tex_image_interp(__tex_image_byte4_042, x, y); break;
- case 43: r = kernel_tex_image_interp(__tex_image_byte4_043, x, y); break;
- case 44: r = kernel_tex_image_interp(__tex_image_byte4_044, x, y); break;
- case 45: r = kernel_tex_image_interp(__tex_image_byte4_045, x, y); break;
- case 46: r = kernel_tex_image_interp(__tex_image_byte4_046, x, y); break;
- case 47: r = kernel_tex_image_interp(__tex_image_byte4_047, x, y); break;
- case 48: r = kernel_tex_image_interp(__tex_image_byte4_048, x, y); break;
case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break;
- case 50: r = kernel_tex_image_interp(__tex_image_byte4_050, x, y); break;
- case 51: r = kernel_tex_image_interp(__tex_image_byte4_051, x, y); break;
- case 52: r = kernel_tex_image_interp(__tex_image_byte4_052, x, y); break;
- case 53: r = kernel_tex_image_interp(__tex_image_byte4_053, x, y); break;
- case 54: r = kernel_tex_image_interp(__tex_image_byte4_054, x, y); break;
- case 55: r = kernel_tex_image_interp(__tex_image_byte4_055, x, y); break;
- case 56: r = kernel_tex_image_interp(__tex_image_byte4_056, x, y); break;
case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break;
- case 58: r = kernel_tex_image_interp(__tex_image_byte4_058, x, y); break;
- case 59: r = kernel_tex_image_interp(__tex_image_byte4_059, x, y); break;
- case 60: r = kernel_tex_image_interp(__tex_image_byte4_060, x, y); break;
- case 61: r = kernel_tex_image_interp(__tex_image_byte4_061, x, y); break;
- case 62: r = kernel_tex_image_interp(__tex_image_byte4_062, x, y); break;
- case 63: r = kernel_tex_image_interp(__tex_image_byte4_063, x, y); break;
- case 64: r = kernel_tex_image_interp(__tex_image_byte4_064, x, y); break;
case 65: r = kernel_tex_image_interp(__tex_image_byte4_065, x, y); break;
- case 66: r = kernel_tex_image_interp(__tex_image_byte4_066, x, y); break;
- case 67: r = kernel_tex_image_interp(__tex_image_byte4_067, x, y); break;
- case 68: r = kernel_tex_image_interp(__tex_image_byte4_068, x, y); break;
- case 69: r = kernel_tex_image_interp(__tex_image_byte4_069, x, y); break;
- case 70: r = kernel_tex_image_interp(__tex_image_byte4_070, x, y); break;
- case 71: r = kernel_tex_image_interp(__tex_image_byte4_071, x, y); break;
- case 72: r = kernel_tex_image_interp(__tex_image_byte4_072, x, y); break;
case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break;
- case 74: r = kernel_tex_image_interp(__tex_image_byte4_074, x, y); break;
- case 75: r = kernel_tex_image_interp(__tex_image_byte4_075, x, y); break;
- case 76: r = kernel_tex_image_interp(__tex_image_byte4_076, x, y); break;
- case 77: r = kernel_tex_image_interp(__tex_image_byte4_077, x, y); break;
- case 78: r = kernel_tex_image_interp(__tex_image_byte4_078, x, y); break;
- case 79: r = kernel_tex_image_interp(__tex_image_byte4_079, x, y); break;
- case 80: r = kernel_tex_image_interp(__tex_image_byte4_080, x, y); break;
case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break;
- case 82: r = kernel_tex_image_interp(__tex_image_byte4_082, x, y); break;
- case 83: r = kernel_tex_image_interp(__tex_image_byte4_083, x, y); break;
- case 84: r = kernel_tex_image_interp(__tex_image_byte4_084, x, y); break;
- case 85: r = kernel_tex_image_interp(__tex_image_byte4_085, x, y); break;
- case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break;
- case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break;
- case 88: r = kernel_tex_image_interp(__tex_image_byte4_088, x, y); break;
+ case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break;
+ case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break;
+ case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break;
+ case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break;
+ case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break;
+ case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break;
+ case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break;
+ case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break;
+ case 153: r = kernel_tex_image_interp(__tex_image_byte4_153, x, y); break;
+ case 161: r = kernel_tex_image_interp(__tex_image_byte4_161, x, y); break;
+ case 169: r = kernel_tex_image_interp(__tex_image_byte4_169, x, y); break;
+ case 177: r = kernel_tex_image_interp(__tex_image_byte4_177, x, y); break;
+ case 185: r = kernel_tex_image_interp(__tex_image_byte4_185, x, y); break;
+ case 193: r = kernel_tex_image_interp(__tex_image_byte4_193, x, y); break;
+ case 201: r = kernel_tex_image_interp(__tex_image_byte4_201, x, y); break;
+ case 209: r = kernel_tex_image_interp(__tex_image_byte4_209, x, y); break;
+ case 217: r = kernel_tex_image_interp(__tex_image_byte4_217, x, y); break;
+ case 225: r = kernel_tex_image_interp(__tex_image_byte4_225, x, y); break;
+ case 233: r = kernel_tex_image_interp(__tex_image_byte4_233, x, y); break;
+ case 241: r = kernel_tex_image_interp(__tex_image_byte4_241, x, y); break;
+ case 249: r = kernel_tex_image_interp(__tex_image_byte4_249, x, y); break;
+ case 257: r = kernel_tex_image_interp(__tex_image_byte4_257, x, y); break;
+ case 265: r = kernel_tex_image_interp(__tex_image_byte4_265, x, y); break;
+ case 273: r = kernel_tex_image_interp(__tex_image_byte4_273, x, y); break;
+ case 281: r = kernel_tex_image_interp(__tex_image_byte4_281, x, y); break;
+ case 289: r = kernel_tex_image_interp(__tex_image_byte4_289, x, y); break;
+ case 297: r = kernel_tex_image_interp(__tex_image_byte4_297, x, y); break;
+ case 305: r = kernel_tex_image_interp(__tex_image_byte4_305, x, y); break;
+ case 313: r = kernel_tex_image_interp(__tex_image_byte4_313, x, y); break;
+ case 321: r = kernel_tex_image_interp(__tex_image_byte4_321, x, y); break;
+ case 329: r = kernel_tex_image_interp(__tex_image_byte4_329, x, y); break;
+ case 337: r = kernel_tex_image_interp(__tex_image_byte4_337, x, y); break;
+ case 345: r = kernel_tex_image_interp(__tex_image_byte4_345, x, y); break;
+ case 353: r = kernel_tex_image_interp(__tex_image_byte4_353, x, y); break;
+ case 361: r = kernel_tex_image_interp(__tex_image_byte4_361, x, y); break;
+ case 369: r = kernel_tex_image_interp(__tex_image_byte4_369, x, y); break;
+ case 377: r = kernel_tex_image_interp(__tex_image_byte4_377, x, y); break;
+ case 385: r = kernel_tex_image_interp(__tex_image_byte4_385, x, y); break;
+ case 393: r = kernel_tex_image_interp(__tex_image_byte4_393, x, y); break;
+ case 401: r = kernel_tex_image_interp(__tex_image_byte4_401, x, y); break;
+ case 409: r = kernel_tex_image_interp(__tex_image_byte4_409, x, y); break;
+ case 417: r = kernel_tex_image_interp(__tex_image_byte4_417, x, y); break;
+ case 425: r = kernel_tex_image_interp(__tex_image_byte4_425, x, y); break;
+ case 433: r = kernel_tex_image_interp(__tex_image_byte4_433, x, y); break;
+ case 441: r = kernel_tex_image_interp(__tex_image_byte4_441, x, y); break;
+ case 449: r = kernel_tex_image_interp(__tex_image_byte4_449, x, y); break;
+ case 457: r = kernel_tex_image_interp(__tex_image_byte4_457, x, y); break;
+ case 465: r = kernel_tex_image_interp(__tex_image_byte4_465, x, y); break;
+ case 473: r = kernel_tex_image_interp(__tex_image_byte4_473, x, y); break;
+ case 481: r = kernel_tex_image_interp(__tex_image_byte4_481, x, y); break;
+ case 489: r = kernel_tex_image_interp(__tex_image_byte4_489, x, y); break;
+ case 497: r = kernel_tex_image_interp(__tex_image_byte4_497, x, y); break;
+ case 505: r = kernel_tex_image_interp(__tex_image_byte4_505, x, y); break;
+ case 513: r = kernel_tex_image_interp(__tex_image_byte4_513, x, y); break;
+ case 521: r = kernel_tex_image_interp(__tex_image_byte4_521, x, y); break;
+ case 529: r = kernel_tex_image_interp(__tex_image_byte4_529, x, y); break;
+ case 537: r = kernel_tex_image_interp(__tex_image_byte4_537, x, y); break;
+ case 545: r = kernel_tex_image_interp(__tex_image_byte4_545, x, y); break;
+ case 553: r = kernel_tex_image_interp(__tex_image_byte4_553, x, y); break;
+ case 561: r = kernel_tex_image_interp(__tex_image_byte4_561, x, y); break;
+ case 569: r = kernel_tex_image_interp(__tex_image_byte4_569, x, y); break;
+ case 577: r = kernel_tex_image_interp(__tex_image_byte4_577, x, y); break;
+ case 585: r = kernel_tex_image_interp(__tex_image_byte4_585, x, y); break;
+ case 593: r = kernel_tex_image_interp(__tex_image_byte4_593, x, y); break;
+ case 601: r = kernel_tex_image_interp(__tex_image_byte4_601, x, y); break;
+ case 609: r = kernel_tex_image_interp(__tex_image_byte4_609, x, y); break;
+ case 617: r = kernel_tex_image_interp(__tex_image_byte4_617, x, y); break;
+ case 625: r = kernel_tex_image_interp(__tex_image_byte4_625, x, y); break;
+ case 633: r = kernel_tex_image_interp(__tex_image_byte4_633, x, y); break;
+ case 641: r = kernel_tex_image_interp(__tex_image_byte4_641, x, y); break;
+ case 649: r = kernel_tex_image_interp(__tex_image_byte4_649, x, y); break;
+ case 657: r = kernel_tex_image_interp(__tex_image_byte4_657, x, y); break;
+ case 665: r = kernel_tex_image_interp(__tex_image_byte4_665, x, y); break;
default:
kernel_assert(0);
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -224,6 +211,8 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
object_inverse_normal_transform(kg, sd, &N);
/* project from direction vector to barycentric coordinates in triangles */
+ float3 signed_N = N;
+
N.x = fabsf(N.x);
N.y = fabsf(N.y);
N.z = fabsf(N.z);
@@ -293,12 +282,19 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
float4 f = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
uint use_alpha = stack_valid(alpha_offset);
- if(weight.x > 0.0f)
- f += weight.x*svm_image_texture(kg, id, co.y, co.z, srgb, use_alpha);
- if(weight.y > 0.0f)
- f += weight.y*svm_image_texture(kg, id, co.x, co.z, srgb, use_alpha);
- if(weight.z > 0.0f)
- f += weight.z*svm_image_texture(kg, id, co.y, co.x, srgb, use_alpha);
+ /* Map so that no textures are flipped, rotation is somewhat arbitrary. */
+ if(weight.x > 0.0f) {
+ float2 uv = make_float2((signed_N.x < 0.0f)? 1.0f - co.y: co.y, co.z);
+ f += weight.x*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+ }
+ if(weight.y > 0.0f) {
+ float2 uv = make_float2((signed_N.y > 0.0f)? 1.0f - co.x: co.x, co.z);
+ f += weight.y*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+ }
+ if(weight.z > 0.0f) {
+ float2 uv = make_float2((signed_N.z > 0.0f)? 1.0f - co.y: co.y, co.x);
+ f += weight.z*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+ }
if(stack_valid(out_offset))
stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));