diff options
author | Darshan Kadu <darsh7807@gmail.com> | 2017-09-10 15:41:40 +0300 |
---|---|---|
committer | Darshan Kadu <darsh7807@gmail.com> | 2017-09-10 15:41:40 +0300 |
commit | 6594fa1ce02809a275c9cd488fa0223d03d73571 (patch) | |
tree | 0bcd95846e1e3b09239126b40ef434ed3dc3a50d /intern/cycles/kernel | |
parent | f2017083a19e5c83aadc575625dce0642ffce6c5 (diff) |
merged the master branch (soc-2017-vertex_paint)
Diffstat (limited to 'intern/cycles/kernel')
110 files changed, 3091 insertions, 2648 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index bef869f34b4..b4ca16bdb48 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -45,6 +45,7 @@ set(SRC kernels/opencl/kernel_direct_lighting.cl kernels/opencl/kernel_shadow_blocked_ao.cl kernels/opencl/kernel_shadow_blocked_dl.cl + kernels/opencl/kernel_enqueue_inactive.cl kernels/opencl/kernel_next_iteration_setup.cl kernels/opencl/kernel_indirect_subsurface.cl kernels/opencl/kernel_buffer_update.cl @@ -78,7 +79,6 @@ set(SRC_HEADERS kernel_compat_cpu.h kernel_compat_cuda.h kernel_compat_opencl.h - kernel_debug.h kernel_differential.h kernel_emission.h kernel_film.h @@ -121,6 +121,10 @@ set(SRC_KERNELS_CUDA_HEADERS kernels/cuda/kernel_config.h ) +set(SRC_KERNELS_OPENCL_HEADERS + kernels/opencl/kernel_split_function.h +) + set(SRC_CLOSURE_HEADERS closure/alloc.h closure/bsdf.h @@ -197,6 +201,7 @@ set(SRC_GEOM_HEADERS geom/geom.h geom/geom_attribute.h geom/geom_curve.h + geom/geom_curve_intersect.h geom/geom_motion_curve.h geom/geom_motion_triangle.h geom/geom_motion_triangle_intersect.h @@ -228,6 +233,7 @@ set(SRC_FILTER_HEADERS set(SRC_UTIL_HEADERS ../util/util_atomic.h ../util/util_color.h + ../util/util_defines.h ../util/util_half.h ../util/util_hash.h ../util/util_math.h @@ -278,6 +284,7 @@ set(SRC_SPLIT_HEADERS split/kernel_data_init.h split/kernel_direct_lighting.h split/kernel_do_volume.h + split/kernel_enqueue_inactive.h split/kernel_holdout_emission_blurring_pathtermination_ao.h split/kernel_indirect_background.h split/kernel_indirect_subsurface.h @@ -450,6 +457,7 @@ add_library(cycles_kernel ${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_KERNELS_OPENCL_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_FILTER_HEADERS} @@ -490,9 +498,11 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_sc delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_dl.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_enqueue_inactive.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split_function.h" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index 85741016b25..cf0c8542d69 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -233,7 +233,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg, ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, - int skip_object, + uint visibility, uint max_hits, uint *num_hits) { @@ -244,7 +244,7 @@ ccl_device_intersect bool 
scene_intersect_shadow_all(KernelGlobals *kg, return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, - skip_object, + visibility, max_hits, num_hits); } @@ -253,7 +253,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, return bvh_intersect_shadow_all_motion(kg, ray, isect, - skip_object, + visibility, max_hits, num_hits); } @@ -264,7 +264,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, return bvh_intersect_shadow_all_hair(kg, ray, isect, - skip_object, + visibility, max_hits, num_hits); } @@ -275,7 +275,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, return bvh_intersect_shadow_all_instancing(kg, ray, isect, - skip_object, + visibility, max_hits, num_hits); } @@ -284,7 +284,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, return bvh_intersect_shadow_all(kg, ray, isect, - skip_object, + visibility, max_hits, num_hits); } diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h index 74a9ebf14e4..6c33dad5426 100644 --- a/intern/cycles/kernel/bvh/bvh_nodes.h +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -52,8 +52,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, float c0hiy = (node1.z - P.y) * idir.y; float c0loz = (node2.x - P.z) * idir.z; float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); float c1lox = (node0.y - P.x) * idir.x; float c1hix = (node0.w - P.x) * idir.x; @@ -61,8 +61,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, float c1hiy = (node1.w - P.y) * idir.y; float c1loz = (node2.y - P.z) * idir.z; float c1hiz = (node2.w - P.z) * idir.z; - 
float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); dist[0] = c0min; dist[1] = c1min; @@ -101,8 +101,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, float c0hiy = (node1.z - P.y) * idir.y; float c0loz = (node2.x - P.z) * idir.z; float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); float c1lox = (node0.y - P.x) * idir.x; float c1hix = (node0.w - P.x) * idir.x; @@ -110,8 +110,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, float c1hiy = (node1.w - P.y) * idir.y; float c1loz = (node2.y - P.z) * idir.z; float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); if(difl != 0.0f) { float hdiff = 1.0f + difl; @@ -483,8 +483,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, ssef tfar_y = max(lower_y, upper_y); ssef tfar_z = max(lower_z, upper_z); - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, 
tfar_x, tfar_y, tfar_z); sseb vmask = tnear <= tfar; dist[0] = tnear.f[0]; dist[1] = tnear.f[1]; @@ -545,8 +545,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg ssef tfar_y = max(lower_y, upper_y); ssef tfar_z = max(lower_z, upper_z); - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); sseb vmask; if(difl != 0.0f) { const float round_down = 1.0f - difl; @@ -615,7 +615,7 @@ ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, const float3& P, const float3& dir, const ssef& isect_near, - const ssef& isect_far, + const ssef& isect_far, const ssef& tsplat, const ssef Psplat[3], const ssef idirsplat[3], diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index 267e098f912..a6a4353562c 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -45,7 +45,7 @@ ccl_device_inline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, - const int skip_object, + const uint visibility, const uint max_hits, uint *num_hits) { @@ -119,7 +119,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, idir, isect_t, node_addr, - PATH_RAY_SHADOW, + visibility, dist); #else // __KERNEL_SSE2__ traverse_mask = NODE_INTERSECT(kg, @@ -134,7 +134,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, idirsplat, shufflexyz, node_addr, - PATH_RAY_SHADOW, + visibility, dist); #endif // __KERNEL_SSE2__ @@ -186,17 +186,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* primitive intersection */ while(prim_addr < prim_addr2) { kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); - -#ifdef __SHADOW_TRICKS__ - uint tri_object = (object == OBJECT_NONE) - ? 
kernel_tex_fetch(__prim_object, prim_addr) - : object; - if(tri_object == skip_object) { - ++prim_addr; - continue; - } -#endif - bool hit; /* todo: specialized intersect functions which don't fill in @@ -209,7 +198,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, isect_array, P, dir, - PATH_RAY_SHADOW, + visibility, object, prim_addr); break; @@ -221,7 +210,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, P, dir, ray->time, - PATH_RAY_SHADOW, + visibility, object, prim_addr); break; @@ -232,30 +221,30 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, case PRIMITIVE_MOTION_CURVE: { const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); + hit = cardinal_curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } else { - hit = bvh_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); + hit = curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } break; } @@ -402,7 +391,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, - const int skip_object, + const uint visibility, const uint max_hits, uint *num_hits) { @@ -411,7 +400,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect_array, - skip_object, + visibility, max_hits, num_hits); } @@ -422,7 +411,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, - skip_object, + visibility, max_hits, num_hits); } diff --git 
a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index c58d3b0316c..ae8f54821f2 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -244,14 +244,14 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, { /* shadow ray early termination */ #if defined(__KERNEL_SSE2__) - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) tfar = ssef(isect->t); # endif #else - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; #endif } @@ -274,14 +274,14 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, { /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) tfar = ssef(isect->t); # endif # else - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; # endif } @@ -298,44 +298,44 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); bool hit; if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); + hit = cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } else { - hit = bvh_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); + hit = curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + 
lcg_state, + difl, + extmax); } if(hit) { /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) tfar = ssef(isect->t); # endif # else - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; # endif } diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h index 6d22f0b0d6a..3036efd4198 100644 --- a/intern/cycles/kernel/bvh/qbvh_nodes.h +++ b/intern/cycles/kernel/bvh/qbvh_nodes.h @@ -126,8 +126,8 @@ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg const sseb vmask = cast(tnear) > cast(tfar); int mask = (int)movemask(vmask)^0xf; #else - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = tnear <= tfar; int mask = (int)movemask(vmask); #endif @@ -174,8 +174,8 @@ ccl_device_inline int qbvh_aligned_node_intersect_robust( const float round_down = 1.0f - difl; const float round_up = 1.0f + difl; - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = round_down*tnear <= round_up*tfar; *dist = tnear; return (int)movemask(vmask); diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h index ce474438f2c..522213f30ca 100644 --- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h @@ -33,7 +33,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, 
Intersection *isect_array, - const int skip_object, + const uint visibility, const uint max_hits, uint *num_hits) { @@ -107,7 +107,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(false #ifdef __VISIBILITY_FLAG__ - || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) + || ((__float_as_uint(inodes.x) & visibility) == 0) #endif #if BVH_FEATURE(BVH_MOTION) || UNLIKELY(ray->time < inodes.y) @@ -244,7 +244,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(node_addr < 0) { float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); #ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { + if((__float_as_uint(leaf.z) & visibility) == 0) { /* Pop. */ node_addr = traversal_stack[stack_ptr].addr; --stack_ptr; @@ -268,17 +268,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Primitive intersection. */ while(prim_addr < prim_addr2) { kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); - -#ifdef __SHADOW_TRICKS__ - uint tri_object = (object == OBJECT_NONE) - ? 
kernel_tex_fetch(__prim_object, prim_addr) - : object; - if(tri_object == skip_object) { - ++prim_addr; - continue; - } -#endif - bool hit; /* todo: specialized intersect functions which don't fill in @@ -291,7 +280,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, isect_array, P, dir, - PATH_RAY_SHADOW, + visibility, object, prim_addr); break; @@ -303,7 +292,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, P, dir, ray->time, - PATH_RAY_SHADOW, + visibility, object, prim_addr); break; @@ -314,30 +303,30 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, case PRIMITIVE_MOTION_CURVE: { const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); + hit = cardinal_curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } else { - hit = bvh_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); + hit = curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } break; } diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h index fca75a1d416..335a4afd47a 100644 --- a/intern/cycles/kernel/bvh/qbvh_traversal.h +++ b/intern/cycles/kernel/bvh/qbvh_traversal.h @@ -340,7 +340,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, prim_addr)) { tfar = ssef(isect->t); /* Shadow ray early termination. 
*/ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { + if(visibility & PATH_RAY_SHADOW_OPAQUE) { return true; } } @@ -362,7 +362,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, prim_addr)) { tfar = ssef(isect->t); /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { + if(visibility & PATH_RAY_SHADOW_OPAQUE) { return true; } } @@ -379,37 +379,37 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); bool hit; if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); + hit = cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } else { - hit = bvh_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); + hit = curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } if(hit) { tfar = ssef(isect->t); /* Shadow ray early termination. 
*/ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { + if(visibility & PATH_RAY_SHADOW_OPAQUE) { return true; } } diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index a04c157dc40..86a00d2124d 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -423,6 +423,11 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: return bsdf_hair_merge(a, b); +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + return bsdf_principled_diffuse_merge(a, b); +#endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: return volume_henyey_greenstein_merge(a, b); diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index 30cc8b90330..b12e248f0a3 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -288,12 +288,16 @@ ccl_device int bsdf_microfacet_ggx_setup(MicrofacetBsdf *bsdf) return SD_BSDF|SD_BSDF_HAS_EVAL; } -ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf) +ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) { bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = bsdf->alpha_x; @@ -302,12 +306,16 @@ ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf) return SD_BSDF|SD_BSDF_HAS_EVAL; } -ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf) +ccl_device int 
bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) { bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= 0.25f * bsdf->extra->clearcoat * F; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = bsdf->alpha_x; @@ -343,12 +351,16 @@ ccl_device int bsdf_microfacet_ggx_aniso_setup(MicrofacetBsdf *bsdf) return SD_BSDF|SD_BSDF_HAS_EVAL; } -ccl_device int bsdf_microfacet_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf) +ccl_device int bsdf_microfacet_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) { bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = saturate(bsdf->alpha_y); diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h index b07b515c405..2f2c35d5d1f 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h @@ -40,20 +40,20 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha) } /* Sample slope distribution (based on page 14 of the supplemental implementation). 
*/ -ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU) +ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float randx, const float randy) { if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) { - const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f)); - const float phi = M_2PI_F * randU.y; + const float r = sqrtf(randx / max(1.0f - randx, 1e-7f)); + const float phi = M_2PI_F * randy; return make_float2(r*cosf(phi), r*sinf(phi)); } - const float sinI = sqrtf(1.0f - cosI*cosI); + const float sinI = safe_sqrtf(1.0f - cosI*cosI); const float tanI = sinI/cosI; const float projA = 0.5f * (cosI + 1.0f); if(projA < 0.0001f) return make_float2(0.0f, 0.0f); - const float A = 2.0f*randU.x*projA / cosI - 1.0f; + const float A = 2.0f*randx*projA / cosI - 1.0f; float tmp = A*A-1.0f; if(fabsf(tmp) < 1e-7f) return make_float2(0.0f, 0.0f); @@ -64,24 +64,24 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 ran const float slopeX = (A < 0.0f || slopeX2 > 1.0f/tanI)? (tanI*tmp - D) : slopeX2; float U2; - if(randU.y >= 0.5f) - U2 = 2.0f*(randU.y - 0.5f); + if(randy >= 0.5f) + U2 = 2.0f*(randy - 0.5f); else - U2 = 2.0f*(0.5f - randU.y); + U2 = 2.0f*(0.5f - randy); const float z = (U2*(U2*(U2*0.27385f-0.73369f)+0.46341f)) / (U2*(U2*(U2*0.093073f+0.309420f)-1.0f)+0.597999f); const float slopeY = z * sqrtf(1.0f + slopeX*slopeX); - if(randU.y >= 0.5f) + if(randy >= 0.5f) return make_float2(slopeX, slopeY); else return make_float2(slopeX, -slopeY); } /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). 
*/ -ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU) +ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float randx, const float randy) { const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z)); - const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU); + const float2 slope_11 = mf_sampleP22_11(wi_11.z, randx, randy); const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f)); const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y); @@ -245,35 +245,69 @@ ccl_device_forceinline float mf_ggx_albedo(float r) return saturate(albedo); } +ccl_device_inline float mf_ggx_transmission_albedo(float a, float ior) +{ + if(ior < 1.0f) { + ior = 1.0f/ior; + } + a = saturate(a); + ior = clamp(ior, 1.0f, 3.0f); + float I_1 = 0.0476898f*expf(-0.978352f*(ior-0.65657f)*(ior-0.65657f)) - 0.033756f*ior + 0.993261f; + float R_1 = (((0.116991f*a - 0.270369f)*a + 0.0501366f)*a - 0.00411511f)*a + 1.00008f; + float I_2 = (((-2.08704f*ior + 26.3298f)*ior - 127.906f)*ior + 292.958f)*ior - 287.946f + 199.803f/(ior*ior) - 101.668f/(ior*ior*ior); + float R_2 = ((((5.3725f*a -24.9307f)*a + 22.7437f)*a - 3.40751f)*a + 0.0986325f)*a + 0.00493504f; + + return saturate(1.0f + I_2*R_2*0.0019127f - (1.0f - I_1)*(1.0f - R_1)*9.3205f); +} + ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha) { float D = D_ggx(normalize(wi+wo), alpha); float lambda = mf_lambda(wi, make_float2(alpha, alpha)); + float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f); + + float multiscatter = wo.z * M_1_PI_F; + float albedo = mf_ggx_albedo(alpha); - return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z; + return albedo*singlescatter + (1.0f - albedo)*multiscatter; } ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha) { - return 0.25f * 
D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z; + float D = D_ggx_aniso(normalize(wi+wo), alpha); + float lambda = mf_lambda(wi, alpha); + float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f); + + float multiscatter = wo.z * M_1_PI_F; + + float albedo = mf_ggx_albedo(sqrtf(alpha.x*alpha.y)); + return albedo*singlescatter + (1.0f - albedo)*multiscatter; } ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta) { - float3 wh; - float fresnel; - if(wi.z*wo.z > 0.0f) { - wh = normalize(wi + wo); - fresnel = fresnel_dielectric_cos(dot(wi, wh), eta); - } - else { - wh = normalize(wi + wo*eta); - fresnel = 1.0f - fresnel_dielectric_cos(dot(wi, wh), eta); - } + bool reflective = (wi.z*wo.z > 0.0f); + + float wh_len; + float3 wh = normalize_len(wi + (reflective? wo : (wo*eta)), &wh_len); if(wh.z < 0.0f) wh = -wh; float3 r_wi = (wi.z < 0.0f)? -wi: wi; - return fresnel * max(0.0f, dot(r_wi, wh)) * D_ggx(wh, alpha) / ((1.0f + mf_lambda(r_wi, make_float2(alpha, alpha))) * r_wi.z) + fabsf(wo.z); + float lambda = mf_lambda(r_wi, make_float2(alpha, alpha)); + float D = D_ggx(wh, alpha); + float fresnel = fresnel_dielectric_cos(dot(r_wi, wh), eta); + + float multiscatter = fabsf(wo.z * M_1_PI_F); + if(reflective) { + float singlescatter = 0.25f * D / max((1.0f + lambda) * r_wi.z, 1e-7f); + float albedo = mf_ggx_albedo(alpha); + return fresnel * (albedo*singlescatter + (1.0f - albedo)*multiscatter); + } + else { + float singlescatter = fabsf(dot(r_wi, wh)*dot(wo, wh) * D * eta*eta / max((1.0f + lambda) * r_wi.z * wh_len*wh_len, 1e-7f)); + float albedo = mf_ggx_transmission_albedo(alpha, eta); + return (1.0f - fresnel) * (albedo*singlescatter + (1.0f - albedo)*multiscatter); + } } /* === Actual random walk implementations, one version of mf_eval and mf_sample per phase function. 
=== */ @@ -326,13 +360,17 @@ ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(MicrofacetBsdf *bsdf) return bsdf_microfacet_multi_ggx_common_setup(bsdf); } -ccl_device int bsdf_microfacet_multi_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf) +ccl_device int bsdf_microfacet_multi_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) { if(is_zero(bsdf->T)) bsdf->T = make_float3(1.0f, 0.0f, 0.0f); bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID; + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + return bsdf_microfacet_multi_ggx_common_setup(bsdf); } @@ -345,12 +383,16 @@ ccl_device int bsdf_microfacet_multi_ggx_setup(MicrofacetBsdf *bsdf) return bsdf_microfacet_multi_ggx_common_setup(bsdf); } -ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf) +ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) { bsdf->alpha_y = bsdf->alpha_x; bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID; + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + return bsdf_microfacet_multi_ggx_common_setup(bsdf); } @@ -432,6 +474,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC *eval *= *pdf; *omega_in = X*localO.x + Y*localO.y + Z*localO.z; + #ifdef __RAY_DIFFERENTIALS__ *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx; *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy; @@ -455,7 +498,7 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf) return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; } -ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsdf) +ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) { 
bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f); bsdf->alpha_y = bsdf->alpha_x; @@ -469,6 +512,10 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsd bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID; + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; } diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h index 2eb2457c9e5..e73915dbda7 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h @@ -100,11 +100,14 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( bool outside = true; for(int order = 0; order < 10; order++) { - /* Sample microfacet height and normal */ - if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) + /* Sample microfacet height. */ + float height_rand = lcg_step_float_addrspace(lcg_state); + if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) break; - float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state))); + /* Sample microfacet normal. */ + float vndf_rand_y = lcg_step_float_addrspace(lcg_state); + float vndf_rand_x = lcg_step_float_addrspace(lcg_state); + float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y); #ifdef MF_MULTI_GLASS if(order == 0 && use_fresnel) { @@ -136,7 +139,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( #ifdef MF_MULTI_GLASS bool next_outside; float3 wi_prev = -wr; - wr = mf_sample_phase_glass(-wr, outside? 
eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside); + float phase_rand = lcg_step_float_addrspace(lcg_state); + wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside); if(!next_outside) { outside = !outside; wr = -wr; @@ -204,14 +208,16 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)( int order; for(order = 0; order < 10; order++) { /* Sample microfacet height. */ - if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) { + float height_rand = lcg_step_float_addrspace(lcg_state); + if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) { /* The random walk has left the surface. */ *wo = outside? wr: -wr; return throughput; } /* Sample microfacet normal. */ - float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state))); + float vndf_rand_y = lcg_step_float_addrspace(lcg_state); + float vndf_rand_x = lcg_step_float_addrspace(lcg_state); + float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y); /* First-bounce color is already accounted for in mix weight. */ if(!use_fresnel && order > 0) @@ -221,7 +227,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)( #ifdef MF_MULTI_GLASS bool next_outside; float3 wi_prev = -wr; - wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside); + float phase_rand = lcg_step_float_addrspace(lcg_state); + wr = mf_sample_phase_glass(-wr, outside? 
eta: 1.0f/eta, wm, phase_rand, &next_outside); if(!next_outside) { hr = -hr; wr = -wr; diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h index 215c32e1ffb..f8ca64293b0 100644 --- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h @@ -58,6 +58,14 @@ ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf) return SD_BSDF|SD_BSDF_HAS_EVAL; } +ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b) +{ + const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf*)a; + const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf*)b; + + return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness); +} + ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) { diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h index 53d703de143..6226ed2c2ef 100644 --- a/intern/cycles/kernel/filter/filter_features.h +++ b/intern/cycles/kernel/filter/filter_features.h @@ -78,16 +78,10 @@ ccl_device_inline void filter_calculate_scale(float *scale) scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); } -ccl_device_inline float3 filter_get_pixel_color(const ccl_global float *ccl_restrict buffer, - int pass_stride) +ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer, + int pass_stride) { - return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)); -} - -ccl_device_inline float filter_get_pixel_variance(const ccl_global float *ccl_restrict buffer, - int pass_stride) -{ - return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2))); + return make_float3(ccl_get_feature(buffer, 8), 
ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10)); } ccl_device_inline void design_row_add(float *design_row, diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h index 3185330994c..3ddd8712266 100644 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride) +#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride) /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. * pixel_buffer always points to the first of the 4 current pixel in the first pass. @@ -24,25 +24,25 @@ CCL_NAMESPACE_BEGIN #define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - __m128 y4 = _mm_set1_ps(pixel.y); \ + float4 y4 = make_float4(pixel.y); \ for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ - __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \ - __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x)); + float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ + int4 active_pixels = x4 < make_float4(high.x); #define END_FOR_PIXEL_WINDOW_SSE } \ pixel_buffer += buffer_w - (pixel.x - low.x); \ } -ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, - __m128 active_pixels, +ccl_device_inline void filter_get_features_sse(float4 x, float4 y, + int4 active_pixels, const float *ccl_restrict buffer, - __m128 *features, - const __m128 *ccl_restrict mean, + float4 *features, + const float4 *ccl_restrict mean, int pass_stride) { features[0] = x; features[1] = y; - features[2] = _mm_fabs_ps(ccl_get_feature_sse(0)); + features[2] = fabs(ccl_get_feature_sse(0)); features[3] = ccl_get_feature_sse(1); features[4] = 
ccl_get_feature_sse(2); features[5] = ccl_get_feature_sse(3); @@ -52,53 +52,41 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, features[9] = ccl_get_feature_sse(7); if(mean) { for(int i = 0; i < DENOISE_FEATURES; i++) - features[i] = _mm_sub_ps(features[i], mean[i]); + features[i] = features[i] - mean[i]; } for(int i = 0; i < DENOISE_FEATURES; i++) - features[i] = _mm_mask_ps(features[i], active_pixels); + features[i] = mask(active_pixels, features[i]); } -ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, - __m128 active_pixels, +ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, + int4 active_pixels, const float *ccl_restrict buffer, - __m128 *scales, - const __m128 *ccl_restrict mean, + float4 *scales, + const float4 *ccl_restrict mean, int pass_stride) { - scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); - scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); - - scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels); - - __m128 diff, scale; - diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]); - scale = _mm_mul_ps(diff, diff); - diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - scales[3] = _mm_mask_ps(scale, active_pixels); - - scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels); - - diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]); - scale = _mm_mul_ps(diff, diff); - diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - scales[5] = _mm_mask_ps(scale, active_pixels); + scales[0] = fabs(x - mean[0]); + scales[1] = fabs(y - 
mean[1]); + scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); + scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + + sqr(ccl_get_feature_sse(2) - mean[4]) + + sqr(ccl_get_feature_sse(3) - mean[5]); + scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); + scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + + sqr(ccl_get_feature_sse(6) - mean[8]) + + sqr(ccl_get_feature_sse(7) - mean[9]); + for(int i = 0; i < 6; i++) + scales[i] = mask(active_pixels, scales[i]); } -ccl_device_inline void filter_calculate_scale_sse(__m128 *scale) +ccl_device_inline void filter_calculate_scale_sse(float4 *scale) { - scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); - scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f))); - scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); - scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); - - scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); - scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); + scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); + scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); + scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); + scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); + scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); + scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); } diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index 5cb4038bc33..5e989331bc2 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -50,10 +50,8 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen int w, int f) { -#ifdef __KERNEL_SSE3__ - int 
aligned_lowx = (rect.x & ~(3)); - int aligned_highx = ((rect.z + 3) & ~(3)); -#endif + int aligned_lowx = rect.x / 4; + int aligned_highx = (rect.z + 3) / 4; for(int y = rect.y; y < rect.w; y++) { const int low = max(rect.y, y-f); const int high = min(rect.w, y+f+1); @@ -61,15 +59,11 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen out_image[y*w+x] = 0.0f; } for(int y1 = low; y1 < high; y1++) { -#ifdef __KERNEL_SSE3__ - for(int x = aligned_lowx; x < aligned_highx; x+=4) { - _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x))); + float4* out_image4 = (float4*)(out_image + y*w); + float4* difference_image4 = (float4*)(difference_image + y1*w); + for(int x = aligned_lowx; x < aligned_highx; x++) { + out_image4[x] += difference_image4[x]; } -#else - for(int x = rect.x; x < rect.z; x++) { - out_image[y*w+x] += difference_image[y1*w+x]; - } -#endif } for(int x = rect.x; x < rect.z; x++) { out_image[y*w+x] *= 1.0f/(high - low); @@ -101,7 +95,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d for(int x = rect.x; x < rect.z; x++) { const int low = max(rect.x, x-f); const int high = min(rect.z, x+f+1); - out_image[y*w+x] = expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f)); + out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f)); } } } @@ -133,8 +127,6 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, const float *ccl_restrict difference_image, const float *ccl_restrict buffer, - float *color_pass, - float *variance_pass, float *transform, int *rank, float *XtWX, @@ -167,7 +159,6 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, dx, dy, w, h, pass_stride, buffer, - color_pass, variance_pass, l_transform, l_rank, weight, l_XtWX, l_XtWY, 0); } diff --git 
a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h index 078c5f56763..2c5ac807051 100644 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -66,7 +66,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, sum += difference_image[y*w+x1]; } sum *= 1.0f/(high-low); - out_image[y*w+x] = expf(-max(sum, 0.0f)); + out_image[y*w+x] = fast_expf(-max(sum, 0.0f)); } ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, @@ -97,8 +97,6 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, int dx, int dy, const ccl_global float *ccl_restrict difference_image, const ccl_global float *ccl_restrict buffer, - ccl_global float *color_pass, - ccl_global float *variance_pass, const ccl_global float *ccl_restrict transform, ccl_global int *rank, ccl_global float *XtWX, @@ -130,7 +128,6 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, dx, dy, w, h, pass_stride, buffer, - color_pass, variance_pass, transform, rank, weight, XtWX, XtWY, localIdx); diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h index 82cc36625ec..2aeb54a62be 100644 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -61,8 +61,8 @@ ccl_device void kernel_filter_divide_shadow(int sample, varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample); varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample); } - varA /= (odd_sample - 1); - varB /= (even_sample - 1); + varA /= max(odd_sample - 1, 1); + varB /= max(even_sample - 1, 1); sampleVariance[idx] = 0.5f*(varA + varB) / sample; sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample); @@ -96,11 +96,17 @@ ccl_device void kernel_filter_get_feature(int sample, int idx = (y-rect.y)*buffer_w + (x - rect.x); mean[idx] = 
center_buffer[m_offset] / sample; - if(use_split_variance) { - variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); + if(sample > 1) { + if(use_split_variance) { + variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); + } + else { + variance[idx] = center_buffer[v_offset] / (sample * (sample-1)); + } } else { - variance[idx] = center_buffer[v_offset] / (sample * (sample-1)); + /* Can't compute variance with single sample, just set it very high. */ + variance[idx] = 1e10f; } } @@ -114,41 +120,57 @@ ccl_device void kernel_filter_detect_outliers(int x, int y, { int buffer_w = align_up(rect.z - rect.x, 4); - int n = 0; - float values[25]; - for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { - for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) { - int idx = (y1-rect.y)*buffer_w + (x1-rect.x); - float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); - - /* Find the position of L. */ - int i; - for(i = 0; i < n; i++) { - if(values[i] > L) break; - } - /* Make space for L by shifting all following values to the right. */ - for(int j = n; j > i; j--) { - values[j] = values[j-1]; - } - /* Insert L. */ - values[i] = L; - n++; - } - } - int idx = (y-rect.y)*buffer_w + (x-rect.x); - float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); + float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]); - float ref = 2.0f*values[(int)(n*0.75f)]; float fac = 1.0f; - if(L > ref) { - /* If the pixel is an outlier, negate the depth value to mark it as one. - * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. 
*/ + if(color.x < 0.0f || color.y < 0.0f || color.z < 0.0f) { depth[idx] = -depth[idx]; - fac = ref/L; - variance[idx ] *= fac*fac; - variance[idx + pass_stride] *= fac*fac; - variance[idx+2*pass_stride] *= fac*fac; + fac = 0.0f; + } + else { + float L = average(color); + int n = 0; + float values[25]; + for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { + for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) { + int idx = (y1-rect.y)*buffer_w + (x1-rect.x); + float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); + + /* Find the position of L. */ + int i; + for(i = 0; i < n; i++) { + if(values[i] > L) break; + } + /* Make space for L by shifting all following values to the right. */ + for(int j = n; j > i; j--) { + values[j] = values[j-1]; + } + /* Insert L. */ + values[i] = L; + n++; + } + } + + float ref = 2.0f*values[(int)(n*0.75f)]; + if(L > ref) { + /* The pixel appears to be an outlier. + * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel + * should actually be at the reference value: + * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier. + * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight. + */ + float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]))); + if(L - 3*stddev < ref) { + /* The pixel is an outlier, so negate the depth value to mark it as one. + * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. 
*/ + depth[idx] = -depth[idx]; + fac = ref/L; + variance[idx ] *= fac*fac; + variance[idx + pass_stride] *= fac*fac; + variance[idx+2*pass_stride] *= fac*fac; + } + } } out[idx ] = fac*image[idx]; out[idx + pass_stride] = fac*image[idx + pass_stride]; diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h index 4a4c81b7ba3..25a3025056c 100644 --- a/intern/cycles/kernel/filter/filter_reconstruction.h +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -22,8 +22,6 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, int w, int h, int pass_stride, const ccl_global float *ccl_restrict buffer, - ccl_global float *color_pass, - ccl_global float *variance_pass, const ccl_global float *ccl_restrict transform, ccl_global int *rank, float weight, @@ -31,38 +29,31 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, ccl_global float3 *XtWY, int localIdx) { + if(weight < 1e-3f) { + return; + } + int p_offset = y *w + x; int q_offset = (y+dy)*w + (x+dx); -#ifdef __KERNEL_CPU__ - const int stride = 1; - (void)storage_stride; - (void)localIdx; - float design_row[DENOISE_FEATURES+1]; -#elif defined(__KERNEL_CUDA__) +#ifdef __KERNEL_GPU__ const int stride = storage_stride; +#else + const int stride = 1; + (void) storage_stride; +#endif + +#ifdef __KERNEL_CUDA__ ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE]; ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1); #else - const int stride = storage_stride; float design_row[DENOISE_FEATURES+1]; #endif - float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride); - float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride); + float3 q_color = filter_get_color(buffer + q_offset, pass_stride); - float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride)); - float q_std_dev = 
sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride)); - - /* If the pixel was flagged as an outlier during prefiltering, skip it. - * Otherwise, perform the regular confidence interval test unless - * the center pixel is an outlier (in that case, using the confidence - * interval test could result in no pixels being used at all). */ - bool p_outlier = (ccl_get_feature(buffer + p_offset, 0) < 0.0f); - bool q_outlier = (ccl_get_feature(buffer + q_offset, 0) < 0.0f); - bool outside_of_interval = (average(fabs(p_color - q_color)) > 2.0f*(p_std_dev + q_std_dev + 1e-3f)); - - if(q_outlier || (!p_outlier && outside_of_interval)) { + /* If the pixel was flagged as an outlier during prefiltering, skip it. */ + if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) { return; } @@ -83,13 +74,19 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h, int4 buffer_params, int sample) { -#ifdef __KERNEL_CPU__ - const int stride = 1; - (void)storage_stride; -#else +#ifdef __KERNEL_GPU__ const int stride = storage_stride; +#else + const int stride = 1; + (void) storage_stride; #endif + if(XtWX[0] < 1e-3f) { + /* There is not enough information to determine a denoised result. + * As a fallback, keep the original value of the pixel. */ + return; + } + /* The weighted average of pixel colors (essentially, the NLM-filtered image). * In case the solution of the linear model fails due to numerical issues, * fall back to this value. */ @@ -102,6 +99,9 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h, final_color = mean_color; } + /* Clamp pixel value to positive values. 
*/ + final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f)); + ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; final_color *= sample; if(buffer_params.w) { @@ -114,6 +114,4 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h, combined_buffer[2] = final_color.z; } -#undef STORAGE_TYPE - CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h index 30dc2969b11..9e65f61664b 100644 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -24,7 +24,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff { int buffer_w = align_up(rect.z - rect.x, 4); - __m128 features[DENOISE_FEATURES]; + float4 features[DENOISE_FEATURES]; const float *ccl_restrict pixel_buffer; int2 pixel; @@ -34,19 +34,19 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff min(rect.w, y + radius + 1)); int num_pixels = (high.y - low.y) * (high.x - low.x); - __m128 feature_means[DENOISE_FEATURES]; + float4 feature_means[DENOISE_FEATURES]; math_vector_zero_sse(feature_means, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); math_vector_add_sse(feature_means, DENOISE_FEATURES, features); } END_FOR_PIXEL_WINDOW_SSE - __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels); + float4 pixel_scale = make_float4(1.0f / num_pixels); for(int i = 0; i < DENOISE_FEATURES; i++) { - feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale); + feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; } - __m128 feature_scale[DENOISE_FEATURES]; + float4 feature_scale[DENOISE_FEATURES]; math_vector_zero_sse(feature_scale, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, 
feature_means, pass_stride); @@ -55,12 +55,12 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff filter_calculate_scale_sse(feature_scale); - __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; + float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); - math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f)); + math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f)); } END_FOR_PIXEL_WINDOW_SSE float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; @@ -98,7 +98,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff /* Bake the feature scaling into the transformation matrix. */ for(int i = 0; i < DENOISE_FEATURES; i++) { - math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank); + math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank); } } diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h index c623e3490fd..f34b77ebc07 100644 --- a/intern/cycles/kernel/geom/geom.h +++ b/intern/cycles/kernel/geom/geom.h @@ -27,6 +27,7 @@ #include "kernel/geom/geom_motion_triangle_shader.h" #include "kernel/geom/geom_motion_curve.h" #include "kernel/geom/geom_curve.h" +#include "kernel/geom/geom_curve_intersect.h" #include "kernel/geom/geom_volume.h" #include "kernel/geom/geom_primitive.h" diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 5c3b0ee3c15..e35267f02bf 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -16,18 +16,13 @@ CCL_NAMESPACE_BEGIN /* Curve Primitive * - * Curve primitive for rendering 
hair and fur. These can be render as flat ribbons - * or curves with actual thickness. The curve can also be rendered as line segments - * rather than curves for better performance */ + * Curve primitive for rendering hair and fur. These can be render as flat + * ribbons or curves with actual thickness. The curve can also be rendered as + * line segments rather than curves for better performance. + */ #ifdef __HAIR__ -#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300) -# define ccl_device_curveintersect ccl_device -#else -# define ccl_device_curveintersect ccl_device_forceinline -#endif - /* Reading attributes on various curve elements */ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) @@ -151,7 +146,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd /* Curve tangent normal */ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd) -{ +{ float3 tgN = make_float3(0.0f,0.0f,0.0f); if(sd->type & PRIMITIVE_ALL_CURVE) { @@ -219,893 +214,6 @@ ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta, } } -#ifdef __KERNEL_SSE2__ -ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a) -{ - return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2])); -} -#endif - -#ifdef __KERNEL_SSE2__ -/* Pass P and dir by reference to aligned vector */ -ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, - const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) -#else -ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax) -#endif -{ - const bool is_curve_primitive = (type & 
PRIMITIVE_CURVE); - - if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { - const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); - if(time < prim_time.x || time > prim_time.y) { - return false; - } - } - - int segment = PRIMITIVE_UNPACK_SEGMENT(type); - float epsilon = 0.0f; - float r_st, r_en; - - int depth = kernel_data.curve.subdivisions; - int flags = kernel_data.curve.curveflags; - int prim = kernel_tex_fetch(__prim_index, curveAddr); - -#ifdef __KERNEL_SSE2__ - ssef vdir = load4f(dir); - ssef vcurve_coef[4]; - const float3 *curve_coef = (float3 *)vcurve_coef; - - { - ssef dtmp = vdir * vdir; - ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp)); - ssef rd_ss = load1f_first(1.0f) / d_ss; - - ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]); - int2 &v00 = (int2 &)v00vec; - - int k0 = v00.x + segment; - int k1 = k0 + 1; - int ka = max(k0 - 1, v00.x); - int kb = min(k1 + 1, v00.x + v00.y - 1); - -#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800) - avxf P_curve_0_1, P_curve_2_3; - if(is_curve_primitive) { - P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x); - P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x); - } - else { - int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object; - motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3); - } -#else /* __KERNEL_AVX2__ */ - ssef P_curve[4]; - - if(is_curve_primitive) { - P_curve[0] = load4f(&kg->__curve_keys.data[ka].x); - P_curve[1] = load4f(&kg->__curve_keys.data[k0].x); - P_curve[2] = load4f(&kg->__curve_keys.data[k1].x); - P_curve[3] = load4f(&kg->__curve_keys.data[kb].x); - } - else { - int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; - motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve); - } -#endif /* __KERNEL_AVX2__ */ - - ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss)); - ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn; - ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy; - ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); - ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); - - ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); - ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0); - ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); - -#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800) - const avxf vPP = _mm256_broadcast_ps(&P.m128); - const avxf htfm00 = avxf(htfm0.m128, htfm0.m128); - const avxf htfm11 = avxf(htfm1.m128, htfm1.m128); - const avxf htfm22 = avxf(htfm2.m128, htfm2.m128); - - const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP), - htfm00, - madd(shuffle<1>(P_curve_0_1 - vPP), - htfm11, - shuffle<2>(P_curve_0_1 - vPP) * htfm22)); - const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP), - htfm00, - madd(shuffle<1>(P_curve_2_3 - vPP), - htfm11, - shuffle<2>(P_curve_2_3 - vPP)*htfm22)); - - const ssef p0 = _mm256_castps256_ps128(p01); - const ssef p1 = _mm256_extractf128_ps(p01, 1); - const ssef p2 = _mm256_castps256_ps128(p23); - const ssef p3 = _mm256_extractf128_ps(p23, 1); - - const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1); - r_st = ((float4 &)P_curve_1).w; - const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3); - r_en = ((float4 &)P_curve_2).w; -#else /* __KERNEL_AVX2__ */ - ssef htfm[] = { htfm0, htfm1, htfm2 }; - ssef vP = load4f(P); - ssef p0 = transform_point_T3(htfm, P_curve[0] - vP); - ssef p1 = transform_point_T3(htfm, P_curve[1] - vP); - ssef p2 = transform_point_T3(htfm, P_curve[2] - vP); - ssef p3 = transform_point_T3(htfm, P_curve[3] - vP); - - r_st = 
((float4 &)P_curve[1]).w; - r_en = ((float4 &)P_curve[2]).w; -#endif /* __KERNEL_AVX2__ */ - - float fc = 0.71f; - ssef vfc = ssef(fc); - ssef vfcxp3 = vfc * p3; - - vcurve_coef[0] = p1; - vcurve_coef[1] = vfc * (p2 - p0); - vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3))); - vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3)); - - } -#else - float3 curve_coef[4]; - - /* curve Intersection check */ - /* obtain curve parameters */ - { - /* ray transform created - this should be created at beginning of intersection loop */ - Transform htfm; - float d = sqrtf(dir.x * dir.x + dir.z * dir.z); - htfm = make_transform( - dir.z / d, 0, -dir.x /d, 0, - -dir.x * dir.y /d, d, -dir.y * dir.z /d, 0, - dir.x, dir.y, dir.z, 0, - 0, 0, 0, 1); - - float4 v00 = kernel_tex_fetch(__curves, prim); - - int k0 = __float_as_int(v00.x) + segment; - int k1 = k0 + 1; - - int ka = max(k0 - 1,__float_as_int(v00.x)); - int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); - - float4 P_curve[4]; - - if(is_curve_primitive) { - P_curve[0] = kernel_tex_fetch(__curve_keys, ka); - P_curve[1] = kernel_tex_fetch(__curve_keys, k0); - P_curve[2] = kernel_tex_fetch(__curve_keys, k1); - P_curve[3] = kernel_tex_fetch(__curve_keys, kb); - } - else { - int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; - motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve); - } - - float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P); - float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P); - float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P); - float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P); - - float fc = 0.71f; - curve_coef[0] = p1; - curve_coef[1] = -fc*p0 + fc*p2; - curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3; - curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3; - r_st = P_curve[1].w; - r_en = P_curve[2].w; - } -#endif - - float r_curr = max(r_st, r_en); - - if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING)) - epsilon = 2 * r_curr; - - /* find bounds - this is slow for cubic curves */ - float upper, lower; - - float zextrem[4]; - curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z); - if(lower - r_curr > isect->t || upper + r_curr < epsilon) - return false; - - /* minimum width extension */ - float mw_extension = min(difl * fabsf(upper), extmax); - float r_ext = mw_extension + r_curr; - - float xextrem[4]; - curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x); - if(lower > r_ext || upper < -r_ext) - return false; - - float yextrem[4]; - curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y); - if(lower > r_ext || upper < -r_ext) - return false; - - /* setup recurrent loop */ - int level = 1 << depth; - int tree = 0; - float resol = 1.0f / (float)level; - bool hit = false; - - /* begin loop */ - while(!(tree >> (depth))) { - const float i_st = tree * resol; - const float i_en 
= i_st + (level * resol); - -#ifdef __KERNEL_SSE2__ - ssef vi_st = ssef(i_st), vi_en = ssef(i_en); - ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); - ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]); - - ssef vbmin = min(vp_st, vp_en); - ssef vbmax = max(vp_st, vp_en); - - float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax; - float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z; - float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z; - float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en; -#else - float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0]; - float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0]; - - float bminx = min(p_st.x, p_en.x); - float bmaxx = max(p_st.x, p_en.x); - float bminy = min(p_st.y, p_en.y); - float bmaxy = max(p_st.y, p_en.y); - float bminz = min(p_st.z, p_en.z); - float bmaxz = max(p_st.z, p_en.z); -#endif - - if(xextrem[0] >= i_st && xextrem[0] <= i_en) { - bminx = min(bminx,xextrem[1]); - bmaxx = max(bmaxx,xextrem[1]); - } - if(xextrem[2] >= i_st && xextrem[2] <= i_en) { - bminx = min(bminx,xextrem[3]); - bmaxx = max(bmaxx,xextrem[3]); - } - if(yextrem[0] >= i_st && yextrem[0] <= i_en) { - bminy = min(bminy,yextrem[1]); - bmaxy = max(bmaxy,yextrem[1]); - } - if(yextrem[2] >= i_st && yextrem[2] <= i_en) { - bminy = min(bminy,yextrem[3]); - bmaxy = max(bmaxy,yextrem[3]); - } - if(zextrem[0] >= i_st && zextrem[0] <= i_en) { - bminz = min(bminz,zextrem[1]); - bmaxz = max(bmaxz,zextrem[1]); - } - if(zextrem[2] >= i_st && zextrem[2] <= i_en) { - bminz = min(bminz,zextrem[3]); - bmaxz = max(bmaxz,zextrem[3]); - } - - float r1 = r_st + (r_en - r_st) * i_st; - float r2 = r_st + (r_en - r_st) * i_en; - r_curr = max(r1, r2); - - mw_extension = min(difl * fabsf(bmaxz), extmax); - float r_ext = mw_extension + 
r_curr; - float coverage = 1.0f; - - if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) { - /* the bounding box does not overlap the square centered at O */ - tree += level; - level = tree & -tree; - } - else if(level == 1) { - - /* the maximum recursion depth is reached. - * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. - * dP* is reversed if necessary.*/ - float t = isect->t; - float u = 0.0f; - float gd = 0.0f; - - if(flags & CURVE_KN_RIBBONS) { - float3 tg = (p_en - p_st); -#ifdef __KERNEL_SSE__ - const float3 tg_sq = tg * tg; - float w = tg_sq.x + tg_sq.y; -#else - float w = tg.x * tg.x + tg.y * tg.y; -#endif - if(w == 0) { - tree++; - level = tree & -tree; - continue; - } -#ifdef __KERNEL_SSE__ - const float3 p_sttg = p_st * tg; - w = -(p_sttg.x + p_sttg.y) / w; -#else - w = -(p_st.x * tg.x + p_st.y * tg.y) / w; -#endif - w = saturate(w); - - /* compute u on the curve segment */ - u = i_st * (1 - w) + i_en * w; - r_curr = r_st + (r_en - r_st) * u; - /* compare x-y distances */ - float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0]; - - float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; - if(dot(tg, dp_st)< 0) - dp_st *= -1; - if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) { - tree++; - level = tree & -tree; - continue; - } - float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; - if(dot(tg, dp_en) < 0) - dp_en *= -1; - if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) { - tree++; - level = tree & -tree; - continue; - } - - /* compute coverage */ - float r_ext = r_curr; - coverage = 1.0f; - if(difl != 0.0f) { - mw_extension = min(difl * fabsf(bmaxz), extmax); - r_ext = mw_extension + r_curr; -#ifdef __KERNEL_SSE__ - const float3 p_curr_sq = p_curr * p_curr; - const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128))); - float d = dxxx.x; -#else - float d = sqrtf(p_curr.x * 
p_curr.x + p_curr.y * p_curr.y); -#endif - float d0 = d - r_curr; - float d1 = d + r_curr; - float inv_mw_extension = 1.0f/mw_extension; - if(d0 >= 0) - coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f; - else // inside - coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f; - } - - if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) { - tree++; - level = tree & -tree; - continue; - } - - t = p_curr.z; - - /* stochastic fade from minimum width */ - if(difl != 0.0f && lcg_state) { - if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) - return hit; - } - } - else { - float l = len(p_en - p_st); - /* minimum width extension */ - float or1 = r1; - float or2 = r2; - - if(difl != 0.0f) { - mw_extension = min(len(p_st - P) * difl, extmax); - or1 = r1 < mw_extension ? mw_extension : r1; - mw_extension = min(len(p_en - P) * difl, extmax); - or2 = r2 < mw_extension ? 
mw_extension : r2; - } - /* --- */ - float invl = 1.0f/l; - float3 tg = (p_en - p_st) * invl; - gd = (or2 - or1) * invl; - float difz = -dot(p_st,tg); - float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd)); - float invcyla = 1.0f/cyla; - float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1))); - float tcentre = -halfb*invcyla; - float zcentre = difz + (tg.z * tcentre); - float3 tdif = - p_st; - tdif.z += tcentre; - float tdifz = dot(tdif,tg); - float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1))); - float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd; - float td = tb*tb - 4*cyla*tc; - if(td < 0.0f) { - tree++; - level = tree & -tree; - continue; - } - - float rootd = sqrtf(td); - float correction = (-tb - rootd) * 0.5f * invcyla; - t = tcentre + correction; - - float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; - if(dot(tg, dp_st)< 0) - dp_st *= -1; - float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; - if(dot(tg, dp_en) < 0) - dp_en *= -1; - - if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) { - correction = (-tb + rootd) * 0.5f * invcyla; - t = tcentre + correction; - } - - if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) { - tree++; - level = tree & -tree; - continue; - } - - float w = (zcentre + (tg.z * correction)) * invl; - w = saturate(w); - /* compute u on the curve segment */ - u = i_st * (1 - w) + i_en * w; - - /* stochastic fade from minimum width */ - if(difl != 0.0f && lcg_state) { - r_curr = r1 + (r2 - r1) * w; - r_ext = or1 + (or2 - or1) * w; - coverage = r_curr/r_ext; - - if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) - return hit; - } - } - /* we found a new intersection */ - -#ifdef __VISIBILITY_FLAG__ - /* visibility flag test. 
we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) -#endif - { - /* record intersection */ - isect->t = t; - isect->u = u; - isect->v = gd; - isect->prim = curveAddr; - isect->object = object; - isect->type = type; - hit = true; - } - - tree++; - level = tree & -tree; - } - else { - /* split the curve into two curves and process */ - level = level >> 1; - } - } - - return hit; -} - -ccl_device_curveintersect bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) -{ - /* define few macros to minimize code duplication for SSE */ -#ifndef __KERNEL_SSE2__ -# define len3_squared(x) len_squared(x) -# define len3(x) len(x) -# define dot3(x, y) dot(x, y) -#endif - - const bool is_curve_primitive = (type & PRIMITIVE_CURVE); - - if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { - const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); - if(time < prim_time.x || time > prim_time.y) { - return false; - } - } - - int segment = PRIMITIVE_UNPACK_SEGMENT(type); - /* curve Intersection check */ - int flags = kernel_data.curve.curveflags; - - int prim = kernel_tex_fetch(__prim_index, curveAddr); - float4 v00 = kernel_tex_fetch(__curves, prim); - - int cnum = __float_as_int(v00.x); - int k0 = cnum + segment; - int k1 = k0 + 1; - -#ifndef __KERNEL_SSE2__ - float4 P_curve[2]; - - if(is_curve_primitive) { - P_curve[0] = kernel_tex_fetch(__curve_keys, k0); - P_curve[1] = kernel_tex_fetch(__curve_keys, k1); - } - else { - int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; - motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve); - } - - float or1 = P_curve[0].w; - float or2 = P_curve[1].w; - float3 p1 = float4_to_float3(P_curve[0]); - float3 p2 = float4_to_float3(P_curve[1]); - - /* minimum width extension */ - float r1 = or1; - float r2 = or2; - float3 dif = P - p1; - float3 dif_second = P - p2; - if(difl != 0.0f) { - float pixelsize = min(len3(dif) * difl, extmax); - r1 = or1 < pixelsize ? pixelsize : or1; - pixelsize = min(len3(dif_second) * difl, extmax); - r2 = or2 < pixelsize ? pixelsize : or2; - } - /* --- */ - - float3 p21_diff = p2 - p1; - float3 sphere_dif1 = (dif + dif_second) * 0.5f; - float3 dir = direction; - float sphere_b_tmp = dot3(dir, sphere_dif1); - float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; -#else - ssef P_curve[2]; - - if(is_curve_primitive) { - P_curve[0] = load4f(&kg->__curve_keys.data[k0].x); - P_curve[1] = load4f(&kg->__curve_keys.data[k1].x); - } - else { - int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; - motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve); - } - - const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]); - - ssef r12 = or12; - const ssef vP = load4f(P); - const ssef dif = vP - P_curve[0]; - const ssef dif_second = vP - P_curve[1]; - if(difl != 0.0f) { - const ssef len1_sq = len3_squared_splat(dif); - const ssef len2_sq = len3_squared_splat(dif_second); - const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq)); - const ssef pixelsize12 = min(len12 * difl, ssef(extmax)); - r12 = max(or12, pixelsize12); - } - float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12)); - float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12)); - - const ssef p21_diff = P_curve[1] - P_curve[0]; - const ssef sphere_dif1 = (dif + dif_second) * 0.5f; - const ssef dir = load4f(direction); - const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1); - const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1); -#endif - - float mr = max(r1, r2); - float l = len3(p21_diff); - float invl = 1.0f / l; - float sp_r = mr + 0.5f * l; - - float sphere_b = dot3(dir, sphere_dif2); - float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r; - - if(sdisc < 0.0f) - return false; - - /* obtain parameters and test midpoint distance for suitable modes */ -#ifndef __KERNEL_SSE2__ - float3 tg = p21_diff * invl; -#else - const ssef tg = p21_diff * invl; -#endif - float gd = (r2 - r1) * invl; - - float dirz = dot3(dir, tg); - float difz = dot3(dif, tg); - - float a = 1.0f - (dirz*dirz*(1 + gd*gd)); - - float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1)); - - float tcentre = -halfb/a; - float zcentre = difz + (dirz * tcentre); - - if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE)) - return false; - if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION)) - return false; - - /* test minimum separation */ -#ifndef 
__KERNEL_SSE2__ - float3 cprod = cross(tg, dir); - float cprod2sq = len3_squared(cross(tg, dif)); -#else - const ssef cprod = cross(tg, dir); - float cprod2sq = len3_squared(cross_zxy(tg, dif)); -#endif - float cprodsq = len3_squared(cprod); - float distscaled = dot3(cprod, dif); - - if(cprodsq == 0) - distscaled = cprod2sq; - else - distscaled = (distscaled*distscaled)/cprodsq; - - if(distscaled > mr*mr) - return false; - - /* calculate true intersection */ -#ifndef __KERNEL_SSE2__ - float3 tdif = dif + tcentre * dir; -#else - const ssef tdif = madd(ssef(tcentre), dir, dif); -#endif - float tdifz = dot3(tdif, tg); - float tdifma = tdifz*gd + r1; - float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma)); - float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma; - float td = tb*tb - 4*a*tc; - - if(td < 0.0f) - return false; - - float rootd = 0.0f; - float correction = 0.0f; - if(flags & CURVE_KN_ACCURATE) { - rootd = sqrtf(td); - correction = ((-tb - rootd)/(2*a)); - } - - float t = tcentre + correction; - - if(t < isect->t) { - - if(flags & CURVE_KN_INTERSECTCORRECTION) { - rootd = sqrtf(td); - correction = ((-tb - rootd)/(2*a)); - t = tcentre + correction; - } - - float z = zcentre + (dirz * correction); - // bool backface = false; - - if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) { - // backface = true; - correction = ((-tb + rootd)/(2*a)); - t = tcentre + correction; - z = zcentre + (dirz * correction); - } - - /* stochastic fade from minimum width */ - float adjradius = or1 + z * (or2 - or1) * invl; - adjradius = adjradius / (r1 + z * gd); - if(lcg_state && adjradius != 1.0f) { - if(lcg_step_float(lcg_state) > adjradius) - return false; - } - /* --- */ - - if(t > 0.0f && t < isect->t && z >= 0 && z <= l) { - - if(flags & CURVE_KN_ENCLOSEFILTER) { - float enc_ratio = 1.01f; - if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) { - float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio)); - float c2 = dot3(dif, dif) 
- difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio; - if(a2*c2 < 0.0f) - return false; - } - } - -#ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) -#endif - { - /* record intersection */ - isect->t = t; - isect->u = z*invl; - isect->v = gd; - isect->prim = curveAddr; - isect->object = object; - isect->type = type; - - return true; - } - } - } - - return false; - -#ifndef __KERNEL_SSE2__ -# undef len3_squared -# undef len3 -# undef dot3 -#endif -} - -ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3) -{ - float fc = 0.71f; - float data[4]; - float t2 = t * t; - data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc; - data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t; - data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc; - data[3] = 3.0f * fc * t2 - 2.0f * fc * t; - return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; -} - -ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3) -{ - float data[4]; - float fc = 0.71f; - float t2 = t * t; - float t3 = t2 * t; - data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t; - data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f; - data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t; - data[3] = fc * t3 - fc * t2; - return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; -} - -ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray) -{ - int flag = kernel_data.curve.curveflags; - float t = isect->t; - float3 P = ray->P; - float3 D = ray->D; - - if(isect->object != OBJECT_NONE) { -#ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -#else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -#endif - - P = 
transform_point(&tfm, P); - D = transform_direction(&tfm, D*t); - D = normalize_len(D, &t); - } - - int prim = kernel_tex_fetch(__prim_index, isect->prim); - float4 v00 = kernel_tex_fetch(__curves, prim); - - int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); - int k1 = k0 + 1; - - float3 tg; - - if(flag & CURVE_KN_INTERPOLATE) { - int ka = max(k0 - 1,__float_as_int(v00.x)); - int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); - - float4 P_curve[4]; - - if(sd->type & PRIMITIVE_CURVE) { - P_curve[0] = kernel_tex_fetch(__curve_keys, ka); - P_curve[1] = kernel_tex_fetch(__curve_keys, k0); - P_curve[2] = kernel_tex_fetch(__curve_keys, k1); - P_curve[3] = kernel_tex_fetch(__curve_keys, kb); - } - else { - motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve); - } - - float3 p[4]; - p[0] = float4_to_float3(P_curve[0]); - p[1] = float4_to_float3(P_curve[1]); - p[2] = float4_to_float3(P_curve[2]); - p[3] = float4_to_float3(P_curve[3]); - - P = P + D*t; - -#ifdef __UV__ - sd->u = isect->u; - sd->v = 0.0f; -#endif - - tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); - - if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { - sd->Ng = normalize(-(D - tg * (dot(tg, D)))); - } - else { - /* direction from inside to surface of curve */ - float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); - sd->Ng = normalize(P - p_curr); - - /* adjustment for changing radius */ - float gd = isect->v; - - if(gd != 0.0f) { - sd->Ng = sd->Ng - gd * tg; - sd->Ng = normalize(sd->Ng); - } - } - - /* todo: sometimes the normal is still so that this is detected as - * backfacing even if cull backfaces is enabled */ - - sd->N = sd->Ng; - } - else { - float4 P_curve[2]; - - if(sd->type & PRIMITIVE_CURVE) { - P_curve[0]= kernel_tex_fetch(__curve_keys, k0); - P_curve[1]= kernel_tex_fetch(__curve_keys, k1); - } - else { - motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); - } - - float l = 
1.0f; - tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l); - - P = P + D*t; - - float3 dif = P - float4_to_float3(P_curve[0]); - -#ifdef __UV__ - sd->u = dot(dif,tg)/l; - sd->v = 0.0f; -#endif - - if(flag & CURVE_KN_TRUETANGENTGNORMAL) { - sd->Ng = -(D - tg * dot(tg, D)); - sd->Ng = normalize(sd->Ng); - } - else { - float gd = isect->v; - - /* direction from inside to surface of curve */ - sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd); - - /* adjustment for changing radius */ - if(gd != 0.0f) { - sd->Ng = sd->Ng - gd * tg; - sd->Ng = normalize(sd->Ng); - } - } - - sd->N = sd->Ng; - } - -#ifdef __DPDU__ - /* dPdu/dPdv */ - sd->dPdu = tg; - sd->dPdv = cross(tg, sd->Ng); -#endif - - if(isect->object != OBJECT_NONE) { -#ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -#else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -#endif - - P = transform_point(&tfm, P); - } - - return P; -} - -#endif +#endif /* __HAIR__ */ CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h new file mode 100644 index 00000000000..e9a149ea1ab --- /dev/null +++ b/intern/cycles/kernel/geom/geom_curve_intersect.h @@ -0,0 +1,934 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* Curve primitive intersection functions. 
*/ + +#ifdef __HAIR__ + +#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300) +# define ccl_device_curveintersect ccl_device +#else +# define ccl_device_curveintersect ccl_device_forceinline +#endif + +#ifdef __KERNEL_SSE2__ +ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a) +{ + return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2])); +} +#endif + +/* On CPU pass P and dir by reference to aligned vector. */ +ccl_device_curveintersect bool cardinal_curve_intersect( + KernelGlobals *kg, + Intersection *isect, + const float3 ccl_ref P, + const float3 ccl_ref dir, + uint visibility, + int object, + int curveAddr, + float time, + int type, + uint *lcg_state, + float difl, + float extmax) +{ + const bool is_curve_primitive = (type & PRIMITIVE_CURVE); + + if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { + const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); + if(time < prim_time.x || time > prim_time.y) { + return false; + } + } + + int segment = PRIMITIVE_UNPACK_SEGMENT(type); + float epsilon = 0.0f; + float r_st, r_en; + + int depth = kernel_data.curve.subdivisions; + int flags = kernel_data.curve.curveflags; + int prim = kernel_tex_fetch(__prim_index, curveAddr); + +#ifdef __KERNEL_SSE2__ + ssef vdir = load4f(dir); + ssef vcurve_coef[4]; + const float3 *curve_coef = (float3 *)vcurve_coef; + + { + ssef dtmp = vdir * vdir; + ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp)); + ssef rd_ss = load1f_first(1.0f) / d_ss; + + ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]); + int2 &v00 = (int2 &)v00vec; + + int k0 = v00.x + segment; + int k1 = k0 + 1; + int ka = max(k0 - 1, v00.x); + int kb = min(k1 + 1, v00.x + v00.y - 1); + +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800) + avxf P_curve_0_1, P_curve_2_3; + if(is_curve_primitive) { + P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x); + P_curve_2_3 = 
_mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x); + } + else { + int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object; + motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3); + } +#else /* __KERNEL_AVX2__ */ + ssef P_curve[4]; + + if(is_curve_primitive) { + P_curve[0] = load4f(&kg->__curve_keys.data[ka].x); + P_curve[1] = load4f(&kg->__curve_keys.data[k0].x); + P_curve[2] = load4f(&kg->__curve_keys.data[k1].x); + P_curve[3] = load4f(&kg->__curve_keys.data[kb].x); + } + else { + int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; + motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve); + } +#endif /* __KERNEL_AVX2__ */ + + ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss)); + ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn; + ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy; + ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); + ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); + + ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); + ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0); + ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); + +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800) + const avxf vPP = _mm256_broadcast_ps(&P.m128); + const avxf htfm00 = avxf(htfm0.m128, htfm0.m128); + const avxf htfm11 = avxf(htfm1.m128, htfm1.m128); + const avxf htfm22 = avxf(htfm2.m128, htfm2.m128); + + const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP), + htfm00, + madd(shuffle<1>(P_curve_0_1 - vPP), + htfm11, + shuffle<2>(P_curve_0_1 - vPP) * htfm22)); + const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP), + htfm00, + madd(shuffle<1>(P_curve_2_3 - vPP), + htfm11, + shuffle<2>(P_curve_2_3 - vPP)*htfm22)); + + const ssef p0 = _mm256_castps256_ps128(p01); + const ssef p1 = 
_mm256_extractf128_ps(p01, 1); + const ssef p2 = _mm256_castps256_ps128(p23); + const ssef p3 = _mm256_extractf128_ps(p23, 1); + + const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1); + r_st = ((float4 &)P_curve_1).w; + const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3); + r_en = ((float4 &)P_curve_2).w; +#else /* __KERNEL_AVX2__ */ + ssef htfm[] = { htfm0, htfm1, htfm2 }; + ssef vP = load4f(P); + ssef p0 = transform_point_T3(htfm, P_curve[0] - vP); + ssef p1 = transform_point_T3(htfm, P_curve[1] - vP); + ssef p2 = transform_point_T3(htfm, P_curve[2] - vP); + ssef p3 = transform_point_T3(htfm, P_curve[3] - vP); + + r_st = ((float4 &)P_curve[1]).w; + r_en = ((float4 &)P_curve[2]).w; +#endif /* __KERNEL_AVX2__ */ + + float fc = 0.71f; + ssef vfc = ssef(fc); + ssef vfcxp3 = vfc * p3; + + vcurve_coef[0] = p1; + vcurve_coef[1] = vfc * (p2 - p0); + vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3))); + vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3)); + + } +#else + float3 curve_coef[4]; + + /* curve Intersection check */ + /* obtain curve parameters */ + { + /* ray transform created - this should be created at beginning of intersection loop */ + Transform htfm; + float d = sqrtf(dir.x * dir.x + dir.z * dir.z); + htfm = make_transform( + dir.z / d, 0, -dir.x /d, 0, + -dir.x * dir.y /d, d, -dir.y * dir.z /d, 0, + dir.x, dir.y, dir.z, 0, + 0, 0, 0, 1); + + float4 v00 = kernel_tex_fetch(__curves, prim); + + int k0 = __float_as_int(v00.x) + segment; + int k1 = k0 + 1; + + int ka = max(k0 - 1,__float_as_int(v00.x)); + int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); + + float4 P_curve[4]; + + if(is_curve_primitive) { + P_curve[0] = kernel_tex_fetch(__curve_keys, ka); + P_curve[1] = kernel_tex_fetch(__curve_keys, k0); + P_curve[2] = kernel_tex_fetch(__curve_keys, k1); + P_curve[3] = kernel_tex_fetch(__curve_keys, kb); + } + else { + int fobject = (object == 
OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; + motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve); + } + + float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P); + float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P); + float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P); + float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P); + + float fc = 0.71f; + curve_coef[0] = p1; + curve_coef[1] = -fc*p0 + fc*p2; + curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3; + curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3; + r_st = P_curve[1].w; + r_en = P_curve[2].w; + } +#endif + + float r_curr = max(r_st, r_en); + + if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING)) + epsilon = 2 * r_curr; + + /* find bounds - this is slow for cubic curves */ + float upper, lower; + + float zextrem[4]; + curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z); + if(lower - r_curr > isect->t || upper + r_curr < epsilon) + return false; + + /* minimum width extension */ + float mw_extension = min(difl * fabsf(upper), extmax); + float r_ext = mw_extension + r_curr; + + float xextrem[4]; + curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x); + if(lower > r_ext || upper < -r_ext) + return false; + + float yextrem[4]; + curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y); + if(lower > r_ext || upper < -r_ext) + return false; + + /* setup recurrent loop */ + int level = 1 << depth; + int tree = 0; + float resol = 1.0f / (float)level; + bool hit = false; + + /* begin loop */ + while(!(tree >> (depth))) { + const float i_st = tree * resol; + 
const float i_en = i_st + (level * resol); + +#ifdef __KERNEL_SSE2__ + ssef vi_st = ssef(i_st), vi_en = ssef(i_en); + ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); + ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]); + + ssef vbmin = min(vp_st, vp_en); + ssef vbmax = max(vp_st, vp_en); + + float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax; + float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z; + float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z; + float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en; +#else + float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0]; + float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0]; + + float bminx = min(p_st.x, p_en.x); + float bmaxx = max(p_st.x, p_en.x); + float bminy = min(p_st.y, p_en.y); + float bmaxy = max(p_st.y, p_en.y); + float bminz = min(p_st.z, p_en.z); + float bmaxz = max(p_st.z, p_en.z); +#endif + + if(xextrem[0] >= i_st && xextrem[0] <= i_en) { + bminx = min(bminx,xextrem[1]); + bmaxx = max(bmaxx,xextrem[1]); + } + if(xextrem[2] >= i_st && xextrem[2] <= i_en) { + bminx = min(bminx,xextrem[3]); + bmaxx = max(bmaxx,xextrem[3]); + } + if(yextrem[0] >= i_st && yextrem[0] <= i_en) { + bminy = min(bminy,yextrem[1]); + bmaxy = max(bmaxy,yextrem[1]); + } + if(yextrem[2] >= i_st && yextrem[2] <= i_en) { + bminy = min(bminy,yextrem[3]); + bmaxy = max(bmaxy,yextrem[3]); + } + if(zextrem[0] >= i_st && zextrem[0] <= i_en) { + bminz = min(bminz,zextrem[1]); + bmaxz = max(bmaxz,zextrem[1]); + } + if(zextrem[2] >= i_st && zextrem[2] <= i_en) { + bminz = min(bminz,zextrem[3]); + bmaxz = max(bmaxz,zextrem[3]); + } + + float r1 = r_st + (r_en - r_st) * i_st; + float r2 = r_st + (r_en - r_st) * i_en; + r_curr = max(r1, r2); + + mw_extension = min(difl * fabsf(bmaxz), extmax); + float r_ext 
= mw_extension + r_curr; + float coverage = 1.0f; + + if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) { + /* the bounding box does not overlap the square centered at O */ + tree += level; + level = tree & -tree; + } + else if(level == 1) { + + /* the maximum recursion depth is reached. + * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. + * dP* is reversed if necessary.*/ + float t = isect->t; + float u = 0.0f; + float gd = 0.0f; + + if(flags & CURVE_KN_RIBBONS) { + float3 tg = (p_en - p_st); +#ifdef __KERNEL_SSE__ + const float3 tg_sq = tg * tg; + float w = tg_sq.x + tg_sq.y; +#else + float w = tg.x * tg.x + tg.y * tg.y; +#endif + if(w == 0) { + tree++; + level = tree & -tree; + continue; + } +#ifdef __KERNEL_SSE__ + const float3 p_sttg = p_st * tg; + w = -(p_sttg.x + p_sttg.y) / w; +#else + w = -(p_st.x * tg.x + p_st.y * tg.y) / w; +#endif + w = saturate(w); + + /* compute u on the curve segment */ + u = i_st * (1 - w) + i_en * w; + r_curr = r_st + (r_en - r_st) * u; + /* compare x-y distances */ + float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0]; + + float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; + if(dot(tg, dp_st)< 0) + dp_st *= -1; + if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) { + tree++; + level = tree & -tree; + continue; + } + float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; + if(dot(tg, dp_en) < 0) + dp_en *= -1; + if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) { + tree++; + level = tree & -tree; + continue; + } + + /* compute coverage */ + float r_ext = r_curr; + coverage = 1.0f; + if(difl != 0.0f) { + mw_extension = min(difl * fabsf(bmaxz), extmax); + r_ext = mw_extension + r_curr; +#ifdef __KERNEL_SSE__ + const float3 p_curr_sq = p_curr * p_curr; + const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128))); + float d = dxxx.x; +#else + float d = 
sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y); +#endif + float d0 = d - r_curr; + float d1 = d + r_curr; + float inv_mw_extension = 1.0f/mw_extension; + if(d0 >= 0) + coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f; + else // inside + coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f; + } + + if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) { + tree++; + level = tree & -tree; + continue; + } + + t = p_curr.z; + + /* stochastic fade from minimum width */ + if(difl != 0.0f && lcg_state) { + if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) + return hit; + } + } + else { + float l = len(p_en - p_st); + /* minimum width extension */ + float or1 = r1; + float or2 = r2; + + if(difl != 0.0f) { + mw_extension = min(len(p_st - P) * difl, extmax); + or1 = r1 < mw_extension ? mw_extension : r1; + mw_extension = min(len(p_en - P) * difl, extmax); + or2 = r2 < mw_extension ? 
mw_extension : r2; + } + /* --- */ + float invl = 1.0f/l; + float3 tg = (p_en - p_st) * invl; + gd = (or2 - or1) * invl; + float difz = -dot(p_st,tg); + float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd)); + float invcyla = 1.0f/cyla; + float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1))); + float tcentre = -halfb*invcyla; + float zcentre = difz + (tg.z * tcentre); + float3 tdif = - p_st; + tdif.z += tcentre; + float tdifz = dot(tdif,tg); + float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1))); + float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd; + float td = tb*tb - 4*cyla*tc; + if(td < 0.0f) { + tree++; + level = tree & -tree; + continue; + } + + float rootd = sqrtf(td); + float correction = (-tb - rootd) * 0.5f * invcyla; + t = tcentre + correction; + + float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; + if(dot(tg, dp_st)< 0) + dp_st *= -1; + float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; + if(dot(tg, dp_en) < 0) + dp_en *= -1; + + if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) { + correction = (-tb + rootd) * 0.5f * invcyla; + t = tcentre + correction; + } + + if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) { + tree++; + level = tree & -tree; + continue; + } + + float w = (zcentre + (tg.z * correction)) * invl; + w = saturate(w); + /* compute u on the curve segment */ + u = i_st * (1 - w) + i_en * w; + + /* stochastic fade from minimum width */ + if(difl != 0.0f && lcg_state) { + r_curr = r1 + (r2 - r1) * w; + r_ext = or1 + (or2 - or1) * w; + coverage = r_curr/r_ext; + + if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) + return hit; + } + } + /* we found a new intersection */ + +#ifdef __VISIBILITY_FLAG__ + /* visibility flag test. 
we do it here under the assumption + * that most triangles are culled by node flags */ + if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) +#endif + { + /* record intersection */ + isect->t = t; + isect->u = u; + isect->v = gd; + isect->prim = curveAddr; + isect->object = object; + isect->type = type; + hit = true; + } + + tree++; + level = tree & -tree; + } + else { + /* split the curve into two curves and process */ + level = level >> 1; + } + } + + return hit; +} + /* Intersect the ray (origin P, direction) with one straight curve segment: the segment between keys k0 and k1 of the curve referenced by curveAddr, with the w component of each key used as the end radius. When a hit with 0 < t < isect->t is found inside the segment and (if __VISIBILITY_FLAG__ is defined) the visibility test passes, isect is filled in (t, u = z / segment length, v = radius gradient gd, prim, object, type) and true is returned. In every other case the function returns false without writing to isect. difl and extmax drive the minimum width extension; lcg_state, when non-NULL, is used for the stochastic fade. */ +ccl_device_curveintersect bool curve_intersect(KernelGlobals *kg, + Intersection *isect, + float3 P, + float3 direction, + uint visibility, + int object, + int curveAddr, + float time, + int type, + uint *lcg_state, + float difl, + float extmax) +{ + /* define a few macros to minimize code duplication for SSE */ +#ifndef __KERNEL_SSE2__ +# define len3_squared(x) len_squared(x) +# define len3(x) len(x) +# define dot3(x, y) dot(x, y) +#endif + + const bool is_curve_primitive = (type & PRIMITIVE_CURVE); + + /* types without PRIMITIVE_CURVE get their keys from motion_curve_keys below; when BVH steps are enabled they are skipped if the ray time lies outside the time range stored for the primitive */ if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { + const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); + if(time < prim_time.x || time > prim_time.y) { + return false; + } + } + + int segment = PRIMITIVE_UNPACK_SEGMENT(type); + /* curve Intersection check */ + int flags = kernel_data.curve.curveflags; + + int prim = kernel_tex_fetch(__prim_index, curveAddr); + float4 v00 = kernel_tex_fetch(__curves, prim); + + /* v00.x, reinterpreted as an int, is used as the index of the first key of this curve; k0 and k1 are the two keys bounding the segment */ int cnum = __float_as_int(v00.x); + int k0 = cnum + segment; + int k1 = k0 + 1; + +#ifndef __KERNEL_SSE2__ + float4 P_curve[2]; + + if(is_curve_primitive) { + P_curve[0] = kernel_tex_fetch(__curve_keys, k0); + P_curve[1] = kernel_tex_fetch(__curve_keys, k1); + } + else { + int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; + motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve); + } + + float or1 = P_curve[0].w; + float or2 = P_curve[1].w; + float3 p1 = float4_to_float3(P_curve[0]); + float3 p2 = float4_to_float3(P_curve[1]); + + /* minimum width extension: when difl != 0 each end radius is raised to at least min(distance from P to that key * difl, extmax); or1 and or2 keep the radii read from the keys */ + float r1 = or1; + float r2 = or2; + float3 dif = P - p1; + float3 dif_second = P - p2; + if(difl != 0.0f) { + float pixelsize = min(len3(dif) * difl, extmax); + r1 = or1 < pixelsize ? pixelsize : or1; + pixelsize = min(len3(dif_second) * difl, extmax); + r2 = or2 < pixelsize ? pixelsize : or2; + } + /* --- */ + + /* sphere_dif1 is P minus the segment midpoint; sphere_dif2 is sphere_dif1 minus dot3(dir, sphere_dif1) * dir */ float3 p21_diff = p2 - p1; + float3 sphere_dif1 = (dif + dif_second) * 0.5f; + float3 dir = direction; + float sphere_b_tmp = dot3(dir, sphere_dif1); + float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; +#else + ssef P_curve[2]; + + if(is_curve_primitive) { + P_curve[0] = load4f(&kg->__curve_keys.data[k0].x); + P_curve[1] = load4f(&kg->__curve_keys.data[k1].x); + } + else { + int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; + motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve); + } + + /* SSE path: or12 holds both key radii; lane 0 is read back as or1 and lane 2 as or2 below, and r12 is the same pair after the minimum width extension */ const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]); + + ssef r12 = or12; + const ssef vP = load4f(P); + const ssef dif = vP - P_curve[0]; + const ssef dif_second = vP - P_curve[1]; + if(difl != 0.0f) { + const ssef len1_sq = len3_squared_splat(dif); + const ssef len2_sq = len3_squared_splat(dif_second); + const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq)); + const ssef pixelsize12 = min(len12 * difl, ssef(extmax)); + r12 = max(or12, pixelsize12); + } + float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12)); + float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12)); + + const ssef p21_diff = P_curve[1] - P_curve[0]; + const ssef sphere_dif1 = (dif + dif_second) * 0.5f; + const ssef dir = load4f(direction); + const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1); + const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1); +#endif + + /* early out: sdisc is the discriminant of a ray test against a sphere of radius sp_r (largest end radius plus half the segment length), built from the midpoint offset computed above */ float mr = max(r1, r2); + float l = len3(p21_diff); + float invl = 1.0f / l; + float sp_r = mr + 0.5f * l; + + float sphere_b = dot3(dir, sphere_dif2); + float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r; + + if(sdisc < 0.0f) + return false; + + /* obtain parameters and test midpoint distance for suitable modes */ +#ifndef __KERNEL_SSE2__ + float3 tg = p21_diff * invl; +#else + const ssef tg = p21_diff * invl; +#endif + /* tg is the unit vector from key k0 to key k1; gd is the change of the extended radius per unit length along tg */ float gd = (r2 - r1) * invl; + + float dirz = dot3(dir, tg); + float difz = dot3(dif, tg); + + float a = 1.0f - (dirz*dirz*(1 + gd*gd)); + + float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1)); + + /* NOTE(review): a is not checked against zero before this division or the later divisions by 2*a - confirm this cannot happen or is tolerated */ float tcentre = -halfb/a; + float zcentre = difz + (dirz * tcentre); + + if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE)) + return false; + if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION)) + return false; + + /* test minimum separation */ +#ifndef 
__KERNEL_SSE2__ + float3 cprod = cross(tg, dir); + float cprod2sq = len3_squared(cross(tg, dif)); +#else + const ssef cprod = cross(tg, dir); + float cprod2sq = len3_squared(cross_zxy(tg, dif)); +#endif + /* (cprod . dif)^2 / |cprod|^2 is the squared distance between the ray line and the segment axis line; when tg and dir are parallel (cprodsq == 0) cprod2sq is used instead. The result is compared against the squared largest end radius. */ float cprodsq = len3_squared(cprod); + float distscaled = dot3(cprod, dif); + + if(cprodsq == 0) + distscaled = cprod2sq; + else + distscaled = (distscaled*distscaled)/cprodsq; + + if(distscaled > mr*mr) + return false; + + /* calculate true intersection */ +#ifndef __KERNEL_SSE2__ + float3 tdif = dif + tcentre * dir; +#else + const ssef tdif = madd(ssef(tcentre), dir, dif); +#endif + float tdifz = dot3(tdif, tg); + float tdifma = tdifz*gd + r1; + float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma)); + float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma; + float td = tb*tb - 4*a*tc; + + if(td < 0.0f) + return false; + + /* td is the discriminant of the quadratic for the offset from tcentre. Without CURVE_KN_ACCURATE the correction stays 0 here, so the first distance test below uses tcentre. */ float rootd = 0.0f; + float correction = 0.0f; + if(flags & CURVE_KN_ACCURATE) { + rootd = sqrtf(td); + correction = ((-tb - rootd)/(2*a)); + } + + float t = tcentre + correction; + + if(t < isect->t) { + + if(flags & CURVE_KN_INTERSECTCORRECTION) { + rootd = sqrtf(td); + correction = ((-tb - rootd)/(2*a)); + t = tcentre + correction; + } + + float z = zcentre + (dirz * correction); + // bool backface = false; + + if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) { + // backface = true; + correction = ((-tb + rootd)/(2*a)); + t = tcentre + correction; + z = zcentre + (dirz * correction); + } + + /* stochastic fade from minimum width */ + float adjradius = or1 + z * (or2 - or1) * invl; + adjradius = adjradius / (r1 + z * gd); + if(lcg_state && adjradius != 1.0f) { + if(lcg_step_float(lcg_state) > adjradius) + return false; + } + /* --- */ + + if(t > 0.0f && t < isect->t && z >= 0 && z <= l) { + + if(flags & CURVE_KN_ENCLOSEFILTER) { + float enc_ratio = 1.01f; + if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) { + float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio)); + float c2 = dot3(dif, dif) 
- difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio; + if(a2*c2 < 0.0f) + return false; + } + } + +#ifdef __VISIBILITY_FLAG__ + /* visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags */ + if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) +#endif + { + /* record intersection */ + isect->t = t; + isect->u = z*invl; + isect->v = gd; + isect->prim = curveAddr; + isect->object = object; + isect->type = type; + + return true; + } + } + } + + return false; + + /* remove the len3_squared, len3 and dot3 helper macros again in the build without __KERNEL_SSE2__ */ +#ifndef __KERNEL_SSE2__ +# undef len3_squared +# undef len3 +# undef dot3 +#endif +} + /* Derivative with respect to t of the cubic evaluated by curvepoint, for the same control points p0..p3 and the same constant fc = 0.71: each weight here is the t-derivative of the matching curvepoint weight. The returned vector is not normalized. */ +ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3) +{ + float fc = 0.71f; + float data[4]; + float t2 = t * t; + data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc; + data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t; + data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc; + data[3] = 3.0f * fc * t2 - 2.0f * fc * t; + return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; +} + /* Evaluate at parameter t the cubic blend of the control points p0..p3, with weights that are cubic polynomials in t built from the constant fc = 0.71. The weights reduce to p1 at t = 0 and to p2 at t = 1, so p0 and p3 only shape the curve between p1 and p2. */ +ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3) +{ + float data[4]; + float fc = 0.71f; + float t2 = t * t; + float t3 = t2 * t; + data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t; + data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f; + data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t; + data[3] = fc * t3 - fc * t2; + return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; +} + /* Compute the hit position for the curve intersection stored in isect and fill in the normals sd->Ng and sd->N (plus sd->u and sd->v under __UV__, and sd->dPdu and sd->dPdv under __DPDU__). When isect->object != OBJECT_NONE the ray is first moved with the inverse object transform and the resulting position is moved back with the object transform before it is returned. Returns the hit position P. */ +ccl_device_inline float3 curve_refine(KernelGlobals *kg, + ShaderData *sd, + const Intersection *isect, + const Ray *ray) +{ + int flag = kernel_data.curve.curveflags; + float t = isect->t; + float3 P = ray->P; + float3 D = ray->D; + + if(isect->object != OBJECT_NONE) { +#ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_itfm; +#else + Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); +#endif + + P = 
transform_point(&tfm, P); + /* D is scaled by t before being transformed; normalize_len presumably returns the unit vector and writes the transformed length back into t - confirm */ D = transform_direction(&tfm, D*t); + D = normalize_len(D, &t); + } + + int prim = kernel_tex_fetch(__prim_index, isect->prim); + float4 v00 = kernel_tex_fetch(__curves, prim); + + /* k0 and k1 are the keys bounding the hit segment; v00.x, reinterpreted as an int, is used as the index of the first key of the curve */ int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); + int k1 = k0 + 1; + + float3 tg; + + if(flag & CURVE_KN_INTERPOLATE) { + /* ka and kb are the neighbouring keys, clamped to the key range of this curve (first key v00.x, presumably v00.y keys in total - confirm) */ int ka = max(k0 - 1,__float_as_int(v00.x)); + int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); + + float4 P_curve[4]; + + if(sd->type & PRIMITIVE_CURVE) { + P_curve[0] = kernel_tex_fetch(__curve_keys, ka); + P_curve[1] = kernel_tex_fetch(__curve_keys, k0); + P_curve[2] = kernel_tex_fetch(__curve_keys, k1); + P_curve[3] = kernel_tex_fetch(__curve_keys, kb); + } + else { + motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve); + } + + float3 p[4]; + p[0] = float4_to_float3(P_curve[0]); + p[1] = float4_to_float3(P_curve[1]); + p[2] = float4_to_float3(P_curve[2]); + p[3] = float4_to_float3(P_curve[3]); + + P = P + D*t; + +#ifdef __UV__ + sd->u = isect->u; + sd->v = 0.0f; +#endif + + tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); + + if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { + /* ribbons: the normal is the negated part of D that is perpendicular to the unit tangent tg */ sd->Ng = normalize(-(D - tg * (dot(tg, D)))); + } + else { + /* direction from inside to surface of curve */ + float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); + sd->Ng = normalize(P - p_curr); + + /* adjustment for changing radius */ + float gd = isect->v; + + if(gd != 0.0f) { + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); + } + } + + /* todo: sometimes the normal is still so that this is detected as + * backfacing even if cull backfaces is enabled */ + + sd->N = sd->Ng; + } + else { + float4 P_curve[2]; + + if(sd->type & PRIMITIVE_CURVE) { + P_curve[0]= kernel_tex_fetch(__curve_keys, k0); + P_curve[1]= kernel_tex_fetch(__curve_keys, k1); + } + else { + motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); + } + + float l = 
1.0f; + /* tg: direction from key k0 to key k1; normalize_len presumably returns it as a unit vector and stores the segment length in l - confirm */ tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l); + + P = P + D*t; + + float3 dif = P - float4_to_float3(P_curve[0]); + +#ifdef __UV__ + sd->u = dot(dif,tg)/l; + sd->v = 0.0f; +#endif + + if(flag & CURVE_KN_TRUETANGENTGNORMAL) { + sd->Ng = -(D - tg * dot(tg, D)); + sd->Ng = normalize(sd->Ng); + } + else { + float gd = isect->v; + + /* direction from inside to surface of curve */ + /* NOTE(review): sd->u is assigned above only when __UV__ is defined but is read here unconditionally - confirm __UV__ is always enabled when this path is compiled */ sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd); + + /* adjustment for changing radius */ + if(gd != 0.0f) { + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); + } + } + + sd->N = sd->Ng; + } + +#ifdef __DPDU__ + /* dPdu/dPdv */ + sd->dPdu = tg; + sd->dPdv = cross(tg, sd->Ng); +#endif + + /* undo the inverse object transform applied to the ray at the top: the position is moved with the forward object transform before it is returned */ if(isect->object != OBJECT_NONE) { +#ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_tfm; +#else + Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); +#endif + + P = transform_point(&tfm, P); + } + + return P; +} + +#endif + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index 6ecdfe0173a..1ffc143be34 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -415,12 +415,7 @@ ccl_device_inline float3 bvh_clamp_direction(float3 dir) ccl_device_inline float3 bvh_inverse_direction(float3 dir) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. 
*/ -#if defined(__KERNEL_SSE__) && 0 return rcp(dir); -#else - return 1.0f / dir; -#endif } /* Transform ray into object space to enter static object in BVH */ diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index 1e0ef5201c9..698cd6b03fd 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -35,10 +35,10 @@ ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z) float4 r; switch(id) { case 0: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_000, x, y, z); break; - case 1: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_001, x, y, z); break; - case 2: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_002, x, y, z); break; - case 3: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_003, x, y, z); break; - case 4: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_004, x, y, z); break; + case 8: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_008, x, y, z); break; + case 16: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_016, x, y, z); break; + case 24: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_024, x, y, z); break; + case 32: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_032, x, y, z); break; } return r; } diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 06728415c15..82d3c153bf5 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -21,6 +21,9 @@ CCL_NAMESPACE_BEGIN * BSDF evaluation result, split per BSDF type. This is used to accumulate * render passes separately. 
*/ +ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, + const ShaderData *sd); + ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 value, int use_light_pass) { #ifdef __PASSES__ @@ -205,6 +208,7 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f); L->path_scatter = make_float3(0.0f, 0.0f, 0.0f); + L->transparent = 0.0f; L->emission = make_float3(0.0f, 0.0f, 0.0f); L->background = make_float3(0.0f, 0.0f, 0.0f); L->ao = make_float3(0.0f, 0.0f, 0.0f); @@ -214,20 +218,31 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) else #endif { + L->transparent = 0.0f; L->emission = make_float3(0.0f, 0.0f, 0.0f); } #ifdef __SHADOW_TRICKS__ L->path_total = make_float3(0.0f, 0.0f, 0.0f); L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f); - L->shadow_color = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_background_color = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_radiance_sum = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_throughput = 0.0f; + L->shadow_transparency = 1.0f; #endif #ifdef __DENOISING_FEATURES__ L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f); L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f); L->denoising_depth = 0.0f; -#endif /* __DENOISING_FEATURES__ */ +#endif + +#ifdef __KERNEL_DEBUG__ + L->debug_data.num_bvh_traversed_nodes = 0; + L->debug_data.num_bvh_traversed_instances = 0; + L->debug_data.num_bvh_intersections = 0; + L->debug_data.num_ray_bounces = 0; +#endif } ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput, @@ -396,10 +411,11 @@ ccl_device_inline void path_radiance_accum_total_light( #endif } -ccl_device_inline void path_radiance_accum_background(PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - float3 value) +ccl_device_inline void path_radiance_accum_background( + PathRadiance *L, + ccl_addr_space PathState *state, + float3 
throughput, + float3 value) { #ifdef __PASSES__ if(L->use_light_pass) { @@ -419,9 +435,7 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, #ifdef __SHADOW_TRICKS__ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { L->path_total += throughput * value; - if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) { - L->path_total_shaded += throughput * value; - } + L->path_total_shaded += throughput * value * L->shadow_transparency; } #endif @@ -621,25 +635,43 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance { float fac = 1.0f/num_samples; +#ifdef __SPLIT_KERNEL__ +# define safe_float3_add(f, v) \ + do { \ + ccl_global float *p = (ccl_global float*)(&(f)); \ + atomic_add_and_fetch_float(p+0, (v).x); \ + atomic_add_and_fetch_float(p+1, (v).y); \ + atomic_add_and_fetch_float(p+2, (v).z); \ + } while(0) +#else +# define safe_float3_add(f, v) (f) += (v) +#endif /* __SPLIT_KERNEL__ */ + #ifdef __PASSES__ - L->direct_diffuse += L_sample->direct_diffuse*fac; - L->direct_glossy += L_sample->direct_glossy*fac; - L->direct_transmission += L_sample->direct_transmission*fac; - L->direct_subsurface += L_sample->direct_subsurface*fac; - L->direct_scatter += L_sample->direct_scatter*fac; - - L->indirect_diffuse += L_sample->indirect_diffuse*fac; - L->indirect_glossy += L_sample->indirect_glossy*fac; - L->indirect_transmission += L_sample->indirect_transmission*fac; - L->indirect_subsurface += L_sample->indirect_subsurface*fac; - L->indirect_scatter += L_sample->indirect_scatter*fac; - - L->background += L_sample->background*fac; - L->ao += L_sample->ao*fac; - L->shadow += L_sample->shadow*fac; + safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse*fac); + safe_float3_add(L->direct_glossy, L_sample->direct_glossy*fac); + safe_float3_add(L->direct_transmission, L_sample->direct_transmission*fac); + safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface*fac); + safe_float3_add(L->direct_scatter, 
L_sample->direct_scatter*fac); + + safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse*fac); + safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy*fac); + safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission*fac); + safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface*fac); + safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter*fac); + + safe_float3_add(L->background, L_sample->background*fac); + safe_float3_add(L->ao, L_sample->ao*fac); + safe_float3_add(L->shadow, L_sample->shadow*fac); +# ifdef __SPLIT_KERNEL__ + atomic_add_and_fetch_float(&L->mist, L_sample->mist*fac); +# else L->mist += L_sample->mist*fac; -#endif - L->emission += L_sample->emission * fac; +# endif /* __SPLIT_KERNEL__ */ +#endif /* __PASSES__ */ + safe_float3_add(L->emission, L_sample->emission*fac); + +#undef safe_float3_add } #ifdef __SHADOW_TRICKS__ @@ -651,7 +683,7 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L) if(path_total != 0.0f) { return path_total_shaded / path_total; } - return 1.0f; + return L->shadow_transparency; } /* Calculate final light sum and transparency for shadow catcher object. 
*/ @@ -662,11 +694,12 @@ ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg, const float shadow = path_radiance_sum_shadow(L); float3 L_sum; if(kernel_data.background.transparent) { - *alpha = 1.0f-shadow; - L_sum = make_float3(0.0f, 0.0f, 0.0f); + *alpha = 1.0f - L->shadow_throughput * shadow; + L_sum = L->shadow_radiance_sum; } else { - L_sum = L->shadow_color * shadow; + L_sum = L->shadow_background_color * L->shadow_throughput * shadow + + L->shadow_radiance_sum; } return L_sum; } diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index f18d145f7cf..8af1187213d 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, - RNG rng, + uint rng_hash, int pass_filter, int sample) { @@ -48,11 +48,11 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, path_radiance_init(&L_sample, kernel_data.film.use_light_pass); /* init path state */ - path_state_init(kg, &emission_sd, &state, &rng, sample, NULL); + path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL); /* evaluate surface shader */ - float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF); - shader_eval_surface(kg, sd, &rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN); + float rbsdf = path_state_rng_1D(kg, &state, PRNG_BSDF); + shader_eval_surface(kg, sd, &state, rbsdf, state.flag); /* TODO, disable more closures we don't need besides transparent */ shader_bsdf_disable_transparency(kg, sd); @@ -64,7 +64,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* sample ambient occlusion */ if(pass_filter & BAKE_FILTER_AO) { - kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput, shader_bsdf_alpha(kg, sd)); + kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput, shader_bsdf_alpha(kg, sd)); } /* sample emission */ @@ -86,7 
+86,6 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, &emission_sd, &L_sample, &state, - &rng, &ray, &throughput, &ss_indirect)) @@ -101,7 +100,6 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, kernel_path_indirect(kg, &indirect_sd, &emission_sd, - &rng, &ray, throughput, state.num_samples, @@ -116,14 +114,14 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* sample light and BSDF */ if(!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) { - kernel_path_surface_connect_light(kg, &rng, sd, &emission_sd, throughput, &state, &L_sample); + kernel_path_surface_connect_light(kg, sd, &emission_sd, throughput, &state, &L_sample); - if(kernel_path_surface_bounce(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) { + if(kernel_path_surface_bounce(kg, sd, &throughput, &state, &L_sample, &ray)) { #ifdef __LAMP_MIS__ state.ray_t = 0.0f; #endif /* compute indirect light */ - kernel_path_indirect(kg, &indirect_sd, &emission_sd, &rng, &ray, throughput, 1, &state, &L_sample); + kernel_path_indirect(kg, &indirect_sd, &emission_sd, &ray, throughput, 1, &state, &L_sample); /* sum and reset indirect light pass variables for the next samples */ path_radiance_sum_indirect(&L_sample); @@ -137,7 +135,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* sample ambient occlusion */ if(pass_filter & BAKE_FILTER_AO) { - kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput); + kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput); } /* sample emission */ @@ -151,7 +149,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, if((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) { /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */ kernel_branched_path_subsurface_scatter(kg, sd, &indirect_sd, - &emission_sd, &L_sample, &state, &rng, &ray, throughput); + &emission_sd, &L_sample, 
&state, &ray, throughput); } #endif @@ -161,13 +159,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* direct light */ if(kernel_data.integrator.use_direct_light) { int all = kernel_data.integrator.sample_all_lights_direct; - kernel_branched_path_surface_connect_light(kg, &rng, + kernel_branched_path_surface_connect_light(kg, sd, &emission_sd, &state, throughput, 1.0f, &L_sample, all); } #endif /* indirect light */ - kernel_branched_path_surface_indirect_light(kg, &rng, + kernel_branched_path_surface_indirect_light(kg, sd, &indirect_sd, &emission_sd, throughput, 1.0f, &state, &L_sample); } } @@ -225,7 +223,6 @@ ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg, ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg, ShaderData *sd, - RNG *rng, PathState *state, float3 direct, float3 indirect, @@ -245,12 +242,12 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg, } else { /* surface color of the pass only */ - shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, sd, state, 0.0f, 0); return kernel_bake_shader_bsdf(kg, sd, type); } } else { - shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, sd, state, 0.0f, 0); color = kernel_bake_shader_bsdf(kg, sd, type); } @@ -292,14 +289,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, int num_samples = kernel_data.integrator.aa_samples; /* random number generator */ - RNG rng = cmj_hash(offset + i, kernel_data.integrator.seed); + uint rng_hash = cmj_hash(offset + i, kernel_data.integrator.seed); float filter_x, filter_y; if(sample == 0) { filter_x = filter_y = 0.5f; } else { - path_rng_2D(kg, &rng, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y); + path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y); } /* subpixel u/v offset */ @@ -335,18 +332,18 @@ ccl_device void 
kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, /* light passes if we need more than color */ if(pass_filter & ~BAKE_FILTER_COLOR) - compute_light_pass(kg, &sd, &L, rng, pass_filter, sample); + compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample); switch(type) { /* data passes */ case SHADER_EVAL_NORMAL: { if((sd.flag & SD_HAS_BUMP)) { - shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, &sd, &state, 0.f, 0); } - /* compression: normal = (2 * color) - 1 */ - out = sd.N * 0.5f + make_float3(0.5f, 0.5f, 0.5f); + /* encoding: normal = (2 * color) - 1 */ + out = shader_bsdf_average_normal(kg, &sd) * 0.5f + make_float3(0.5f, 0.5f, 0.5f); break; } case SHADER_EVAL_UV: @@ -356,7 +353,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, } case SHADER_EVAL_EMISSION: { - shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_EMISSION); + shader_eval_surface(kg, &sd, &state, 0.f, 0); out = shader_emissive_eval(kg, &sd); break; } @@ -409,7 +406,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, { out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_diffuse, L.indirect_diffuse, @@ -421,7 +417,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, { out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_glossy, L.indirect_glossy, @@ -433,7 +428,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, { out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_transmission, L.indirect_transmission, @@ -446,7 +440,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, #ifdef __SUBSURFACE__ out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_subsurface, L.indirect_subsurface, @@ -480,7 +473,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, 
ccl_global uint4 *input, /* evaluate */ int flag = 0; /* we can't know which type of BSDF this is for */ - out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN); + out = shader_eval_background(kg, &sd, &state, flag); break; } default: @@ -524,7 +517,7 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, /* evaluate */ float3 P = sd.P; - shader_eval_displacement(kg, &sd, &state, SHADER_CONTEXT_MAIN); + shader_eval_displacement(kg, &sd, &state); out = sd.P - P; object_inverse_dir_transform(kg, &sd, &out); @@ -552,7 +545,7 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, /* evaluate */ int flag = 0; /* we can't know which type of BSDF this is for */ - out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN); + out = shader_eval_background(kg, &sd, &state, flag); } /* write output */ diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 38708f7ff0b..1e2af9de8b3 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -38,11 +38,15 @@ /* Qualifier wrappers for different names on different devices */ #define ccl_device __device__ __inline__ +#if __CUDA_ARCH__ < 300 +# define ccl_device_inline __device__ __inline__ # define ccl_device_forceinline __device__ __forceinline__ -#if __CUDA_ARCH__ < 500 +#elif __CUDA_ARCH__ < 500 # define ccl_device_inline __device__ __forceinline__ +# define ccl_device_forceinline __device__ __forceinline__ #else # define ccl_device_inline __device__ __inline__ +# define ccl_device_forceinline __device__ __forceinline__ #endif #define ccl_device_noinline __device__ __noinline__ #define ccl_global @@ -53,6 +57,10 @@ #define ccl_may_alias #define ccl_addr_space #define ccl_restrict __restrict__ +/* TODO(sergey): In theory we might use references with CUDA, however + * performance impact yet to be investigated. 
+ */ +#define ccl_ref #define ccl_align(n) __align__(n) #define ATTR_FALLTHROUGH diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index 4836c290312..36d6031d042 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -42,6 +42,7 @@ #define ccl_local_param __local #define ccl_private __private #define ccl_restrict restrict +#define ccl_ref #define ccl_align(n) __attribute__((aligned(n))) #ifdef __SPLIT_KERNEL__ @@ -129,6 +130,7 @@ # define expf(x) native_exp(((float)(x))) # define sqrtf(x) native_sqrt(((float)(x))) # define logf(x) native_log(((float)(x))) +# define rcp(x) native_recip(x) #else # define sinf(x) sin(((float)(x))) # define cosf(x) cos(((float)(x))) @@ -136,11 +138,12 @@ # define expf(x) exp(((float)(x))) # define sqrtf(x) sqrt(((float)(x))) # define logf(x) log(((float)(x))) +# define rcp(x) recip(x)) #endif /* data lookup defines */ #define kernel_data (*kg->data) -#define kernel_tex_fetch(t, index) kg->t[index] +#define kernel_tex_fetch(tex, index) ((ccl_global tex##_t*)(kg->buffers[kg->tex.buffer] + kg->tex.offset))[(index)] /* define NULL */ #define NULL 0 diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h deleted file mode 100644 index 5647bbae5b5..00000000000 --- a/intern/cycles/kernel/kernel_debug.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2011-2014 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device_inline void debug_data_init(DebugData *debug_data) -{ - debug_data->num_bvh_traversed_nodes = 0; - debug_data->num_bvh_traversed_instances = 0; - debug_data->num_bvh_intersections = 0; - debug_data->num_ray_bounces = 0; -} - -ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg, - ccl_global float *buffer, - ccl_addr_space PathState *state, - DebugData *debug_data, - int sample) -{ - int flag = kernel_data.film.pass_flag; - if(flag & PASS_BVH_TRAVERSED_NODES) { - kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes, - sample, - debug_data->num_bvh_traversed_nodes); - } - if(flag & PASS_BVH_TRAVERSED_INSTANCES) { - kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances, - sample, - debug_data->num_bvh_traversed_instances); - } - if(flag & PASS_BVH_INTERSECTIONS) { - kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections, - sample, - debug_data->num_bvh_intersections); - } - if(flag & PASS_RAY_BOUNCES) { - kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces, - sample, - debug_data->num_ray_bounces); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index 9e7d51f23f5..48a8e53be33 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -46,7 +46,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, shader_setup_from_background(kg, emission_sd, &ray); path_state_modify_bounce(state, true); - eval = shader_eval_background(kg, emission_sd, state, 0, SHADER_CONTEXT_EMISSION); + eval = shader_eval_background(kg, emission_sd, state, 0); path_state_modify_bounce(state, false); } else @@ -72,7 +72,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, /* no path flag, we're evaluating this 
for all closures. that's weak but * we'd have to do multiple evaluations otherwise */ path_state_modify_bounce(state, true); - shader_eval_surface(kg, emission_sd, NULL, state, 0.0f, 0, SHADER_CONTEXT_EMISSION); + shader_eval_surface(kg, emission_sd, state, 0.0f, 0); path_state_modify_bounce(state, false); /* evaluate emissive closure */ @@ -216,7 +216,7 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader { /* multiple importance sampling, get triangle light pdf, * and compute weight with respect to BSDF pdf */ - float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t); + float pdf = triangle_light_pdf(kg, sd, t); float mis_weight = power_heuristic(bsdf_pdf, pdf); return L*mis_weight; @@ -319,7 +319,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, # endif path_state_modify_bounce(state, true); - float3 L = shader_eval_background(kg, emission_sd, state, state->flag, SHADER_CONTEXT_EMISSION); + float3 L = shader_eval_background(kg, emission_sd, state, state->flag); path_state_modify_bounce(state, false); #ifdef __BACKGROUND_MIS__ diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index f95f0d98c52..9d55183d94b 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -23,6 +23,10 @@ # include "util/util_vector.h" #endif +#ifdef __KERNEL_OPENCL__ +# include "util/util_atomic.h" +#endif + CCL_NAMESPACE_BEGIN /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in @@ -109,11 +113,22 @@ typedef struct KernelGlobals { #ifdef __KERNEL_OPENCL__ +# define KERNEL_TEX(type, ttype, name) \ +typedef type name##_t; +# include "kernel/kernel_textures.h" + +typedef struct tex_info_t { + uint buffer, padding; + uint64_t offset; + uint width, height, depth, options; +} tex_info_t; + typedef ccl_addr_space struct KernelGlobals { ccl_constant KernelData *data; + ccl_global char *buffers[8]; # define KERNEL_TEX(type, ttype, name) \ - 
ccl_global type *name; + tex_info_t name; # include "kernel/kernel_textures.h" # ifdef __SPLIT_KERNEL__ @@ -122,6 +137,57 @@ typedef ccl_addr_space struct KernelGlobals { # endif } KernelGlobals; +#define KERNEL_BUFFER_PARAMS \ + ccl_global char *buffer0, \ + ccl_global char *buffer1, \ + ccl_global char *buffer2, \ + ccl_global char *buffer3, \ + ccl_global char *buffer4, \ + ccl_global char *buffer5, \ + ccl_global char *buffer6, \ + ccl_global char *buffer7 + +#define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7 + +ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS) +{ +#ifdef __SPLIT_KERNEL__ + if(ccl_local_id(0) + ccl_local_id(1) == 0) +#endif + { + kg->buffers[0] = buffer0; + kg->buffers[1] = buffer1; + kg->buffers[2] = buffer2; + kg->buffers[3] = buffer3; + kg->buffers[4] = buffer4; + kg->buffers[5] = buffer5; + kg->buffers[6] = buffer6; + kg->buffers[7] = buffer7; + } + +# ifdef __SPLIT_KERNEL__ + ccl_barrier(CCL_LOCAL_MEM_FENCE); +# endif +} + +ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg) +{ +# ifdef __SPLIT_KERNEL__ + if(ccl_local_id(0) + ccl_local_id(1) == 0) +# endif + { + ccl_global tex_info_t *info = (ccl_global tex_info_t*)kg->buffers[0]; + +# define KERNEL_TEX(type, ttype, name) \ + kg->name = *(info++); +# include "kernel/kernel_textures.h" + } + +# ifdef __SPLIT_KERNEL__ + ccl_barrier(CCL_LOCAL_MEM_FENCE); +# endif +} + #endif /* __KERNEL_OPENCL__ */ /* Interpolated lookup table access */ diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h index 90747e09357..9e3373432ec 100644 --- a/intern/cycles/kernel/kernel_image_opencl.h +++ b/intern/cycles/kernel/kernel_image_opencl.h @@ -15,30 +15,42 @@ */ -/* For OpenCL all images are packed in a single array, and we do manual lookup - * and interpolation. */ +/* For OpenCL we do manual lookup and interpolation. 
*/ + +ccl_device_inline ccl_global tex_info_t* kernel_tex_info(KernelGlobals *kg, uint id) { + const uint tex_offset = id +#define KERNEL_TEX(type, ttype, name) + 1 +#include "kernel/kernel_textures.h" + ; + + return &((ccl_global tex_info_t*)kg->buffers[0])[tex_offset]; +} + +#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->buffer] + info->offset))[(index)] ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset) { + const ccl_global tex_info_t *info = kernel_tex_info(kg, id); const int texture_type = kernel_tex_type(id); + /* Float4 */ if(texture_type == IMAGE_DATA_TYPE_FLOAT4) { - return kernel_tex_fetch(__tex_image_float4_packed, offset); + return tex_fetch(float4, info, offset); } /* Byte4 */ else if(texture_type == IMAGE_DATA_TYPE_BYTE4) { - uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset); + uchar4 r = tex_fetch(uchar4, info, offset); float f = 1.0f/255.0f; return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); } /* Float */ else if(texture_type == IMAGE_DATA_TYPE_FLOAT) { - float f = kernel_tex_fetch(__tex_image_float_packed, offset); + float f = tex_fetch(float, info, offset); return make_float4(f, f, f, 1.0f); } /* Byte */ else { - uchar r = kernel_tex_fetch(__tex_image_byte_packed, offset); + uchar r = tex_fetch(uchar, info, offset); float f = r * (1.0f/255.0f); return make_float4(f, f, f, 1.0f); } @@ -64,17 +76,17 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix) return x - (float)i; } -ccl_device_inline uint kernel_decode_image_interpolation(uint4 info) +ccl_device_inline uint kernel_decode_image_interpolation(uint info) { - return (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; + return (info & (1 << 0)) ? 
INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; } -ccl_device_inline uint kernel_decode_image_extension(uint4 info) +ccl_device_inline uint kernel_decode_image_extension(uint info) { - if(info.w & (1 << 1)) { + if(info & (1 << 1)) { return EXTENSION_REPEAT; } - else if(info.w & (1 << 2)) { + else if(info & (1 << 2)) { return EXTENSION_EXTEND; } else { @@ -84,13 +96,16 @@ ccl_device_inline uint kernel_decode_image_extension(uint4 info) ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) { - uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2); - uint width = info.x; - uint height = info.y; - uint offset = info.z; + const ccl_global tex_info_t *info = kernel_tex_info(kg, id); + + uint width = info->width; + uint height = info->height; + uint offset = 0; + /* Decode image options. */ - uint interpolation = kernel_decode_image_interpolation(info); - uint extension = kernel_decode_image_extension(info); + uint interpolation = kernel_decode_image_interpolation(info->options); + uint extension = kernel_decode_image_extension(info->options); + /* Actual sampling. */ float4 r; int ix, iy, nix, niy; @@ -150,14 +165,17 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z) { - uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2); - uint width = info.x; - uint height = info.y; - uint offset = info.z; - uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x; + const ccl_global tex_info_t *info = kernel_tex_info(kg, id); + + uint width = info->width; + uint height = info->height; + uint offset = 0; + uint depth = info->depth; + /* Decode image options. 
*/ - uint interpolation = kernel_decode_image_interpolation(info); - uint extension = kernel_decode_image_extension(info); + uint interpolation = kernel_decode_image_interpolation(info->options); + uint extension = kernel_decode_image_extension(info->options); + /* Actual sampling. */ float4 r; int ix, iy, iz, nix, niy, niz; diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index 9baa9d54957..59db6cbd430 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -763,60 +763,252 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, /* Triangle Light */ -ccl_device void object_transform_light_sample(KernelGlobals *kg, LightSample *ls, int object, float time) +/* returns true if the triangle is has motion blur or an instancing transform applied */ +ccl_device_inline bool triangle_world_space_vertices(KernelGlobals *kg, int object, int prim, float time, float3 V[3]) { + bool has_motion = false; + const int object_flag = kernel_tex_fetch(__object_flag, object); + + if(object_flag & SD_OBJECT_HAS_VERTEX_MOTION && time >= 0.0f) { + motion_triangle_vertices(kg, object, prim, time, V); + has_motion = true; + } else { + triangle_vertices(kg, prim, V); + } + #ifdef __INSTANCING__ - /* instance transform */ - if(!(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED)) { + if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { # ifdef __OBJECT_MOTION__ - Transform itfm; - Transform tfm = object_fetch_transform_motion_test(kg, object, time, &itfm); + Transform tfm = object_fetch_transform_motion_test(kg, object, time, NULL); # else Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); # endif - - ls->P = transform_point(&tfm, ls->P); - ls->Ng = normalize(transform_direction(&tfm, ls->Ng)); + V[0] = transform_point(&tfm, V[0]); + V[1] = transform_point(&tfm, V[1]); + V[2] = transform_point(&tfm, V[2]); + has_motion = true; } #endif + return 
has_motion; } -ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object, - float randu, float randv, float time, LightSample *ls) +ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg, const float3 Ng, const float3 I, float t) { - float u, v; + float pdf = kernel_data.integrator.pdf_triangles; + float cos_pi = fabsf(dot(Ng, I)); - /* compute random point in triangle */ - randu = sqrtf(randu); + if(cos_pi == 0.0f) + return 0.0f; - u = 1.0f - randu; - v = randv*randu; + return t*t*pdf/cos_pi; +} - /* triangle, so get position, normal, shader */ - triangle_point_normal(kg, object, prim, u, v, &ls->P, &ls->Ng, &ls->shader); +ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *sd, float t) +{ + /* A naive heuristic to decide between costly solid angle sampling + * and simple area sampling, comparing the distance to the triangle plane + * to the length of the edges of the triangle. */ + + float3 V[3]; + bool has_motion = triangle_world_space_vertices(kg, sd->object, sd->prim, sd->time, V); + + const float3 e0 = V[1] - V[0]; + const float3 e1 = V[2] - V[0]; + const float3 e2 = V[2] - V[1]; + const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2))); + const float3 N = cross(e0, e1); + const float distance_to_plane = fabsf(dot(N, sd->I * t))/dot(N, N); + + if(longest_edge_squared > distance_to_plane*distance_to_plane) { + /* sd contains the point on the light source + * calculate Px, the point that we're shading */ + const float3 Px = sd->P + sd->I * t; + const float3 v0_p = V[0] - Px; + const float3 v1_p = V[1] - Px; + const float3 v2_p = V[2] - Px; + + const float3 u01 = safe_normalize(cross(v0_p, v1_p)); + const float3 u02 = safe_normalize(cross(v0_p, v2_p)); + const float3 u12 = safe_normalize(cross(v1_p, v2_p)); + + const float alpha = fast_acosf(dot(u02, u01)); + const float beta = fast_acosf(-dot(u01, u12)); + const float gamma = fast_acosf(dot(u02, u12)); + const float 
solid_angle = alpha + beta + gamma - M_PI_F; + + /* pdf_triangles is calculated over triangle area, but we're not sampling over its area */ + if(UNLIKELY(solid_angle == 0.0f)) { + return 0.0f; + } else { + float area = 1.0f; + if(has_motion) { + /* get the center frame vertices, this is what the PDF was calculated from */ + triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V); + area = triangle_area(V[0], V[1], V[2]); + } else { + area = 0.5f * len(N); + } + const float pdf = area * kernel_data.integrator.pdf_triangles; + return pdf / solid_angle; + } + } + else { + float pdf = triangle_light_pdf_area(kg, sd->Ng, sd->I, t); + if(has_motion) { + const float area = 0.5f * len(N); + if(UNLIKELY(area == 0.0f)) { + return 0.0f; + } + /* scale the PDF. + * area = the area the sample was taken from + * area_pre = the are from which pdf_triangles was calculated from */ + triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V); + const float area_pre = triangle_area(V[0], V[1], V[2]); + pdf = pdf * area_pre / area; + } + return pdf; + } +} + +ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg, int prim, int object, + float randu, float randv, float time, LightSample *ls, const float3 P) +{ + /* A naive heuristic to decide between costly solid angle sampling + * and simple area sampling, comparing the distance to the triangle plane + * to the length of the edges of the triangle. 
*/ + + float3 V[3]; + bool has_motion = triangle_world_space_vertices(kg, object, prim, time, V); + + const float3 e0 = V[1] - V[0]; + const float3 e1 = V[2] - V[0]; + const float3 e2 = V[2] - V[1]; + const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2))); + const float3 N0 = cross(e0, e1); + float Nl = 0.0f; + ls->Ng = safe_normalize_len(N0, &Nl); + float area = 0.5f * Nl; + + /* flip normal if necessary */ + const int object_flag = kernel_tex_fetch(__object_flag, object); + if(object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { + ls->Ng = -ls->Ng; + } + ls->eval_fac = 1.0f; + ls->shader = kernel_tex_fetch(__tri_shader, prim); ls->object = object; ls->prim = prim; ls->lamp = LAMP_NONE; ls->shader |= SHADER_USE_MIS; - ls->t = 0.0f; - ls->u = u; - ls->v = v; ls->type = LIGHT_TRIANGLE; - ls->eval_fac = 1.0f; - object_transform_light_sample(kg, ls, object, time); -} + float distance_to_plane = fabsf(dot(N0, V[0] - P)/dot(N0, N0)); + + if(longest_edge_squared > distance_to_plane*distance_to_plane) { + /* see James Arvo, "Stratified Sampling of Spherical Triangles" + * http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */ + + /* project the triangle to the unit sphere + * and calculate its edges and angles */ + const float3 v0_p = V[0] - P; + const float3 v1_p = V[1] - P; + const float3 v2_p = V[2] - P; + + const float3 u01 = safe_normalize(cross(v0_p, v1_p)); + const float3 u02 = safe_normalize(cross(v0_p, v2_p)); + const float3 u12 = safe_normalize(cross(v1_p, v2_p)); + + const float3 A = safe_normalize(v0_p); + const float3 B = safe_normalize(v1_p); + const float3 C = safe_normalize(v2_p); + + const float cos_alpha = dot(u02, u01); + const float cos_beta = -dot(u01, u12); + const float cos_gamma = dot(u02, u12); + + /* calculate dihedral angles */ + const float alpha = fast_acosf(cos_alpha); + const float beta = fast_acosf(cos_beta); + const float gamma = fast_acosf(cos_gamma); + /* the area of the unit spherical triangle = 
solid angle */ + const float solid_angle = alpha + beta + gamma - M_PI_F; + + /* precompute a few things + * these could be re-used to take several samples + * as they are independent of randu/randv */ + const float cos_c = dot(A, B); + const float sin_alpha = fast_sinf(alpha); + const float product = sin_alpha * cos_c; + + /* Select a random sub-area of the spherical triangle + * and calculate the third vertex C_ of that new triangle */ + const float phi = randu * solid_angle - alpha; + float s, t; + fast_sincosf(phi, &s, &t); + const float u = t - cos_alpha; + const float v = s + product; + + const float3 U = safe_normalize(C - dot(C, A) * A); + + float q = 1.0f; + const float det = ((v * s + u * t) * sin_alpha); + if(det != 0.0f) { + q = ((v * t - u * s) * cos_alpha - v) / det; + } + const float temp = max(1.0f - q*q, 0.0f); -ccl_device float triangle_light_pdf(KernelGlobals *kg, - const float3 Ng, const float3 I, float t) -{ - float pdf = kernel_data.integrator.pdf_triangles; - float cos_pi = fabsf(dot(Ng, I)); + const float3 C_ = safe_normalize(q * A + sqrtf(temp) * U); - if(cos_pi == 0.0f) - return 0.0f; - - return t*t*pdf/cos_pi; + /* Finally, select a random point along the edge of the new triangle + * That point on the spherical triangle is the sampled ray direction */ + const float z = 1.0f - randv * (1.0f - dot(C_, B)); + ls->D = z * B + safe_sqrtf(1.0f - z*z) * safe_normalize(C_ - dot(C_, B) * B); + + /* calculate intersection with the planar triangle */ + ray_triangle_intersect(P, ls->D, FLT_MAX, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + (ssef*)V, +#else + V[0], V[1], V[2], +#endif + &ls->u, &ls->v, &ls->t); + ls->P = P + ls->D * ls->t; + + /* pdf_triangles is calculated over triangle area, but we're sampling over solid angle */ + if(UNLIKELY(solid_angle == 0.0f)) { + ls->pdf = 0.0f; + } else { + if(has_motion) { + /* get the center frame vertices, this is what the PDF was calculated from */ + triangle_world_space_vertices(kg, object, 
prim, -1.0f, V); + area = triangle_area(V[0], V[1], V[2]); + } + const float pdf = area * kernel_data.integrator.pdf_triangles; + ls->pdf = pdf / solid_angle; + } + } + else { + /* compute random point in triangle */ + randu = sqrtf(randu); + + const float u = 1.0f - randu; + const float v = randv*randu; + const float t = 1.0f - u - v; + ls->P = u * V[0] + v * V[1] + t * V[2]; + /* compute incoming direction, distance and pdf */ + ls->D = normalize_len(ls->P - P, &ls->t); + ls->pdf = triangle_light_pdf_area(kg, ls->Ng, -ls->D, ls->t); + if(has_motion && area != 0.0f) { + /* scale the PDF. + * area = the area the sample was taken from + * area_pre = the are from which pdf_triangles was calculated from */ + triangle_world_space_vertices(kg, object, prim, -1.0f, V); + const float area_pre = triangle_area(V[0], V[1], V[2]); + ls->pdf = ls->pdf * area_pre / area; + } + ls->u = u; + ls->v = v; + } } /* Light Distribution */ @@ -876,10 +1068,7 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg, int object = __float_as_int(l.w); int shader_flag = __float_as_int(l.z); - triangle_light_sample(kg, prim, object, randu, randv, time, ls); - /* compute incoming direction, distance and pdf */ - ls->D = normalize_len(ls->P - P, &ls->t); - ls->pdf = triangle_light_pdf(kg, ls->Ng, -ls->D, ls->t); + triangle_light_sample(kg, prim, object, randu, randv, time, ls, P); ls->shader |= shader_flag; return (ls->pdf > 0.0f); } diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 9d52834ffcc..d454cce6e30 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -142,7 +142,7 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_glob ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, ShaderData *sd, - ccl_global PathState *state, + ccl_addr_space PathState *state, PathRadiance *L) { #ifdef __DENOISING_FEATURES__ @@ -194,6 +194,36 @@ ccl_device_inline void 
kernel_update_denoising_features(KernelGlobals *kg, #endif /* __DENOISING_FEATURES__ */ } +#ifdef __KERNEL_DEBUG__ +ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg, + ccl_global float *buffer, + PathRadiance *L, + int sample) +{ + int flag = kernel_data.film.pass_flag; + if(flag & PASS_BVH_TRAVERSED_NODES) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes, + sample, + L->debug_data.num_bvh_traversed_nodes); + } + if(flag & PASS_BVH_TRAVERSED_INSTANCES) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances, + sample, + L->debug_data.num_bvh_traversed_instances); + } + if(flag & PASS_BVH_INTERSECTIONS) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections, + sample, + L->debug_data.num_bvh_intersections); + } + if(flag & PASS_RAY_BOUNCES) { + kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces, + sample, + L->debug_data.num_ray_bounces); + } +} +#endif /* __KERNEL_DEBUG__ */ + ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput) { @@ -230,7 +260,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl } if(flag & PASS_NORMAL) { - float3 normal = sd->N; + float3 normal = shader_bsdf_average_normal(kg, sd); kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal); } if(flag & PASS_UV) { @@ -334,10 +364,12 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f } ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer, - int sample, PathRadiance *L, float alpha, bool is_shadow_catcher) + int sample, PathRadiance *L, bool is_shadow_catcher) { if(L) { float3 L_sum; + float alpha = 1.0f - L->transparent; + #ifdef __SHADOW_TRICKS__ if(is_shadow_catcher) { L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha); @@ 
-389,6 +421,11 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float * sample, L->denoising_depth); } #endif /* __DENOISING_FEATURES__ */ + + +#ifdef __KERNEL_DEBUG__ + kernel_write_debug_passes(kg, buffer, L, sample); +#endif } else { kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f)); diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index fc093ad8319..3319e2c2435 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -48,10 +48,6 @@ #include "kernel/kernel_path_volume.h" #include "kernel/kernel_path_subsurface.h" -#ifdef __KERNEL_DEBUG__ -# include "kernel/kernel_debug.h" -#endif - CCL_NAMESPACE_BEGIN ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, @@ -59,14 +55,13 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, ShaderData *emission_sd, PathRadiance *L, ccl_addr_space PathState *state, - RNG *rng, float3 throughput, float3 ao_alpha) { /* todo: solve correlation */ float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); float ao_factor = kernel_data.background.ao_factor; float3 ao_N; @@ -89,7 +84,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) { path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow); } else { @@ -100,10 +95,11 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, #ifndef __SPLIT_KERNEL__ +#if defined(__BRANCHED_PATH__) || defined(__BAKING__) + ccl_device void kernel_path_indirect(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, - RNG *rng, Ray *ray, float3 throughput, int num_samples, @@ -202,7 +198,6 @@ ccl_device void 
kernel_path_indirect(KernelGlobals *kg, /* direct light sampling */ kernel_branched_path_volume_connect_light(kg, - rng, sd, emission_sd, throughput, @@ -215,8 +210,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, /* indirect sample. if we use distance sampling and take just * one sample for direct and indirect light, we could share * this computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE); result = kernel_volume_decoupled_scatter(kg, state, @@ -235,7 +230,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, if(result == VOLUME_PATH_SCATTERED) { if(kernel_path_volume_bounce(kg, - rng, sd, &throughput, state, @@ -257,13 +251,12 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, { /* integrate along volume segment with distance sampling */ VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, sd, &volume_ray, L, &throughput, rng, heterogeneous); + kg, state, sd, &volume_ray, L, &throughput, heterogeneous); # ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* direct lighting */ kernel_path_volume_connect_light(kg, - rng, sd, emission_sd, throughput, @@ -272,7 +265,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, /* indirect light bounce */ if(kernel_path_volume_bounce(kg, - rng, sd, &throughput, state, @@ -311,15 +303,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, sd, &isect, ray); - float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF); - shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT); + float rbsdf = path_state_rng_1D_for_decision(kg, state, PRNG_BSDF); + shader_eval_surface(kg, sd, state, rbsdf, state->flag); #ifdef 
__BRANCHED_PATH__ shader_merge_closures(sd); #endif /* __BRANCHED_PATH__ */ #ifdef __SHADOW_TRICKS__ - if(!(sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { - state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY; + if(!(sd->object_flag & SD_OBJECT_SHADOW_CATCHER) && + (state->flag & PATH_RAY_SHADOW_CATCHER)) + { + /* Only update transparency after shadow catcher bounce. */ + L->shadow_transparency *= + average(shader_bsdf_transparency(kg, sd)); } #endif /* __SHADOW_TRICKS__ */ @@ -350,7 +346,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate */ float probability = - path_state_terminate_probability(kg, + path_state_continuation_probability(kg, state, throughput*num_samples); @@ -358,7 +354,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); + float terminate = path_state_rng_1D_for_decision(kg, state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -371,7 +367,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { - kernel_path_ao(kg, sd, emission_sd, L, state, rng, throughput, make_float3(0.0f, 0.0f, 0.0f)); + kernel_path_ao(kg, sd, emission_sd, L, state, throughput, make_float3(0.0f, 0.0f, 0.0f)); } #endif /* __AO__ */ @@ -387,11 +383,10 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, /* do bssrdf scatter step if we picked a bssrdf closure */ if(sc) { - uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb); + uint lcg_state = lcg_state_init(state, 0x68bc21eb); float bssrdf_u, bssrdf_v; path_state_rng_2D(kg, - rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); @@ -412,7 +407,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, 
int all = (kernel_data.integrator.sample_all_lights_indirect) || (state->flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light(kg, - rng, sd, emission_sd, state, @@ -423,23 +417,23 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } #endif /* defined(__EMISSION__) */ - if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray)) + if(!kernel_path_surface_bounce(kg, sd, &throughput, state, L, ray)) break; } } +#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */ -ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, - RNG *rng, - int sample, - Ray ray, - ccl_global float *buffer, - PathRadiance *L, - bool *is_shadow_catcher) +ccl_device_inline void kernel_path_integrate(KernelGlobals *kg, + uint rng_hash, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L, + bool *is_shadow_catcher) { /* initialize */ float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - float L_transparent = 0.0f; path_radiance_init(L, kernel_data.film.use_light_pass); @@ -449,12 +443,7 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, ShaderData emission_sd; PathState state; - path_state_init(kg, &emission_sd, &state, rng, sample, &ray); - -#ifdef __KERNEL_DEBUG__ - DebugData debug_data; - debug_data_init(&debug_data); -#endif /* __KERNEL_DEBUG__ */ + path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray); #ifdef __SUBSURFACE__ SubsurfaceIndirectRays ss_indirect; @@ -481,7 +470,7 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, } extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(rng, state.rng_offset, state.sample, 0x51633e2d); + lcg_state = lcg_state_init(&state, 0x51633e2d); } if(state.bounce > kernel_data.integrator.ao_bounces) { @@ -496,11 +485,11 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, #ifdef __KERNEL_DEBUG__ if(state.flag & PATH_RAY_CAMERA) { - debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes; - 
debug_data.num_bvh_traversed_instances += isect.num_traversed_instances; - debug_data.num_bvh_intersections += isect.num_intersections; + L->debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes; + L->debug_data.num_bvh_traversed_instances += isect.num_traversed_instances; + L->debug_data.num_bvh_intersections += isect.num_intersections; } - debug_data.num_ray_bounces++; + L->debug_data.num_ray_bounces++; #endif /* __KERNEL_DEBUG__ */ #ifdef __LAMP_MIS__ @@ -561,15 +550,15 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, int all = false; /* direct light sampling */ - kernel_branched_path_volume_connect_light(kg, rng, &sd, + kernel_branched_path_volume_connect_light(kg, &sd, &emission_sd, throughput, &state, L, all, &volume_ray, &volume_segment); /* indirect sample. if we use distance sampling and take just * one sample for direct and indirect light, we could share * this computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D_for_decision(kg, &state, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, &state, PRNG_SCATTER_DISTANCE); result = kernel_volume_decoupled_scatter(kg, &state, &volume_ray, &sd, &throughput, @@ -580,7 +569,7 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, kernel_volume_decoupled_free(kg, &volume_segment); if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) + if(kernel_path_volume_bounce(kg, &sd, &throughput, &state, L, &ray)) continue; else break; @@ -594,15 +583,15 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, { /* integrate along volume segment with distance sampling */ VolumeIntegrateResult result = kernel_volume_integrate( - kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous); + kg, 
&state, &sd, &volume_ray, L, &throughput, heterogeneous); # ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* direct lighting */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L); + kernel_path_volume_connect_light(kg, &sd, &emission_sd, throughput, &state, L); /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) + if(kernel_path_volume_bounce(kg, &sd, &throughput, &state, L, &ray)) continue; else break; @@ -615,7 +604,7 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, if(!hit) { /* eval background shader if nothing hit */ if(kernel_data.background.transparent && (state.flag & PATH_RAY_CAMERA)) { - L_transparent += average(throughput); + L->transparent += average(throughput); #ifdef __PASSES__ if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) @@ -637,21 +626,26 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, /* setup shading */ shader_setup_from_ray(kg, &sd, &isect, &ray); - float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF); - shader_eval_surface(kg, &sd, rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN); + float rbsdf = path_state_rng_1D_for_decision(kg, &state, PRNG_BSDF); + shader_eval_surface(kg, &sd, &state, rbsdf, state.flag); #ifdef __SHADOW_TRICKS__ if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO); - state.catcher_object = sd.object; + state.flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_STORE_SHADOW_INFO); if(!kernel_data.background.transparent) { - L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray); + L->shadow_background_color = + indirect_background(kg, &emission_sd, &state, &ray); } + L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L); + L->shadow_throughput = average(throughput); } } - else { - state.flag &= 
~PATH_RAY_SHADOW_CATCHER_ONLY; + else if(state.flag & PATH_RAY_SHADOW_CATCHER) { + /* Only update transparency after shadow catcher bounce. */ + L->shadow_transparency *= + average(shader_bsdf_transparency(kg, &sd)); } #endif /* __SHADOW_TRICKS__ */ @@ -670,7 +664,7 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, holdout_weight = shader_holdout_eval(kg, &sd); } /* any throughput is ok, should all be identical here */ - L_transparent += average(holdout_weight*throughput); + L->transparent += average(holdout_weight*throughput); } if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) { @@ -705,13 +699,13 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, /* path termination. this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_terminate_probability(kg, &state, throughput); + float probability = path_state_continuation_probability(kg, &state, throughput); if(probability == 0.0f) { break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D_for_decision(kg, &state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -723,7 +717,7 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd)); + kernel_path_ao(kg, &sd, &emission_sd, L, &state, throughput, shader_bsdf_alpha(kg, &sd)); } #endif /* __AO__ */ @@ -736,7 +730,6 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, &emission_sd, L, &state, - rng, &ray, &throughput, &ss_indirect)) @@ -747,10 +740,10 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, #endif /* 
__SUBSURFACE__ */ /* direct lighting */ - kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L); + kernel_path_surface_connect_light(kg, &sd, &emission_sd, throughput, &state, L); /* compute direct lighting and next bounce */ - if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) + if(!kernel_path_surface_bounce(kg, &sd, &throughput, &state, L, &ray)) break; } @@ -775,14 +768,8 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, #endif /* __SUBSURFACE__ */ #ifdef __SHADOW_TRICKS__ - *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER); + *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER) != 0; #endif /* __SHADOW_TRICKS__ */ - -#ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); -#endif /* __KERNEL_DEBUG__ */ - - return 1.0f - L_transparent; } ccl_device void kernel_path_trace(KernelGlobals *kg, @@ -797,24 +784,22 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, buffer += index*pass_stride; /* initialize random numbers and ray */ - RNG rng; + uint rng_hash; Ray ray; - kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); + kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, &ray); /* integrate */ PathRadiance L; bool is_shadow_catcher; if(ray.t != 0.0f) { - float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher); - kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher); + kernel_path_integrate(kg, rng_hash, sample, ray, buffer, &L, &is_shadow_catcher); + kernel_write_result(kg, buffer, sample, &L, is_shadow_catcher); } else { - kernel_write_result(kg, buffer, sample, NULL, 0.0f, false); + kernel_write_result(kg, buffer, sample, NULL, false); } - - path_rng_end(kg, rng_state, rng); } #endif /* __SPLIT_KERNEL__ */ diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index 10816d3e5d1..dde40674ee6 100644 --- 
a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -23,7 +23,6 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *emission_sd, PathRadiance *L, ccl_addr_space PathState *state, - RNG *rng, float3 throughput) { int num_samples = kernel_data.integrator.ao_samples; @@ -35,7 +34,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, for(int j = 0; j < num_samples; j++) { float bsdf_u, bsdf_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); float3 ao_D; float ao_pdf; @@ -55,7 +54,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) { path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow); } else { @@ -69,7 +68,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, /* bounce off surface and integrate indirect light */ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, - RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, + ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L) { float sum_sample_weight = 0.0f; @@ -113,15 +112,18 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba num_samples = ceil_to_int(num_samples_adjust*num_samples); float num_samples_inv = num_samples_adjust/num_samples; - RNG bsdf_rng = cmj_hash(*rng, i); for(int j = 0; j < num_samples; j++) { PathState ps = *state; float3 tp = throughput; Ray bsdf_ray; +#ifdef __SHADOW_TRICKS__ + float shadow_transparency = 
L->shadow_transparency; +#endif + + ps.rng_hash = cmj_hash(state->rng_hash, i); if(!kernel_branched_path_surface_bounce(kg, - &bsdf_rng, sd, sc, j, @@ -135,10 +137,11 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba continue; } + ps.rng_hash = state->rng_hash; + kernel_path_indirect(kg, indirect_sd, emission_sd, - rng, &bsdf_ray, tp*num_samples_inv, num_samples, @@ -149,6 +152,10 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba * for the next samples */ path_radiance_sum_indirect(L); path_radiance_reset_indirect(L); + +#ifdef __SHADOW_TRICKS__ + L->shadow_transparency = shadow_transparency; +#endif } } } @@ -160,7 +167,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, ShaderData *emission_sd, PathRadiance *L, PathState *state, - RNG *rng, Ray *ray, float3 throughput) { @@ -171,17 +177,17 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, continue; /* set up random number generator */ - uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb); + uint lcg_state = lcg_state_init(state, 0x68bc21eb); int num_samples = kernel_data.integrator.subsurface_samples; float num_samples_inv = 1.0f/num_samples; - RNG bssrdf_rng = cmj_hash(*rng, i); + uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i); /* do subsurface scatter step with copy of shader data, this will * replace the BSSRDF with a diffuse BSDF closure */ for(int j = 0; j < num_samples; j++) { SubsurfaceIntersection ss_isect; float bssrdf_u, bssrdf_v; - path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); + path_branched_rng_2D(kg, bssrdf_rng_hash, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); int num_hits = subsurface_scatter_multi_intersect(kg, &ss_isect, sd, @@ -234,7 +240,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, (state->flag & PATH_RAY_SHADOW_CATCHER); 
kernel_branched_path_surface_connect_light( kg, - rng, &bssrdf_sd, emission_sd, &hit_state, @@ -248,7 +253,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, /* indirect light */ kernel_branched_path_surface_indirect_light( kg, - rng, &bssrdf_sd, indirect_sd, emission_sd, @@ -262,17 +266,16 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ -ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, - RNG *rng, - int sample, - Ray ray, - ccl_global float *buffer, - PathRadiance *L, - bool *is_shadow_catcher) +ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, + uint rng_hash, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L, + bool *is_shadow_catcher) { /* initialize */ float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - float L_transparent = 0.0f; path_radiance_init(L, kernel_data.film.use_light_pass); @@ -282,12 +285,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, ShaderData emission_sd, indirect_sd; PathState state; - path_state_init(kg, &emission_sd, &state, rng, sample, &ray); - -#ifdef __KERNEL_DEBUG__ - DebugData debug_data; - debug_data_init(&debug_data); -#endif /* __KERNEL_DEBUG__ */ + path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray); /* Main Loop * Here we only handle transparency intersections from the camera ray. 
@@ -310,7 +308,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, } extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(rng, state.rng_offset, state.sample, 0x51633e2d); + lcg_state = lcg_state_init(&state, 0x51633e2d); } bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax); @@ -319,10 +317,10 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, #endif /* __HAIR__ */ #ifdef __KERNEL_DEBUG__ - debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes; - debug_data.num_bvh_traversed_instances += isect.num_traversed_instances; - debug_data.num_bvh_intersections += isect.num_intersections; - debug_data.num_ray_bounces++; + L->debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes; + L->debug_data.num_bvh_traversed_instances += isect.num_traversed_instances; + L->debug_data.num_bvh_intersections += isect.num_intersections; + L->debug_data.num_ray_bounces++; #endif /* __KERNEL_DEBUG__ */ #ifdef __VOLUME__ @@ -353,7 +351,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, int all = kernel_data.integrator.sample_all_lights_direct; - kernel_branched_path_volume_connect_light(kg, rng, &sd, + kernel_branched_path_volume_connect_light(kg, &sd, &emission_sd, throughput, &state, L, all, &volume_ray, &volume_segment); @@ -372,8 +370,8 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, /* scatter sample. 
if we use distance sampling and take just one * sample for direct and indirect light, we could share this * computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D_for_decision(kg, &ps, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, &ps, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, &ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false); @@ -382,7 +380,6 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, kernel_assert(result == VOLUME_PATH_SCATTERED); if(kernel_path_volume_bounce(kg, - rng, &sd, &tp, &ps, @@ -392,7 +389,6 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, kernel_path_indirect(kg, &indirect_sd, &emission_sd, - rng, &pray, tp*num_samples_inv, num_samples, @@ -431,16 +427,15 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, path_state_branch(&ps, j, num_samples); VolumeIntegrateResult result = kernel_volume_integrate( - kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous); + kg, &ps, &sd, &volume_ray, L, &tp, heterogeneous); #ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* todo: support equiangular, MIS and all light sampling. 
* alternatively get decoupled ray marching working on the GPU */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L); + kernel_path_volume_connect_light(kg, &sd, &emission_sd, tp, &state, L); if(kernel_path_volume_bounce(kg, - rng, &sd, &tp, &ps, @@ -450,7 +445,6 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, kernel_path_indirect(kg, &indirect_sd, &emission_sd, - rng, &pray, tp, num_samples, @@ -475,7 +469,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, if(!hit) { /* eval background shader if nothing hit */ if(kernel_data.background.transparent) { - L_transparent += average(throughput); + L->transparent += average(throughput); #ifdef __PASSES__ if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) @@ -494,21 +488,24 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, /* setup shading */ shader_setup_from_ray(kg, &sd, &isect, &ray); - shader_eval_surface(kg, &sd, rng, &state, 0.0f, state.flag, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, &sd, &state, 0.0f, state.flag); shader_merge_closures(&sd); #ifdef __SHADOW_TRICKS__ if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { - if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO); - state.catcher_object = sd.object; - if(!kernel_data.background.transparent) { - L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray); - } + state.flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_STORE_SHADOW_INFO); + if(!kernel_data.background.transparent) { + L->shadow_background_color = + indirect_background(kg, &emission_sd, &state, &ray); } + L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L); + L->shadow_throughput = average(throughput); } - else { - state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY; + else if(state.flag & PATH_RAY_SHADOW_CATCHER) { + /* Only update transparency after shadow catcher bounce. 
*/ + L->shadow_transparency *= + average(shader_bsdf_transparency(kg, &sd)); } #endif /* __SHADOW_TRICKS__ */ @@ -524,7 +521,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, holdout_weight = shader_holdout_eval(kg, &sd); } /* any throughput is ok, should all be identical here */ - L_transparent += average(holdout_weight*throughput); + L->transparent += average(holdout_weight*throughput); } if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) { break; @@ -548,13 +545,13 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, /* path termination. this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_terminate_probability(kg, &state, throughput); + float probability = path_state_continuation_probability(kg, &state, throughput); if(probability == 0.0f) { break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D_for_decision(kg, &state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -568,7 +565,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput); + kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, throughput); } #endif /* __AO__ */ @@ -576,7 +573,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, /* bssrdf scatter to a different location on the same object */ if(sd.flag & SD_BSSRDF) { kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd, - L, &state, rng, &ray, throughput); + L, &state, &ray, throughput); } #endif /* __SUBSURFACE__ */ @@ -588,13 +585,13 @@ ccl_device float 
kernel_branched_path_integrate(KernelGlobals *kg, if(kernel_data.integrator.use_direct_light) { int all = (kernel_data.integrator.sample_all_lights_direct) || (state.flag & PATH_RAY_SHADOW_CATCHER); - kernel_branched_path_surface_connect_light(kg, rng, + kernel_branched_path_surface_connect_light(kg, &sd, &emission_sd, &hit_state, throughput, 1.0f, L, all); } #endif /* __EMISSION__ */ /* indirect light */ - kernel_branched_path_surface_indirect_light(kg, rng, + kernel_branched_path_surface_indirect_light(kg, &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L); /* continue in case of transparency */ @@ -625,14 +622,8 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, } #ifdef __SHADOW_TRICKS__ - *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER); + *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER) != 0; #endif /* __SHADOW_TRICKS__ */ - -#ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); -#endif /* __KERNEL_DEBUG__ */ - - return 1.0f - L_transparent; } ccl_device void kernel_branched_path_trace(KernelGlobals *kg, @@ -647,24 +638,22 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg, buffer += index*pass_stride; /* initialize random numbers and ray */ - RNG rng; + uint rng_hash; Ray ray; - kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); + kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, &ray); /* integrate */ PathRadiance L; bool is_shadow_catcher; if(ray.t != 0.0f) { - float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher); - kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher); + kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L, &is_shadow_catcher); + kernel_write_result(kg, buffer, sample, &L, is_shadow_catcher); } else { - kernel_write_result(kg, buffer, sample, NULL, 0.0f, false); + kernel_write_result(kg, buffer, sample, NULL, false); } - - 
path_rng_end(kg, rng_state, rng); } #endif /* __SPLIT_KERNEL__ */ diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h index 82f83deb595..54dd278a185 100644 --- a/intern/cycles/kernel/kernel_path_common.h +++ b/intern/cycles/kernel/kernel_path_common.h @@ -22,7 +22,7 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int x, int y, - RNG *rng, + uint *rng_hash, ccl_addr_space Ray *ray) { float filter_u; @@ -34,20 +34,20 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, *rng_state = hash_int_2d(x, y); } - path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v); + path_rng_init(kg, rng_state, sample, num_samples, rng_hash, x, y, &filter_u, &filter_v); /* sample camera ray */ float lens_u = 0.0f, lens_v = 0.0f; if(kernel_data.cam.aperturesize > 0.0f) - path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); + path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); float time = 0.0f; #ifdef __CAMERA_MOTION__ if(kernel_data.cam.shuttertime != -1.0f) - time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME); + time = path_rng_1D(kg, *rng_hash, sample, num_samples, PRNG_TIME); #endif camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray); diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index 0fa77d9e8bd..b539224db31 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -19,12 +19,13 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void path_state_init(KernelGlobals *kg, ShaderData *stack_sd, ccl_addr_space PathState *state, - RNG *rng, + uint rng_hash, int sample, ccl_addr_space Ray *ray) { state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP; + state->rng_hash = rng_hash; state->rng_offset = PRNG_BASE_NUM; state->sample = sample; state->num_samples = 
kernel_data.integrator.aa_samples; @@ -58,16 +59,12 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, /* Initialize volume stack with volume we are inside of. */ kernel_volume_stack_init(kg, stack_sd, state, ray, state->volume_stack); /* Seed RNG for cases where we can't use stratified samples .*/ - state->rng_congruential = lcg_init(*rng + sample*0x51633e2d); + state->rng_congruential = lcg_init(rng_hash + sample*0x51633e2d); } else { state->volume_stack[0].shader = SHADER_NONE; } #endif - -#ifdef __SHADOW_TRICKS__ - state->catcher_object = OBJECT_NONE; -#endif } ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathState *state, int label) @@ -139,9 +136,11 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta /* random number generator next bounce */ state->rng_offset += PRNG_BOUNCE_NUM; +#ifdef __DENOISING_FEATURES__ if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) { state->flag &= ~PATH_RAY_STORE_SHADOW_INFO; } +#endif } ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state) @@ -158,17 +157,26 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s return flag; } -ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput) +ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput) { if(state->flag & PATH_RAY_TRANSPARENT) { - /* transparent rays treated separately */ - if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) + /* Transparent rays are treated separately with own max bounces. */ + if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) { return 0.0f; - else if(state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce) + } + /* Do at least one bounce without RR. 
*/ + else if(state->transparent_bounce <= 1) { return 1.0f; + } +#ifdef __SHADOW_TRICKS__ + /* Exception for shadow catcher not working correctly with RR. */ + else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) { + return 1.0f; + } +#endif } else { - /* other rays */ + /* Test max bounces for various ray types. */ if((state->bounce >= kernel_data.integrator.max_bounce) || (state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) || (state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) || @@ -179,13 +187,21 @@ ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_ { return 0.0f; } - else if(state->bounce <= kernel_data.integrator.min_bounce) { + /* Do at least one bounce without RR. */ + else if(state->bounce <= 1) { return 1.0f; } +#ifdef __SHADOW_TRICKS__ + /* Exception for shadow catcher not working correctly with RR. */ + else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) { + return 1.0f; + } +#endif } - /* probalistic termination */ - return average(throughput); /* todo: try using max here */ + /* Probalistic termination: use sqrt() to roughly match typical view + * transform and do path termination a bit later on average. 
*/ + return sqrtf(max3(fabs(throughput))); } /* TODO(DingTo): Find more meaningful name for this */ diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h index 10b568ac3dd..9bccc9201e0 100644 --- a/intern/cycles/kernel/kernel_path_subsurface.h +++ b/intern/cycles/kernel/kernel_path_subsurface.h @@ -28,7 +28,6 @@ bool kernel_path_subsurface_scatter( ShaderData *emission_sd, PathRadiance *L, ccl_addr_space PathState *state, - RNG *rng, ccl_addr_space Ray *ray, ccl_addr_space float3 *throughput, ccl_addr_space SubsurfaceIndirectRays *ss_indirect) @@ -47,11 +46,11 @@ bool kernel_path_subsurface_scatter( */ kernel_assert(!ss_indirect->tracing); - uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb); + uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb); SubsurfaceIntersection ss_isect; float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); + path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); int num_hits = subsurface_scatter_multi_intersect(kg, &ss_isect, sd, @@ -94,10 +93,9 @@ bool kernel_path_subsurface_scatter( hit_L->direct_throughput = L->direct_throughput; path_radiance_copy_indirect(hit_L, L); - kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L); + kernel_path_surface_connect_light(kg, sd, emission_sd, *hit_tp, state, hit_L); if(kernel_path_surface_bounce(kg, - rng, sd, hit_tp, hit_state, diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index dcb577e176f..6c3a444e48a 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -20,7 +20,6 @@ CCL_NAMESPACE_BEGIN /* branched path tracing: connect path directly to position on one or more lights and add it to L */ ccl_device_noinline void kernel_branched_path_surface_connect_light( KernelGlobals *kg, - RNG *rng, ShaderData *sd, ShaderData 
*emission_sd, ccl_addr_space PathState *state, @@ -50,12 +49,12 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i)); float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights); - RNG lamp_rng = cmj_hash(*rng, i); + uint lamp_rng_hash = cmj_hash(state->rng_hash, i); for(int j = 0; j < num_samples; j++) { float light_u, light_v; - path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples); + path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_branched_rng_light_termination(kg, lamp_rng_hash, state, j, num_samples); LightSample ls; if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) { @@ -68,7 +67,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } @@ -86,10 +85,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( float num_samples_inv = num_samples_adjust/num_samples; for(int j = 0; j < num_samples; j++) { - float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT); + float light_t = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT); float light_u, light_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples); + path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + 
float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples); /* only sample triangle lights */ if(kernel_data.integrator.num_all_lights) @@ -105,7 +104,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } @@ -119,10 +118,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( } else { /* sample one light at random */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_t = path_state_rng_1D(kg, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_state_rng_light_termination(kg, rng, state); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_state_rng_light_termination(kg, state); LightSample ls; if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { @@ -131,7 +130,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp); } @@ -147,7 +146,6 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( /* branched path tracing: bounce off or through surface to with new direction stored in ray */ ccl_device bool kernel_branched_path_surface_bounce( KernelGlobals *kg, - RNG *rng, ShaderData *sd, const ShaderClosure *sc, int sample, @@ -164,7 +162,7 @@ ccl_device bool 
kernel_branched_path_surface_bounce( float3 bsdf_omega_in; differential3 bsdf_domega_in; float bsdf_u, bsdf_v; - path_branched_rng_2D(kg, rng, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_branched_rng_2D(kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); int label; label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, @@ -217,7 +215,7 @@ ccl_device bool kernel_branched_path_surface_bounce( #endif /* path tracing: connect path directly to position on a light and add it to L */ -ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng, +ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L) { @@ -228,7 +226,6 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG #ifdef __SHADOW_TRICKS__ if(state->flag & PATH_RAY_SHADOW_CATCHER) { kernel_branched_path_surface_connect_light(kg, - rng, sd, emission_sd, state, @@ -241,9 +238,9 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG #endif /* sample illumination from lights to find path contribution */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_t = path_state_rng_1D(kg, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); Ray light_ray; BsdfEval L_light; @@ -255,12 +252,12 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG LightSample ls; if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - float terminate = path_state_rng_light_termination(kg, rng, state); + float terminate = path_state_rng_light_termination(kg, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, 
terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } @@ -274,7 +271,6 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG /* path tracing: bounce off or through surface to with new direction stored in ray */ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, - RNG *rng, ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, @@ -289,7 +285,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, float3 bsdf_omega_in; differential3 bsdf_domega_in; float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); int label; label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval, diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index dcedf51e479..c9c7f447c42 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -20,7 +20,6 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_path_volume_connect_light( KernelGlobals *kg, - RNG *rng, ShaderData *sd, ShaderData *emission_sd, float3 throughput, @@ -32,9 +31,9 @@ ccl_device_inline void kernel_path_volume_connect_light( return; /* sample illumination from lights to find path contribution */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_t = path_state_rng_1D(kg, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); Ray light_ray; BsdfEval L_light; @@ -48,12 +47,12 @@ ccl_device_inline void kernel_path_volume_connect_light( if(light_sample(kg, light_t, light_u, 
light_v, sd->time, sd->P, state->bounce, &ls)) { - float terminate = path_state_rng_light_termination(kg, rng, state); + float terminate = path_state_rng_light_termination(kg, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } @@ -69,7 +68,6 @@ ccl_device #endif bool kernel_path_volume_bounce( KernelGlobals *kg, - RNG *rng, ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, @@ -82,7 +80,7 @@ bool kernel_path_volume_bounce( float3 phase_omega_in; differential3 phase_domega_in; float phase_u, phase_v; - path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v); + path_state_rng_2D(kg, state, PRNG_PHASE_U, &phase_u, &phase_v); int label; label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval, @@ -120,7 +118,6 @@ bool kernel_path_volume_bounce( #ifndef __SPLIT_KERNEL__ ccl_device void kernel_branched_path_volume_connect_light( KernelGlobals *kg, - RNG *rng, ShaderData *sd, ShaderData *emission_sd, float3 throughput, @@ -150,12 +147,12 @@ ccl_device void kernel_branched_path_volume_connect_light( int num_samples = light_select_num_samples(kg, i); float num_samples_inv = 1.0f/(num_samples*kernel_data.integrator.num_all_lights); - RNG lamp_rng = cmj_hash(*rng, i); + uint lamp_rng_hash = cmj_hash(state->rng_hash, i); for(int j = 0; j < num_samples; j++) { /* sample random position on given light */ float light_u, light_v; - path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); LightSample ls; lamp_light_sample(kg, i, light_u, light_v, ray->P, 
&ls); @@ -163,8 +160,8 @@ ccl_device void kernel_branched_path_volume_connect_light( float3 tp = throughput; /* sample position on volume segment */ - float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE); - float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE); + float rphase = path_branched_rng_1D_for_decision(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE); + float rscatter = path_branched_rng_1D_for_decision(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); @@ -177,12 +174,12 @@ ccl_device void kernel_branched_path_volume_connect_light( if(kernel_data.integrator.pdf_triangles != 0.0f) ls.pdf *= 2.0f; - float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples); + float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } @@ -198,9 +195,9 @@ ccl_device void kernel_branched_path_volume_connect_light( for(int j = 0; j < num_samples; j++) { /* sample random position on random triangle */ - float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_LIGHT); + float light_t = path_branched_rng_1D_for_decision(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT); float light_u, light_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + path_branched_rng_2D(kg, 
state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); /* only sample triangle lights */ if(kernel_data.integrator.num_all_lights) @@ -212,8 +209,8 @@ ccl_device void kernel_branched_path_volume_connect_light( float3 tp = throughput; /* sample position on volume segment */ - float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE); - float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE); + float rphase = path_branched_rng_1D_for_decision(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE); + float rscatter = path_branched_rng_1D_for_decision(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); @@ -226,12 +223,12 @@ ccl_device void kernel_branched_path_volume_connect_light( if(kernel_data.integrator.num_all_lights) ls.pdf *= 2.0f; - float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples); + float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } @@ -242,9 +239,9 @@ ccl_device void kernel_branched_path_volume_connect_light( } else { /* sample random position on random light */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_t = path_state_rng_1D(kg, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + path_state_rng_2D(kg, state, 
PRNG_LIGHT_U, &light_u, &light_v); LightSample ls; light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls); @@ -252,8 +249,8 @@ ccl_device void kernel_branched_path_volume_connect_light( float3 tp = throughput; /* sample position on volume segment */ - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); @@ -264,12 +261,12 @@ ccl_device void kernel_branched_path_volume_connect_light( /* todo: split up light_sample so we don't have to call it again with new position */ if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* sample random light */ - float terminate = path_state_rng_light_termination(kg, rng, state); + float terminate = path_state_rng_light_termination(kg, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp); } diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h index 96bc636d5ac..e32d4bbbc1b 100644 --- a/intern/cycles/kernel/kernel_queues.h +++ b/intern/cycles/kernel/kernel_queues.h @@ -128,6 +128,21 @@ ccl_device unsigned int get_global_queue_index( return my_gqidx; } +ccl_device int dequeue_ray_index( + int queue_number, + ccl_global int *queues, + int queue_size, + ccl_global int *queue_index) +{ + int index = 
atomic_fetch_and_dec_uint32((ccl_global uint*)&queue_index[queue_number])-1; + + if(index < 0) { + return QUEUE_EMPTY_SLOT; + } + + return queues[index + queue_number * queue_size]; +} + CCL_NAMESPACE_END #endif // __KERNEL_QUEUE_H__ diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index e8a912ccc0b..221d92f5de1 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -18,6 +18,16 @@ CCL_NAMESPACE_BEGIN +/* Pseudo random numbers, uncomment this for debugging correlations. Only run + * this single threaded on a CPU for repeatable resutls. */ +//#define __DEBUG_CORRELATION__ + + +/* High Dimensional Sobol. + * + * Multidimensional sobol with generator matrices. Dimension 0 and 1 are equal + * to classic Van der Corput and Sobol sequences. */ + #ifdef __SOBOL__ /* Skip initial numbers that are not as well distributed, especially the @@ -26,47 +36,6 @@ CCL_NAMESPACE_BEGIN */ #define SOBOL_SKIP 64 -/* High Dimensional Sobol. */ - -/* Van der Corput radical inverse. */ -ccl_device uint van_der_corput(uint bits) -{ - bits = (bits << 16) | (bits >> 16); - bits = ((bits & 0x00ff00ff) << 8) | ((bits & 0xff00ff00) >> 8); - bits = ((bits & 0x0f0f0f0f) << 4) | ((bits & 0xf0f0f0f0) >> 4); - bits = ((bits & 0x33333333) << 2) | ((bits & 0xcccccccc) >> 2); - bits = ((bits & 0x55555555) << 1) | ((bits & 0xaaaaaaaa) >> 1); - return bits; -} - -/* Sobol radical inverse. */ -ccl_device uint sobol(uint i) -{ - uint r = 0; - for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1) { - if(i & 1) { - r ^= v; - } - } - return r; -} - -/* Inverse of sobol radical inverse. */ -ccl_device uint sobol_inverse(uint i) -{ - const uint msb = 1U << 31; - uint r = 0; - for(uint v = 1; i; i <<= 1, v ^= v << 1) { - if(i & msb) { - r ^= v; - } - } - return r; -} - -/* Multidimensional sobol with generator matrices - * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively. 
- */ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension) { uint result = 0; @@ -79,50 +48,31 @@ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension) return result; } -/* Lookup index and x/y coordinate, assumes m is a power of two. */ -ccl_device uint sobol_lookup(const uint m, - const uint frame, - const uint ex, - const uint ey, - uint *x, uint *y) -{ - /* Shift is constant per frame. */ - const uint shift = frame << (m << 1); - const uint sobol_shift = sobol(shift); - /* Van der Corput is its own inverse. */ - const uint lower = van_der_corput(ex << (32 - m)); - /* Need to compensate for ey difference and shift. */ - const uint sobol_lower = sobol(lower); - const uint mask = ~-(1 << m) << (32 - m); /* Only m upper bits. */ - const uint delta = ((ey << (32 - m)) ^ sobol_lower ^ sobol_shift) & mask; - /* Only use m upper bits for the index (m is a power of two). */ - const uint sobol_result = delta | (delta >> m); - const uint upper = sobol_inverse(sobol_result); - const uint index = shift | upper | lower; - *x = van_der_corput(index); - *y = sobol_shift ^ sobol_result ^ sobol_lower; - return index; -} +#endif /* __SOBOL__ */ + ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, - RNG *rng, + uint rng_hash, int sample, int num_samples, int dimension) { +#ifdef __DEBUG_CORRELATION__ + return (float)drand48(); +#endif + #ifdef __CMJ__ - if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { +# ifdef __SOBOL__ + if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) +# endif + { /* Correlated multi-jitter. */ - int p = *rng + dimension; + int p = rng_hash + dimension; return cmj_sample_1D(sample, num_samples, p); } #endif -#ifdef __SOBOL_FULL_SCREEN__ - uint result = sobol_dimension(kg, *rng, dimension); - float r = (float)result * (1.0f/(float)0xFFFFFFFF); - return r; -#else - /* Compute sobol sequence value using direction vectors. 
*/ +#ifdef __SOBOL__ + /* Sobol sequence value using direction vectors. */ uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension); float r = (float)result * (1.0f/(float)0xFFFFFFFF); @@ -132,7 +82,7 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, /* Hash rng with dimension to solve correlation issues. * See T38710, T50116. */ - RNG tmp_rng = cmj_hash_simple(dimension, *rng); + uint tmp_rng = cmj_hash_simple(dimension, rng_hash); shift = tmp_rng * (1.0f/(float)0xFFFFFFFF); return r + shift - floorf(r + shift); @@ -140,128 +90,60 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, } ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, - RNG *rng, + uint rng_hash, int sample, int num_samples, int dimension, float *fx, float *fy) { +#ifdef __DEBUG_CORRELATION__ + *fx = (float)drand48(); + *fy = (float)drand48(); + return; +#endif + #ifdef __CMJ__ - if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { +# ifdef __SOBOL__ + if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) +# endif + { /* Correlated multi-jitter. */ - int p = *rng + dimension; + int p = rng_hash + dimension; cmj_sample_2D(sample, num_samples, p, fx, fy); + return; } - else #endif - { - /* Sobol. */ - *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); - *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); - } + +#ifdef __SOBOL__ + /* Sobol. 
*/ + *fx = path_rng_1D(kg, rng_hash, sample, num_samples, dimension); + *fy = path_rng_1D(kg, rng_hash, sample, num_samples, dimension + 1); +#endif } ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, - RNG *rng, + uint *rng_hash, int x, int y, float *fx, float *fy) { -#ifdef __SOBOL_FULL_SCREEN__ - uint px, py; - uint bits = 16; /* limits us to 65536x65536 and 65536 samples */ - uint size = 1 << bits; - uint frame = sample; - - *rng = sobol_lookup(bits, frame, x, y, &px, &py); - - *rng ^= kernel_data.integrator.seed; - - if(sample == 0) { - *fx = 0.5f; - *fy = 0.5f; - } - else { - *fx = size * (float)px * (1.0f/(float)0xFFFFFFFF) - x; - *fy = size * (float)py * (1.0f/(float)0xFFFFFFFF) - y; - } -#else - *rng = *rng_state; - - *rng ^= kernel_data.integrator.seed; - - if(sample == 0) { - *fx = 0.5f; - *fy = 0.5f; - } - else { - path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy); - } -#endif -} - -ccl_device void path_rng_end(KernelGlobals *kg, - ccl_global uint *rng_state, - RNG rng) -{ - /* nothing to do */ -} - -#else /* __SOBOL__ */ - -/* Linear Congruential Generator */ - -ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, - RNG *rng, - int sample, int num_samples, - int dimension) -{ - /* implicit mod 2^32 */ - *rng = (1103515245*(*rng) + 12345); - return (float)*rng * (1.0f/(float)0xFFFFFFFF); -} - -ccl_device_inline void path_rng_2D(KernelGlobals *kg, - RNG *rng, - int sample, int num_samples, - int dimension, - float *fx, float *fy) -{ - *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); - *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); -} - -ccl_device void path_rng_init(KernelGlobals *kg, - ccl_global uint *rng_state, - int sample, int num_samples, - RNG *rng, - int x, int y, - float *fx, float *fy) -{ /* load state */ - *rng = *rng_state; + *rng_hash = *rng_state; + *rng_hash ^= kernel_data.integrator.seed; - *rng ^= kernel_data.integrator.seed; 
+#ifdef __DEBUG_CORRELATION__ + srand48(*rng_hash + sample); +#endif if(sample == 0) { *fx = 0.5f; *fy = 0.5f; } else { - path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy); + path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_FILTER_U, fx, fy); } } -ccl_device void path_rng_end(KernelGlobals *kg, - ccl_global uint *rng_state, - RNG rng) -{ - /* store state for next sample */ - *rng_state = rng; -} - -#endif /* __SOBOL__ */ - /* Linear Congruential Generator */ ccl_device uint lcg_step_uint(uint *rng) @@ -295,19 +177,17 @@ ccl_device uint lcg_init(uint seed) */ ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, - RNG *rng, const ccl_addr_space PathState *state, int dimension) { return path_rng_1D(kg, - rng, + state->rng_hash, state->sample, state->num_samples, state->rng_offset + dimension); } ccl_device_inline float path_state_rng_1D_for_decision( KernelGlobals *kg, - RNG *rng, const ccl_addr_space PathState *state, int dimension) { @@ -320,19 +200,18 @@ ccl_device_inline float path_state_rng_1D_for_decision( * the same decision. 
*/ const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM; return path_rng_1D(kg, - rng, + state->rng_hash, state->sample, state->num_samples, rng_offset + dimension); } ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, - RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy) { path_rng_2D(kg, - rng, + state->rng_hash, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy); @@ -340,14 +219,14 @@ ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_device_inline float path_branched_rng_1D( KernelGlobals *kg, - RNG *rng, + uint rng_hash, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension) { return path_rng_1D(kg, - rng, + rng_hash, state->sample * num_branches + branch, state->num_samples * num_branches, state->rng_offset + dimension); @@ -355,7 +234,7 @@ ccl_device_inline float path_branched_rng_1D( ccl_device_inline float path_branched_rng_1D_for_decision( KernelGlobals *kg, - RNG *rng, + uint rng_hash, const ccl_addr_space PathState *state, int branch, int num_branches, @@ -363,7 +242,7 @@ ccl_device_inline float path_branched_rng_1D_for_decision( { const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM; return path_rng_1D(kg, - rng, + rng_hash, state->sample * num_branches + branch, state->num_samples * num_branches, rng_offset + dimension); @@ -371,7 +250,7 @@ ccl_device_inline float path_branched_rng_1D_for_decision( ccl_device_inline void path_branched_rng_2D( KernelGlobals *kg, - RNG *rng, + uint rng_hash, const ccl_addr_space PathState *state, int branch, int num_branches, @@ -379,7 +258,7 @@ ccl_device_inline void path_branched_rng_2D( float *fx, float *fy) { path_rng_2D(kg, - rng, + rng_hash, state->sample * num_branches + branch, state->num_samples * num_branches, state->rng_offset + dimension, @@ -391,25 +270,24 @@ ccl_device_inline void path_branched_rng_2D( */ ccl_device_inline float 
path_state_rng_light_termination( KernelGlobals *kg, - RNG *rng, const ccl_addr_space PathState *state) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE); + return path_state_rng_1D_for_decision(kg, state, PRNG_LIGHT_TERMINATE); } return 0.0f; } ccl_device_inline float path_branched_rng_light_termination( KernelGlobals *kg, - RNG *rng, + uint rng_hash, const ccl_addr_space PathState *state, int branch, int num_branches) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { return path_branched_rng_1D_for_decision(kg, - rng, + rng_hash, state, branch, num_branches, @@ -429,14 +307,19 @@ ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, state->num_samples = state->num_samples*num_branches; } -ccl_device_inline uint lcg_state_init(RNG *rng, - int rng_offset, - int sample, +ccl_device_inline uint lcg_state_init(PathState *state, uint scramble) { - return lcg_init(*rng + rng_offset + sample*scramble); + return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble); } +ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space PathState *state, + uint scramble) +{ + return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble); +} + + ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng) { /* Implicit mod 2^32 */ diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index c66f52255f0..dd64f5b05ba 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -83,7 +83,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, float4 curvedata = kernel_tex_fetch(__curves, sd->prim); sd->shader = __float_as_int(curvedata.z); - sd->P = bvh_curve_refine(kg, sd, isect, ray); + sd->P = curve_refine(kg, sd, isect, ray); } else #endif @@ -669,7 +669,7 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn 
} } -ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd) { if(sd->flag & SD_HAS_ONLY_VOLUME) return make_float3(1.0f, 1.0f, 1.0f); @@ -677,7 +677,7 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) float3 eval = make_float3(0.0f, 0.0f, 0.0f); for(int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl eval += sc->weight; @@ -764,6 +764,19 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd) return eval; } +ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd) +{ + float3 N = make_float3(0.0f, 0.0f, 0.0f); + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) + N += sc->N*average(sc->weight); + } + + return (is_zero(N))? sd->N : normalize(N); +} + ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); @@ -783,12 +796,7 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac } } - if(is_zero(N)) - N = sd->N; - else - N = normalize(N); - - *N_ = N; + *N_ = (is_zero(N))? 
sd->N : normalize(N); return eval; } @@ -863,8 +871,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) /* Surface Evaluation */ -ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, RNG *rng, - ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx) +ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, + ccl_addr_space PathState *state, float randb, int path_flag) { sd->num_closure = 0; sd->num_closure_extra = 0; @@ -872,7 +880,7 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, RNG *rng, #ifdef __OSL__ if(kg->osl) - OSLShader::eval_surface(kg, sd, state, path_flag, ctx); + OSLShader::eval_surface(kg, sd, state, path_flag); else #endif { @@ -887,15 +895,15 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, RNG *rng, #endif } - if(rng && (sd->flag & SD_BSDF_NEEDS_LCG)) { - sd->lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0xb4bc3953); + if(sd->flag & SD_BSDF_NEEDS_LCG) { + sd->lcg_state = lcg_state_init_addrspace(state, 0xb4bc3953); } } /* Background Evaluation */ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, - ccl_addr_space PathState *state, int path_flag, ShaderContext ctx) + ccl_addr_space PathState *state, int path_flag) { sd->num_closure = 0; sd->num_closure_extra = 0; @@ -904,7 +912,7 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, #ifdef __SVM__ #ifdef __OSL__ if(kg->osl) { - OSLShader::eval_background(kg, sd, state, path_flag, ctx); + OSLShader::eval_background(kg, sd, state, path_flag); } else #endif @@ -1039,8 +1047,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ccl_addr_space VolumeStack *stack, - int path_flag, - ShaderContext ctx) + int path_flag) { /* reset closures once at the start, we will be accumulating the closures * for all volumes in the stack into a single array of 
closures */ @@ -1073,7 +1080,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, #ifdef __SVM__ # ifdef __OSL__ if(kg->osl) { - OSLShader::eval_volume(kg, sd, state, path_flag, ctx); + OSLShader::eval_volume(kg, sd, state, path_flag); } else # endif @@ -1092,7 +1099,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, /* Displacement Evaluation */ -ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx) +ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state) { sd->num_closure = 0; sd->num_closure_extra = 0; @@ -1102,7 +1109,7 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_ #ifdef __SVM__ # ifdef __OSL__ if(kg->osl) - OSLShader::eval_displacement(kg, sd, ctx); + OSLShader::eval_displacement(kg, sd); else # endif { diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index fab5946970d..22e085e94da 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -49,11 +49,9 @@ ccl_device_forceinline bool shadow_handle_transparent_isect( path_state_modify_bounce(state, true); shader_eval_surface(kg, shadow_sd, - NULL, state, 0.0f, - PATH_RAY_SHADOW, - SHADER_CONTEXT_SHADOW); + PATH_RAY_SHADOW); path_state_modify_bounce(state, false); *throughput *= shader_bsdf_transparency(kg, shadow_sd); } @@ -72,13 +70,14 @@ ccl_device_forceinline bool shadow_handle_transparent_isect( ccl_device bool shadow_blocked_opaque(KernelGlobals *kg, ShaderData *shadow_sd, ccl_addr_space PathState *state, + const uint visibility, Ray *ray, Intersection *isect, float3 *shadow) { const bool blocked = scene_intersect(kg, *ray, - PATH_RAY_SHADOW_OPAQUE, + visibility & PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f); @@ -120,15 +119,49 @@ ccl_device bool shadow_blocked_opaque(KernelGlobals *kg, # define SHADOW_STACK_MAX_HITS 64 +# ifdef 
__VOLUME__ +struct VolumeState { +# ifdef __SPLIT_KERNEL__ +# else + PathState ps; +# endif +}; + +/* Get PathState ready for use for volume stack evaluation. */ +ccl_device_inline PathState *shadow_blocked_volume_path_state( + KernelGlobals *kg, + VolumeState *volume_state, + ccl_addr_space PathState *state, + ShaderData *sd, + Ray *ray) +{ +# ifdef __SPLIT_KERNEL__ + ccl_addr_space PathState *ps = + &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)]; +# else + PathState *ps = &volume_state->ps; +# endif + *ps = *state; + /* We are checking for shadow on the "other" side of the surface, so need + * to discard volume we are currently at. + */ + if(dot(sd->Ng, ray->D) < 0.0f) { + kernel_volume_stack_enter_exit(kg, sd, ps->volume_stack); + } + return ps; +} +#endif // __VOLUME__ + /* Actual logic with traversal loop implementation which is free from device * specific tweaks. * * Note that hits array should be as big as max_hits+1. */ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, - const int skip_object, + const uint visibility, Ray *ray, Intersection *hits, uint max_hits, @@ -141,9 +174,12 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, const bool blocked = scene_intersect_shadow_all(kg, ray, hits, - skip_object, + visibility, max_hits, &num_hits); +# ifdef __VOLUME__ + VolumeState volume_state; +# endif /* If no opaque surface found but we did find transparent hits, * shade them. 
*/ @@ -154,13 +190,11 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, int bounce = state->transparent_bounce; Intersection *isect = hits; # ifdef __VOLUME__ -# ifdef __SPLIT_KERNEL__ - ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)]; -# else - PathState ps_object; - PathState *ps = &ps_object; -# endif - *ps = *state; + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); # endif sort_intersections(hits, num_hits); for(int hit = 0; hit < num_hits; hit++, isect++) { @@ -205,8 +239,13 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, } # ifdef __VOLUME__ if(!blocked && state->volume_stack[0].shader != SHADER_NONE) { - /* Apply attenuation from current volume shader/ */ - kernel_volume_shadow(kg, shadow_sd, state, ray, shadow); + /* Apply attenuation from current volume shader. */ + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); + kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow); } # endif return blocked; @@ -216,9 +255,10 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, * loop to help readability of the actual logic. */ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, - const int skip_object, + const uint visibility, Ray *ray, uint max_hits, float3 *shadow) @@ -251,9 +291,10 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, # endif /* __KERNEL_GPU__ */ /* Invoke actual traversal. 
*/ return shadow_blocked_transparent_all_loop(kg, + sd, shadow_sd, state, - skip_object, + visibility, ray, hits, max_hits, @@ -276,27 +317,29 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, */ ccl_device bool shadow_blocked_transparent_stepped_loop( KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, - const int skip_object, + const uint visibility, Ray *ray, Intersection *isect, const bool blocked, const bool is_transparent_isect, float3 *shadow) { - if((blocked && is_transparent_isect) || skip_object != OBJECT_NONE) { +# ifdef __VOLUME__ + VolumeState volume_state; +# endif + if(blocked && is_transparent_isect) { float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float3 Pend = ray->P + ray->D*ray->t; int bounce = state->transparent_bounce; # ifdef __VOLUME__ -# ifdef __SPLIT_KERNEL__ - ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)]; -# else - PathState ps_object; - PathState *ps = &ps_object; -# endif - *ps = *state; + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); # endif for(;;) { if(bounce >= kernel_data.integrator.transparent_max_bounce) { @@ -304,30 +347,13 @@ ccl_device bool shadow_blocked_transparent_stepped_loop( } if(!scene_intersect(kg, *ray, - PATH_RAY_SHADOW_TRANSPARENT, + visibility & PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f)) { break; } -#ifdef __SHADOW_TRICKS__ - if(skip_object != OBJECT_NONE) { - const int isect_object = (isect->object == PRIM_NONE) - ? kernel_tex_fetch(__prim_object, isect->prim) - : isect->object; - if(isect_object == skip_object) { - shader_setup_from_ray(kg, shadow_sd, isect, ray); - /* Move ray forward. 
*/ - ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng); - if(ray->t != FLT_MAX) { - ray->D = normalize_len(Pend - ray->P, &ray->t); - } - bounce++; - continue; - } - } -#endif if(!shader_transparent_shadow(kg, isect)) { return true; } @@ -363,7 +389,12 @@ ccl_device bool shadow_blocked_transparent_stepped_loop( # ifdef __VOLUME__ if(!blocked && state->volume_stack[0].shader != SHADER_NONE) { /* Apply attenuation from current volume shader. */ - kernel_volume_shadow(kg, shadow_sd, state, ray, shadow); + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); + kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow); } # endif return blocked; @@ -371,33 +402,28 @@ ccl_device bool shadow_blocked_transparent_stepped_loop( ccl_device bool shadow_blocked_transparent_stepped( KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, - const int skip_object, + const uint visibility, Ray *ray, Intersection *isect, float3 *shadow) { - bool blocked, is_transparent_isect; - if(skip_object == OBJECT_NONE) { - blocked = scene_intersect(kg, - *ray, - PATH_RAY_SHADOW_OPAQUE, - isect, - NULL, - 0.0f, 0.0f); - is_transparent_isect = blocked - ? shader_transparent_shadow(kg, isect) - : false; - } - else { - blocked = false; - is_transparent_isect = false; - } + bool blocked = scene_intersect(kg, + *ray, + visibility & PATH_RAY_SHADOW_OPAQUE, + isect, + NULL, + 0.0f, 0.0f); + bool is_transparent_isect = blocked + ? 
shader_transparent_shadow(kg, isect) + : false; return shadow_blocked_transparent_stepped_loop(kg, + sd, shadow_sd, state, - skip_object, + visibility, ray, isect, blocked, @@ -409,6 +435,7 @@ ccl_device bool shadow_blocked_transparent_stepped( #endif /* __TRANSPARENT_SHADOWS__ */ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, Ray *ray_input, @@ -422,25 +449,24 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, return false; } #ifdef __SHADOW_TRICKS__ - const int skip_object = state->catcher_object; + const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER) + ? PATH_RAY_SHADOW_NON_CATCHER + : PATH_RAY_SHADOW; #else - const int skip_object = OBJECT_NONE; + const uint visibility = PATH_RAY_SHADOW; #endif /* Do actual shadow shading. */ /* First of all, we check if integrator requires transparent shadows. * if not, we use simplest and fastest ever way to calculate occlusion. - * - * NOTE: We can't do quick opaque test here if we are on shadow-catcher - * path because we don't want catcher object to be casting shadow here. 
*/ #ifdef __TRANSPARENT_SHADOWS__ - if(!kernel_data.integrator.transparent_shadows && - skip_object == OBJECT_NONE) + if(!kernel_data.integrator.transparent_shadows) #endif { return shadow_blocked_opaque(kg, shadow_sd, state, + visibility, ray, &isect, shadow); @@ -467,7 +493,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, */ const bool blocked = scene_intersect(kg, *ray, - PATH_RAY_SHADOW_OPAQUE, + visibility & PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f); @@ -478,9 +504,10 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, max_hits + 1 >= SHADOW_STACK_MAX_HITS) { return shadow_blocked_transparent_stepped_loop(kg, + sd, shadow_sd, state, - skip_object, + visibility, ray, &isect, blocked, @@ -489,9 +516,10 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, } # endif /* __KERNEL_GPU__ */ return shadow_blocked_transparent_all(kg, + sd, shadow_sd, state, - skip_object, + visibility, ray, max_hits, shadow); @@ -500,7 +528,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, return shadow_blocked_transparent_stepped(kg, shadow_sd, state, - skip_object, + visibility, ray, &isect, shadow); diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index 1026cde7b29..26ec6383b73 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -219,7 +219,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, if(bump || texture_blur > 0.0f) { /* average color and normal at incoming point */ - shader_eval_surface(kg, sd, NULL, state, 0.0f, state_flag, SHADER_CONTEXT_SSS); + shader_eval_surface(kg, sd, state, 0.0f, state_flag); float3 in_color = shader_bssrdf_sum(sd, (bump)? 
N: NULL, NULL); /* we simply divide out the average color and multiply with the average @@ -243,7 +243,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect( SubsurfaceIntersection *ss_isect, ShaderData *sd, ShaderClosure *sc, - RNG *lcg_state, + uint *lcg_state, float disk_u, float disk_v, bool all) @@ -418,7 +418,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup( } /* subsurface scattering step, from a point on the surface to another nearby point on the same object */ -ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_global PathState *state, +ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index cb1a3f40dee..5eab28a2953 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -82,115 +82,110 @@ KERNEL_TEX(uint, texture_uint, __sobol_directions) # if __CUDA_ARCH__ < 300 /* full-float image */ KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_002) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_003) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_004) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_008) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_016) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_024) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_032) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_000) -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_001) 
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_002) -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_003) -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_004) - -/* image */ -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_005) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_006) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_007) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_008) +KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_008) +KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_016) +KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_024) +KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_032) + +/* image + * These texture names are encoded to their flattened slots as + * ImageManager::type_index_to_flattened_slot() returns them. */ +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_001) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_009) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_010) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_011) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_012) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_013) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_014) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_015) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_016) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_017) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_018) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_019) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_020) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_021) -KERNEL_IMAGE_TEX(uchar4, 
texture_image_uchar4, __tex_image_byte4_022) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_023) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_024) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_025) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_026) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_027) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_028) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_029) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_030) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_031) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_032) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_033) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_034) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_035) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_036) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_037) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_038) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_039) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_040) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_041) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_042) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_043) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_044) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_045) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_046) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_047) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_048) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_049) -KERNEL_IMAGE_TEX(uchar4, 
texture_image_uchar4, __tex_image_byte4_050) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_051) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_052) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_053) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_054) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_055) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_056) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_057) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_058) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_059) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_060) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_061) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_062) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_063) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_064) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_065) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_066) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_067) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_068) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_069) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_070) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_071) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_072) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_073) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_074) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_075) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_076) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_077) -KERNEL_IMAGE_TEX(uchar4, 
texture_image_uchar4, __tex_image_byte4_078) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_079) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_080) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_081) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_082) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_083) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_084) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_153) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_161) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_169) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_177) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_185) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_193) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_201) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_209) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_217) +KERNEL_IMAGE_TEX(uchar4, 
texture_image_uchar4, __tex_image_byte4_225) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_233) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_241) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_249) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_257) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_265) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_273) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_281) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_289) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_297) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_305) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_313) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_321) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_329) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_337) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_345) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_353) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_361) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_369) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_377) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_385) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_393) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_401) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_409) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_417) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_425) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_433) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_441) +KERNEL_IMAGE_TEX(uchar4, 
texture_image_uchar4, __tex_image_byte4_449) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_457) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_465) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_473) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_481) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_489) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_497) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_505) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_513) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_521) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_529) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_537) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_545) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_553) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_561) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_569) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_577) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_585) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_593) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_601) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_609) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_617) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_625) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_633) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_641) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_649) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_657) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665) # else /* bindless textures */ 
KERNEL_TEX(uint, texture_uint, __bindless_mapping) -# endif -#endif - -/* packed image (opencl) */ -KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed) -KERNEL_TEX(float4, texture_float4, __tex_image_float4_packed) -KERNEL_TEX(uchar, texture_uchar, __tex_image_byte_packed) -KERNEL_TEX(float, texture_float, __tex_image_float_packed) -KERNEL_TEX(uint4, texture_uint4, __tex_image_packed_info) +# endif /* __CUDA_ARCH__ */ +#endif /* __KERNEL_CUDA__ */ #undef KERNEL_TEX #undef KERNEL_IMAGE_TEX diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index dbeaffdfb24..8f65c00491c 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -130,12 +130,13 @@ CCL_NAMESPACE_BEGIN # ifdef __KERNEL_OPENCL_APPLE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ +# define __PRINCIPLED__ # define __CMJ__ /* TODO(sergey): Currently experimental section is ignored here, * this is because megakernel in device_opencl does not support * custom cflags depending on the scene features. 
*/ -# endif /* __KERNEL_OPENCL_NVIDIA__ */ +# endif /* __KERNEL_OPENCL_APPLE__ */ # ifdef __KERNEL_OPENCL_AMD__ # define __CL_USE_NATIVE__ @@ -154,6 +155,7 @@ CCL_NAMESPACE_BEGIN # define __CL_USE_NATIVE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ +# define __PRINCIPLED__ # define __CMJ__ # endif /* __KERNEL_OPENCL_INTEL_CPU__ */ @@ -236,10 +238,9 @@ CCL_NAMESPACE_BEGIN #ifdef __NO_PRINCIPLED__ # undef __PRINCIPLED__ #endif - -/* Random Numbers */ - -typedef uint RNG; +#ifdef __NO_DENOISING__ +# undef __DENOISING_FEATURES__ +#endif /* Shader Evaluation */ @@ -325,24 +326,28 @@ enum PathRayFlag { PATH_RAY_SINGULAR = (1 << 5), PATH_RAY_TRANSPARENT = (1 << 6), - PATH_RAY_SHADOW_OPAQUE = (1 << 7), - PATH_RAY_SHADOW_TRANSPARENT = (1 << 8), - PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), + PATH_RAY_SHADOW_OPAQUE_NON_CATCHER = (1 << 7), + PATH_RAY_SHADOW_OPAQUE_CATCHER = (1 << 8), + PATH_RAY_SHADOW_OPAQUE = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_OPAQUE_CATCHER), + PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER = (1 << 9), + PATH_RAY_SHADOW_TRANSPARENT_CATCHER = (1 << 10), + PATH_RAY_SHADOW_TRANSPARENT = (PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_CATCHER), + PATH_RAY_SHADOW_NON_CATCHER = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER), + PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), - PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */ - PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */ + PATH_RAY_CURVE = (1 << 11), /* visibility flag to define curve segments */ + PATH_RAY_VOLUME_SCATTER = (1 << 12), /* volume scattering */ /* Special flag to tag unaligned BVH nodes. 
*/ - PATH_RAY_NODE_UNALIGNED = (1 << 11), + PATH_RAY_NODE_UNALIGNED = (1 << 13), - PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1), + PATH_RAY_ALL_VISIBILITY = ((1 << 14)-1), - PATH_RAY_MIS_SKIP = (1 << 12), - PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13), - PATH_RAY_SINGLE_PASS_DONE = (1 << 14), - PATH_RAY_SHADOW_CATCHER = (1 << 15), - PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16), - PATH_RAY_STORE_SHADOW_INFO = (1 << 17), + PATH_RAY_MIS_SKIP = (1 << 15), + PATH_RAY_DIFFUSE_ANCESTOR = (1 << 16), + PATH_RAY_SINGLE_PASS_DONE = (1 << 17), + PATH_RAY_SHADOW_CATCHER = (1 << 18), + PATH_RAY_STORE_SHADOW_INFO = (1 << 19), }; /* Closure Label */ @@ -459,11 +464,24 @@ typedef enum DenoiseFlag { DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1, } DenoiseFlag; +#ifdef __KERNEL_DEBUG__ +/* NOTE: This is a runtime-only struct, alignment is not + * really important here. + */ +typedef struct DebugData { + int num_bvh_traversed_nodes; + int num_bvh_traversed_instances; + int num_bvh_intersections; + int num_ray_bounces; +} DebugData; +#endif + typedef ccl_addr_space struct PathRadiance { #ifdef __PASSES__ int use_light_pass; #endif + float transparent; float3 emission; #ifdef __PASSES__ float3 background; @@ -512,7 +530,16 @@ typedef ccl_addr_space struct PathRadiance { float3 path_total_shaded; /* Color of the background on which shadow is alpha-overed. */ - float3 shadow_color; + float3 shadow_background_color; + + /* Path radiance sum and throughput at the moment when ray hits shadow + * catcher object. + */ + float3 shadow_radiance_sum; + float shadow_throughput; + + /* Accumulated transparency along the path after shadow catcher bounce. 
*/ + float shadow_transparency; #endif #ifdef __DENOISING_FEATURES__ @@ -520,6 +547,10 @@ typedef ccl_addr_space struct PathRadiance { float3 denoising_albedo; float denoising_depth; #endif /* __DENOISING_FEATURES__ */ + +#ifdef __KERNEL_DEBUG__ + DebugData debug_data; +#endif /* __KERNEL_DEBUG__ */ } PathRadiance; typedef struct BsdfEval { @@ -771,20 +802,6 @@ typedef ccl_addr_space struct ccl_align(16) ShaderClosure { float data[10]; /* pad to 80 bytes */ } ShaderClosure; -/* Shader Context - * - * For OSL we recycle a fixed number of contexts for speed */ - -typedef enum ShaderContext { - SHADER_CONTEXT_MAIN = 0, - SHADER_CONTEXT_INDIRECT = 1, - SHADER_CONTEXT_EMISSION = 2, - SHADER_CONTEXT_SHADOW = 3, - SHADER_CONTEXT_SSS = 4, - SHADER_CONTEXT_VOLUME = 5, - SHADER_CONTEXT_NUM = 6 -} ShaderContext; - /* Shader Data * * Main shader state at a point on the surface or in a volume. All coordinates @@ -847,7 +864,7 @@ enum ShaderDataFlag { SD_VOLUME_MIS = (1 << 23), /* Use cubic interpolation for voxels. */ SD_VOLUME_CUBIC = (1 << 24), - /* Has data connected to the displacement input. */ + /* Has data connected to the displacement input or uses bump map. */ SD_HAS_BUMP = (1 << 25), /* Has true displacement. 
*/ SD_HAS_DISPLACEMENT = (1 << 26), @@ -988,6 +1005,7 @@ typedef struct PathState { int flag; /* random number generator state */ + uint rng_hash; /* per pixel hash */ int rng_offset; /* dimension offset */ int sample; /* path sample number */ int num_samples; /* total number of times this path will be sampled */ @@ -1013,13 +1031,9 @@ typedef struct PathState { /* volume rendering */ #ifdef __VOLUME__ int volume_bounce; - RNG rng_congruential; + uint rng_congruential; VolumeStack volume_stack[VOLUME_STACK_SIZE]; #endif - -#ifdef __SHADOW_TRICKS__ - int catcher_object; -#endif } PathState; /* Subsurface */ @@ -1225,7 +1239,6 @@ typedef struct KernelIntegrator { int portal_offset; /* bounces */ - int min_bounce; int max_bounce; int max_diffuse_bounce; @@ -1236,7 +1249,6 @@ typedef struct KernelIntegrator { int ao_bounces; /* transparent */ - int transparent_min_bounce; int transparent_max_bounce; int transparent_shadows; @@ -1279,7 +1291,7 @@ typedef struct KernelIntegrator { float light_inv_rr_threshold; int start_sample; - int pad1, pad2, pad3; + int pad1; } KernelIntegrator; static_assert_align(KernelIntegrator, 16); @@ -1333,18 +1345,6 @@ typedef struct KernelData { } KernelData; static_assert_align(KernelData, 16); -#ifdef __KERNEL_DEBUG__ -/* NOTE: This is a runtime-only struct, alignment is not - * really important here. - */ -typedef ccl_addr_space struct DebugData { - int num_bvh_traversed_nodes; - int num_bvh_traversed_instances; - int num_bvh_intersections; - int num_ray_bounces; -} DebugData; -#endif - /* Declarations required for split kernel */ /* Macro for queues */ @@ -1387,6 +1387,8 @@ enum QueueNumber { #ifdef __BRANCHED_PATH__ /* All rays moving to next iteration of the indirect loop for light */ QUEUE_LIGHT_INDIRECT_ITER, + /* Queue of all inactive rays. 
These are candidates for sharing work of indirect loops */ + QUEUE_INACTIVE_RAYS, # ifdef __VOLUME__ /* All rays moving to next iteration of the indirect loop for volumes */ QUEUE_VOLUME_INDIRECT_ITER, @@ -1429,6 +1431,9 @@ enum RayState { RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5), RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6), RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT), + + /* Ray is evaluating an iteration of an indirect loop for another thread */ + RAY_BRANCHED_INDIRECT_SHARED = (1 << 7), }; #define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state)) diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index 9c0878249d4..42094a9c3f8 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -43,7 +43,7 @@ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg, float3 *extinction) { sd->P = P; - shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW); + shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW); if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER))) return false; @@ -69,7 +69,7 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals *kg, VolumeShaderCoefficients *coeff) { sd->P = P; - shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, SHADER_CONTEXT_VOLUME); + shader_eval_volume(kg, sd, state, state->volume_stack, state->flag); if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER|SD_EMISSION))) return false; @@ -360,7 +360,6 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous( ShaderData *sd, PathRadiance *L, ccl_addr_space float3 *throughput, - RNG *rng, bool probalistic_scatter) { VolumeShaderCoefficients coeff; @@ -380,13 +379,13 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous( /* pick random color channel, we use the 
Veach one-sample * model with balance heuristic for the channels */ - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); + float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE); int channel = (int)(rphase*3.0f); sd->randb_closure = rphase*3.0f - channel; /* decide if we will hit or miss */ bool scatter = true; - float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + float xi = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE); if(probalistic_scatter) { float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); @@ -468,8 +467,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( Ray *ray, ShaderData *sd, PathRadiance *L, - ccl_addr_space float3 *throughput, - RNG *rng) + ccl_addr_space float3 *throughput) { float3 tp = *throughput; const float tp_eps = 1e-6f; /* todo: this is likely not the right value */ @@ -485,8 +483,8 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ - float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); + float xi = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE); int channel = (int)(rphase*3.0f); sd->randb_closure = rphase*3.0f - channel; bool has_scatter = false; @@ -610,15 +608,14 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate( Ray *ray, PathRadiance *L, ccl_addr_space float3 *throughput, - RNG *rng, bool heterogeneous) { shader_setup_from_volume(kg, sd, ray); if(heterogeneous) - return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, rng); + return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput); 
else - return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, rng, true); + return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true); } #ifndef __SPLIT_KERNEL__ @@ -660,6 +657,7 @@ typedef struct VolumeSegment { * but the entire segment is needed to do always scattering, rather than probabilistically * hitting or missing the volume. if we don't know the transmittance at the end of the * volume we can't generate stratified distance samples up to that transmittance */ +#ifdef __VOLUME_DECOUPLED__ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous) { @@ -829,6 +827,7 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s #endif } } +#endif /* __VOLUME_DECOUPLED__ */ /* scattering for homogeneous and heterogeneous volumes, using decoupled ray * marching. diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h index ffd34c293fc..2ed713299fd 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -107,8 +107,6 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int dy, float *difference_image, float *buffer, - float *color_pass, - float *variance_pass, float *transform, int *rank, float *XtWX, diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h index 261176846b1..8dc1a8d583c 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -213,8 +213,6 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int dy, float *difference_image, float *buffer, - float *color_pass, - float *variance_pass, float *transform, int *rank, float *XtWX, @@ -229,7 +227,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, 
#ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); #else - kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride); + kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride); #endif } diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp index 1a7b2040da1..254025be4e2 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp @@ -25,6 +25,7 @@ #else /* SSE optimization disabled for now on 32 bit, see bug #36316 */ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ # define __KERNEL_SSE2__ # define __KERNEL_SSE3__ # define __KERNEL_SSSE3__ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 9895080d328..c8938534fe8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -85,6 +85,7 @@ DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting) DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive) DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update) diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 9b85a864153..d4315ee5ec4 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -53,6 +53,7 @@ # include "kernel/split/kernel_direct_lighting.h" # include 
"kernel/split/kernel_shadow_blocked_ao.h" # include "kernel/split/kernel_shadow_blocked_dl.h" +# include "kernel/split/kernel_enqueue_inactive.h" # include "kernel/split/kernel_next_iteration_setup.h" # include "kernel/split/kernel_indirect_subsurface.h" # include "kernel/split/kernel_buffer_update.h" @@ -230,6 +231,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index 2edbff08087..009c3fde9d5 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -207,8 +207,6 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, const float *ccl_restrict difference_image, const float *ccl_restrict buffer, - float *color_pass, - float *variance_pass, float const* __restrict__ transform, int *rank, float *XtWX, @@ -225,7 +223,6 @@ kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, dx, dy, difference_image, buffer, - color_pass, variance_pass, transform, rank, XtWX, XtWY, rect, filter_rect, diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h index 9fa39dc9ebb..7ae205b7e14 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_config.h +++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h @@ -81,8 +81,13 @@ # error "Unknown or unsupported CUDA architecture, can't determine launch bounds" #endif -/* compute number of threads per block and minimum blocks per multiprocessor - * given the maximum 
number of registers per thread */ +/* For split kernel using all registers seems fastest for now, but this + * is unlikely to be optimal once we resolve other bottlenecks. */ + +#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. */ #define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ __launch_bounds__( \ diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index 8b7f1a8d405..e97e87285a5 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -39,6 +39,7 @@ #include "kernel/split/kernel_direct_lighting.h" #include "kernel/split/kernel_shadow_blocked_ao.h" #include "kernel/split/kernel_shadow_blocked_dl.h" +#include "kernel/split/kernel_enqueue_inactive.h" #include "kernel/split/kernel_next_iteration_setup.h" #include "kernel/split/kernel_indirect_subsurface.h" #include "kernel/split/kernel_buffer_update.h" @@ -89,7 +90,7 @@ kernel_cuda_path_trace_data_init( #define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ kernel_cuda_##name() \ { \ kernel_##name(NULL); \ @@ -97,7 +98,7 @@ kernel_cuda_path_trace_data_init( #define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ kernel_cuda_##name() \ { \ ccl_local type locals; \ @@ -118,6 +119,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) 
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index 0462ca6f9bc..ba53ba4b26f 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -207,8 +207,6 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(int dx, int dy, const ccl_global float *ccl_restrict difference_image, const ccl_global float *ccl_restrict buffer, - ccl_global float *color_pass, - ccl_global float *variance_pass, const ccl_global float *ccl_restrict transform, ccl_global int *rank, ccl_global float *XtWX, @@ -227,7 +225,6 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(int dx, dx, dy, difference_image, buffer, - color_pass, variance_pass, transform, rank, XtWX, XtWY, rect, filter_rect, diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index 078acc1631e..b7108f3d0f8 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -52,9 +52,7 @@ __kernel void kernel_ocl_path_trace( ccl_global float *buffer, ccl_global uint *rng_state, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, int sample, int sx, int sy, int sw, int sh, int offset, int stride) @@ -63,9 +61,8 @@ __kernel void kernel_ocl_path_trace( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); int y = sy + ccl_global_id(1); @@ -82,9 +79,7 @@ __kernel void kernel_ocl_shader( 
ccl_global float4 *output, ccl_global float *output_luma, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, int type, int sx, int sw, int offset, int sample) { @@ -92,9 +87,8 @@ __kernel void kernel_ocl_shader( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); @@ -114,9 +108,7 @@ __kernel void kernel_ocl_bake( ccl_global uint4 *input, ccl_global float4 *output, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, int type, int filter, int sx, int sw, int offset, int sample) { @@ -124,9 +116,8 @@ __kernel void kernel_ocl_bake( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); @@ -144,9 +135,7 @@ __kernel void kernel_ocl_convert_to_byte( ccl_global uchar4 *rgba, ccl_global float *buffer, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) @@ -155,9 +144,8 @@ __kernel void kernel_ocl_convert_to_byte( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); int y = sy + ccl_global_id(1); @@ -171,9 +159,7 @@ __kernel void kernel_ocl_convert_to_half_float( ccl_global uchar4 *rgba, ccl_global float *buffer, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, float sample_scale, int 
sx, int sy, int sw, int sh, int offset, int stride) @@ -182,9 +168,8 @@ __kernel void kernel_ocl_convert_to_half_float( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); int y = sy + ccl_global_id(1); @@ -193,7 +178,7 @@ __kernel void kernel_ocl_convert_to_half_float( kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); } -__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, ulong size, ulong offset) +__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset) { size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl index db65c91baf7..dcea2630aef 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_buffer_update.h" -__kernel void kernel_ocl_path_trace_buffer_update( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME buffer_update +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl index 8b85d362f8a..95b35e40a45 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl @@ -25,11 +25,7 @@ __kernel void kernel_ocl_path_trace_data_init( int num_elements, ccl_global char 
*ray_state, ccl_global uint *rng_state, - -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" - + KERNEL_BUFFER_PARAMS, int start_sample, int end_sample, int sx, int sy, int sw, int sh, int offset, int stride, @@ -46,10 +42,7 @@ __kernel void kernel_ocl_path_trace_data_init( num_elements, ray_state, rng_state, - -#define KERNEL_TEX(type, ttype, name) name, -#include "kernel/kernel_textures.h" - + KERNEL_BUFFER_ARGS, start_sample, end_sample, sx, sy, sw, sh, offset, stride, diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl index eb34f750881..ed64ae01aae 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_direct_lighting.h" -__kernel void kernel_ocl_path_trace_direct_lighting( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME direct_lighting +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl index 83ef5f5f3f2..8afaa686e28 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_do_volume.h" -__kernel void kernel_ocl_path_trace_do_volume( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_do_volume((KernelGlobals*)kg); -} +#define KERNEL_NAME do_volume +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef 
KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl new file mode 100644 index 00000000000..e68d4104a91 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_enqueue_inactive.h" + +#define KERNEL_NAME enqueue_inactive +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl index d071b39aa6f..9e1e57beba6 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl @@ -18,12 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" -__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local BackgroundAOLocals locals; - 
kernel_holdout_emission_blurring_pathtermination_ao( - (KernelGlobals*)kg, - &locals); -} +#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao +#define LOCALS_TYPE BackgroundAOLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl index 8c213ff5cb2..192d01444ba 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_indirect_background.h" -__kernel void kernel_ocl_path_trace_indirect_background( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_indirect_background((KernelGlobals*)kg); -} +#define KERNEL_NAME indirect_background +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl index 998ebc4c0c3..84938b889e5 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_indirect_subsurface.h" -__kernel void kernel_ocl_path_trace_indirect_subsurface( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_indirect_subsurface((KernelGlobals*)kg); -} +#define KERNEL_NAME indirect_subsurface +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl index 822d2287715..c314dc96c33 100644 --- 
a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_lamp_emission.h" -__kernel void kernel_ocl_path_trace_lamp_emission( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_lamp_emission((KernelGlobals*)kg); -} +#define KERNEL_NAME lamp_emission +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl index 6d207253a40..8b1332bf013 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_next_iteration_setup.h" -__kernel void kernel_ocl_path_trace_next_iteration_setup( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_next_iteration_setup((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME next_iteration_setup +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl index bd9aa9538c8..fa210e747c0 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_path_init.h" -__kernel void kernel_ocl_path_trace_path_init( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_path_init((KernelGlobals*)kg); -} +#define KERNEL_NAME path_init +#include 
"kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl index 9be154e3d75..68ee6f1d536 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_queue_enqueue.h" -__kernel void kernel_ocl_path_trace_queue_enqueue( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local QueueEnqueueLocals locals; - kernel_queue_enqueue((KernelGlobals*)kg, &locals); -} +#define KERNEL_NAME queue_enqueue +#define LOCALS_TYPE QueueEnqueueLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl index eb4fb4d153a..10d09377ba9 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_scene_intersect.h" -__kernel void kernel_ocl_path_trace_scene_intersect( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_scene_intersect((KernelGlobals*)kg); -} +#define KERNEL_NAME scene_intersect +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl index 5bfb31b193a..40eaa561863 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_shader_eval.h" -__kernel void 
kernel_ocl_path_trace_shader_eval( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_shader_eval((KernelGlobals*)kg); -} +#define KERNEL_NAME shader_eval +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl index 38bfd04ad4c..8c36100f762 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_shader_setup.h" -__kernel void kernel_ocl_path_trace_shader_setup( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_shader_setup((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME shader_setup +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl index 6f722915d45..bcacaa4a054 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl @@ -19,10 +19,9 @@ #include "kernel/split/kernel_shader_sort.h" __attribute__((reqd_work_group_size(64, 1, 1))) -__kernel void kernel_ocl_path_trace_shader_sort( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local ShaderSortLocals locals; - kernel_shader_sort((KernelGlobals*)kg, &locals); -} +#define KERNEL_NAME shader_sort +#define LOCALS_TYPE ShaderSortLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl index 
6a8ef81b32a..8de250a375c 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_shadow_blocked_ao.h" -__kernel void kernel_ocl_path_trace_shadow_blocked_ao( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_shadow_blocked_ao((KernelGlobals*)kg); -} +#define KERNEL_NAME shadow_blocked_ao +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl index b255cc5ef8b..29da77022ed 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_shadow_blocked_dl.h" -__kernel void kernel_ocl_path_trace_shadow_blocked_dl( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_shadow_blocked_dl((KernelGlobals*)kg); -} +#define KERNEL_NAME shadow_blocked_dl +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl index 8de82db7afe..4cbda1bc2e7 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_split.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl @@ -14,6 +14,9 @@ * limitations under the License. 
*/ +#include "kernel/kernel_compat_opencl.h" // PRECOMPILED +#include "kernel/split/kernel_split_common.h" // PRECOMPILED + #include "kernel/kernels/opencl/kernel_state_buffer_size.cl" #include "kernel/kernels/opencl/kernel_data_init.cl" #include "kernel/kernels/opencl/kernel_path_init.cl" @@ -31,6 +34,7 @@ #include "kernel/kernels/opencl/kernel_direct_lighting.cl" #include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl" #include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl" +#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl" #include "kernel/kernels/opencl/kernel_next_iteration_setup.cl" #include "kernel/kernels/opencl/kernel_indirect_subsurface.cl" #include "kernel/kernels/opencl/kernel_buffer_update.cl" diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h new file mode 100644 index 00000000000..591c3846ef2 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h @@ -0,0 +1,67 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define KERNEL_NAME_JOIN(a, b) a ## _ ## b +#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b) + +__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)( + ccl_global char *kg_global, + ccl_constant KernelData *data, + + ccl_global void *split_data_buffer, + ccl_global char *ray_state, + ccl_global uint *rng_state, + + KERNEL_BUFFER_PARAMS, + + ccl_global int *queue_index, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pools, + ccl_global float *buffer + ) +{ +#ifdef LOCALS_TYPE + ccl_local LOCALS_TYPE locals; +#endif + + KernelGlobals *kg = (KernelGlobals*)kg_global; + + if(ccl_local_id(0) + ccl_local_id(1) == 0) { + kg->data = data; + + kernel_split_params.rng_state = rng_state; + kernel_split_params.queue_index = queue_index; + kernel_split_params.use_queues_flag = use_queues_flag; + kernel_split_params.work_pools = work_pools; + kernel_split_params.buffer = buffer; + + split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state); + + } + + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + + KERNEL_NAME_EVAL(kernel, KERNEL_NAME)( + kg +#ifdef LOCALS_TYPE + , &locals +#endif + ); +} + +#undef KERNEL_NAME_JOIN +#undef KERNEL_NAME_EVAL + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl index 99b74a1802b..2b3be38df84 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_subsurface_scatter.h" -__kernel void kernel_ocl_path_trace_subsurface_scatter( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_subsurface_scatter((KernelGlobals*)kg); -} +#define KERNEL_NAME subsurface_scatter +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git 
a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp index 188c3960a5f..27a96720c1e 100644 --- a/intern/cycles/kernel/osl/osl_bssrdf.cpp +++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp @@ -191,7 +191,7 @@ class PrincipledBSSRDFClosure : public CBSSRDFClosure { public: void setup(ShaderData *sd, int path_flag, float3 weight) { - alloc(sd, path_flag, weight * albedo, CLOSURE_BSSRDF_PRINCIPLED_ID); + alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID); } }; diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index 5b66793a05d..14c5c1c3db5 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ b/intern/cycles/kernel/osl/osl_closures.cpp @@ -156,7 +156,7 @@ BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refra BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction) BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(HairReflectionClosure, unused), + CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.N), CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness1), CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness2), CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T), @@ -164,7 +164,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection) BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, HairBsdf, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, unused), + CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, params.N), CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness1), CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness2), CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T), @@ -191,7 +191,7 @@ BSDF_CLOSURE_CLASS_END(PrincipledSheen, principled_sheen) class PrincipledClearcoatClosure : public CBSDFClosure { public: MicrofacetBsdf params; 
- float clearcoat, clearcoat_gloss; + float clearcoat, clearcoat_roughness; MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) { @@ -202,8 +202,8 @@ public: bsdf->ior = 1.5f; - bsdf->alpha_x = 0.1f * (1.0f - clearcoat_gloss) + 0.001f * clearcoat_gloss; - bsdf->alpha_y = 0.1f * (1.0f - clearcoat_gloss) + 0.001f * clearcoat_gloss; + bsdf->alpha_x = clearcoat_roughness; + bsdf->alpha_y = clearcoat_roughness; bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f); bsdf->extra->clearcoat = clearcoat; @@ -217,7 +217,7 @@ public: void setup(ShaderData *sd, int path_flag, float3 weight) { MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); - sd->flag |= (bsdf) ? bsdf_microfacet_ggx_clearcoat_setup(bsdf) : 0; + sd->flag |= (bsdf) ? bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd) : 0; } }; @@ -226,7 +226,7 @@ ClosureParam *closure_bsdf_principled_clearcoat_params() static ClosureParam params[] = { CLOSURE_FLOAT3_PARAM(PrincipledClearcoatClosure, params.N), CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat), - CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat_gloss), + CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat_roughness), CLOSURE_STRING_KEYPARAM(PrincipledClearcoatClosure, label, "label"), CLOSURE_FINISH_PARAM(PrincipledClearcoatClosure) }; @@ -389,7 +389,7 @@ public: void setup(ShaderData *sd, int path_flag, float3 weight) { MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); - sd->flag |= (bsdf) ? bsdf_microfacet_ggx_fresnel_setup(bsdf) : 0; + sd->flag |= (bsdf) ? bsdf_microfacet_ggx_fresnel_setup(bsdf, sd) : 0; } }; @@ -413,7 +413,7 @@ public: void setup(ShaderData *sd, int path_flag, float3 weight) { MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); - sd->flag |= (bsdf) ? bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf) : 0; + sd->flag |= (bsdf) ? 
bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd) : 0; } }; @@ -566,7 +566,7 @@ public: void setup(ShaderData *sd, int path_flag, float3 weight) { MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); - sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_fresnel_setup(bsdf) : 0; + sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd) : 0; } }; @@ -590,7 +590,7 @@ public: void setup(ShaderData *sd, int path_flag, float3 weight) { MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); - sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf) : 0; + sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd) : 0; } }; @@ -618,7 +618,7 @@ public: void setup(ShaderData *sd, int path_flag, float3 weight) { MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); - sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf) : 0; + sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd) : 0; } }; diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h index 02c083a83f8..9585d9f4825 100644 --- a/intern/cycles/kernel/osl/osl_globals.h +++ b/intern/cycles/kernel/osl/osl_globals.h @@ -86,7 +86,7 @@ struct OSLThreadData { OSL::ShaderGlobals globals; OSL::PerThreadInfo *osl_thread_info; OSLTraceData tracedata; - OSL::ShadingContext *context[SHADER_CONTEXT_NUM]; + OSL::ShadingContext *context; OIIO::TextureSystem::Perthread *oiio_thread_info; }; diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index b767c60c617..8ad2e12b067 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -824,7 +824,7 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData * bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val) { - if(sg->renderstate == NULL) + if(sg == NULL || 
sg->renderstate == NULL) return false; ShaderData *sd = (ShaderData *)(sg->renderstate); @@ -1197,8 +1197,9 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg, tracedata->init = true; tracedata->sd.osl_globals = sd->osl_globals; - /* raytrace */ - return scene_intersect(sd->osl_globals, ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f); + /* Raytrace, leaving out shadow opaque to avoid early exit. */ + uint visibility = PATH_RAY_ALL_VISIBILITY - PATH_RAY_SHADOW_OPAQUE; + return scene_intersect(sd->osl_globals, ray, visibility, &tracedata->isect, NULL, 0.0f, 0.0f); } diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 13b19d86eca..9a37e0987aa 100644 --- a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -57,9 +57,7 @@ void OSLShader::thread_init(KernelGlobals *kg, KernelGlobals *kernel_globals, OS tdata->globals.tracedata = &tdata->tracedata; tdata->globals.flipHandedness = false; tdata->osl_thread_info = ss->create_thread_info(); - - for(int i = 0; i < SHADER_CONTEXT_NUM; i++) - tdata->context[i] = ss->get_context(tdata->osl_thread_info); + tdata->context = ss->get_context(tdata->osl_thread_info); tdata->oiio_thread_info = osl_globals->ts->get_perthread_info(); @@ -74,9 +72,7 @@ void OSLShader::thread_free(KernelGlobals *kg) OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSLThreadData *tdata = kg->osl_tdata; - - for(int i = 0; i < SHADER_CONTEXT_NUM; i++) - ss->release_context(tdata->context[i]); + ss->release_context(tdata->context); ss->destroy_thread_info(tdata->osl_thread_info); @@ -173,7 +169,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, } } -void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx) +void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) { /* setup shader globals from shader data */ 
OSLThreadData *tdata = kg->osl_tdata; @@ -182,7 +178,7 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state /* execute shader for this point */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; int shader = sd->shader & SHADER_MASK; /* automatic bump shader */ @@ -274,7 +270,7 @@ static void flatten_background_closure_tree(ShaderData *sd, } } -void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx) +void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -283,7 +279,7 @@ void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *st /* execute shader for this point */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; if(kg->osl->background_state) { ss->execute(octx, *(kg->osl->background_state), *globals); @@ -329,7 +325,7 @@ static void flatten_volume_closure_tree(ShaderData *sd, } } -void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx) +void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -338,7 +334,7 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, /* execute shader */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; int shader = 
sd->shader & SHADER_MASK; if(kg->osl->volume_state[shader]) { @@ -352,7 +348,7 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, /* Displacement */ -void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx) +void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -364,7 +360,7 @@ void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderConte /* execute shader */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; int shader = sd->shader & SHADER_MASK; if(kg->osl->displacement_state[shader]) { diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h index 32121e940b4..f7020d1223d 100644 --- a/intern/cycles/kernel/osl/osl_shader.h +++ b/intern/cycles/kernel/osl/osl_shader.h @@ -53,10 +53,10 @@ public: static void thread_free(KernelGlobals *kg); /* eval */ - static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx); - static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx); - static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx); - static void eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx); + static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); + static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); + static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); + static void eval_displacement(KernelGlobals *kg, ShaderData *sd); /* attributes */ static int find_attribute(KernelGlobals *kg, const 
ShaderData *sd, uint id, AttributeDescriptor *desc); diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl index 57f40789d49..6870d479af3 100644 --- a/intern/cycles/kernel/shaders/node_principled_bsdf.osl +++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl @@ -32,7 +32,7 @@ shader node_principled_bsdf( float Sheen = 0.0, float SheenTint = 0.5, float Clearcoat = 0.0, - float ClearcoatGloss = 1.0, + float ClearcoatRoughness = 0.03, float IOR = 1.45, float Transmission = 0.0, float TransmissionRoughness = 0.0, @@ -57,8 +57,8 @@ shader node_principled_bsdf( if (diffuse_weight > 1e-5) { if (Subsurface > 1e-5) { - color Albedo = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface); - BSDF = bssrdf_principled(Normal, Subsurface * SubsurfaceRadius, 0.0, Albedo, Roughness); + color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface); + BSDF = mixed_ss_base_color * bssrdf_principled(Normal, Subsurface * SubsurfaceRadius, 0.0, SubsurfaceColor, Roughness); } else { BSDF = BaseColor * principled_diffuse(Normal, Roughness); } @@ -76,8 +76,8 @@ shader node_principled_bsdf( float aspect = sqrt(1.0 - Anisotropic * 0.9); float r2 = Roughness * Roughness; - float alpha_x = max(0.001, r2 / aspect); - float alpha_y = max(0.001, r2 * aspect); + float alpha_x = r2 / aspect; + float alpha_y = r2 * aspect; color tmp_col = color(1.0, 1.0, 1.0) * (1.0 - SpecularTint) + m_ctint * SpecularTint; @@ -114,7 +114,7 @@ shader node_principled_bsdf( } if (Clearcoat > 1e-5) { - BSDF = BSDF + principled_clearcoat(ClearcoatNormal, Clearcoat, ClearcoatGloss); + BSDF = BSDF + principled_clearcoat(ClearcoatNormal, Clearcoat, ClearcoatRoughness * ClearcoatRoughness); } } diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h index 289d1091b0a..c91d2918687 100644 --- a/intern/cycles/kernel/shaders/stdosl.h +++ b/intern/cycles/kernel/shaders/stdosl.h @@ 
-546,7 +546,7 @@ closure color holdout() BUILTIN; closure color ambient_occlusion() BUILTIN; closure color principled_diffuse(normal N, float roughness) BUILTIN; closure color principled_sheen(normal N) BUILTIN; -closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_gloss) BUILTIN; +closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN; // BSSRDF closure color bssrdf_cubic(normal N, vector radius, float texture_blur, float sharpness) BUILTIN; diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h index dc74a2ada53..9fe4ec18e9e 100644 --- a/intern/cycles/kernel/split/kernel_branched.h +++ b/intern/cycles/kernel/split/kernel_branched.h @@ -63,17 +63,53 @@ ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobal REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT); } +ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, int ray_index) +{ + ccl_global char *ray_state = kernel_split_state.ray_state; + + int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, kernel_split_params.queue_index); + + if(!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) { + return false; + } + +#define SPLIT_DATA_ENTRY(type, name, num) \ + kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index]; + SPLIT_DATA_ENTRIES_BRANCHED_SHARED +#undef SPLIT_DATA_ENTRY + + kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0; + kernel_split_state.branched_state[inactive_ray].original_ray = ray_index; + kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false; + + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray]; + + path_radiance_init(inactive_L, kernel_data.film.use_light_pass); + 
inactive_L->direct_throughput = L->direct_throughput; + path_radiance_copy_indirect(inactive_L, L); + + ray_state[inactive_ray] = RAY_REGENERATED; + ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED); + ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)); + + atomic_fetch_and_inc_uint32((ccl_global uint*)&kernel_split_state.branched_state[ray_index].shared_sample_count); + + return true; +} + /* bounce off surface and integrate indirect light */ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg, int ray_index, float num_samples_adjust, ShaderData *saved_sd, - bool reset_path_state) + bool reset_path_state, + bool wait_for_shared) { SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; ShaderData *sd = saved_sd; - RNG rng = kernel_split_state.rng[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; float3 throughput = branched_state->throughput; ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; @@ -120,20 +156,20 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter( num_samples = ceil_to_int(num_samples_adjust*num_samples); float num_samples_inv = num_samples_adjust/num_samples; - RNG bsdf_rng = cmj_hash(rng, i); for(int j = branched_state->next_sample; j < num_samples; j++) { if(reset_path_state) { *ps = branched_state->path_state; } + ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i); + ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; *tp = throughput; ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index]; if(!kernel_branched_path_surface_bounce(kg, - &bsdf_rng, sd, sc, j, @@ -147,6 +183,8 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter( continue; } + ps->rng_hash = branched_state->path_state.rng_hash; + /* update state for next iteration */ branched_state->next_closure = i; 
branched_state->next_sample = j+1; @@ -155,12 +193,25 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter( /* start the indirect path */ *tp *= num_samples_inv; + if(kernel_split_branched_indirect_start_shared(kg, ray_index)) { + continue; + } + return true; } branched_state->next_sample = 0; } + branched_state->next_closure = sd->num_closure; + + if(wait_for_shared) { + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + } + return false; } diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h index 4c1fdd2d69c..3b61319e349 100644 --- a/intern/cycles/kernel/split/kernel_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_buffer_update.h @@ -75,57 +75,34 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, if(ray_index != QUEUE_EMPTY_SLOT) { #endif - ccl_global uint *rng_state = kernel_split_params.rng_state; int stride = kernel_split_params.stride; ccl_global char *ray_state = kernel_split_state.ray_state; -#ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &kernel_split_state.debug_data[ray_index]; -#endif ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; - ccl_global float *buffer = kernel_split_params.buffer; - - unsigned int work_index; - ccl_global uint *initial_rng; - - unsigned int sample; - unsigned int tile_x; - unsigned int tile_y; - unsigned int pixel_x; - unsigned int pixel_y; - - work_index = kernel_split_state.work_array[ray_index]; - sample = get_work_sample(kg, work_index, ray_index) + 
kernel_split_params.start_sample; - get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, - &tile_x, &tile_y, - work_index, - ray_index); - initial_rng = rng_state; - - rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride; - buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { -#ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, buffer, state, debug_data, sample); -#endif + uint work_index = kernel_split_state.work_array[ray_index]; + uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; + + uint tile_x, tile_y, pixel_x, pixel_y; + get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index); + + ccl_global float *buffer = kernel_split_params.buffer; + buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; /* accumulate result in output buffer */ bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER); - kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher); - - path_rng_end(kg, rng_state, rng); + kernel_write_result(kg, buffer, sample, L, is_shadow_catcher); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); } if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { /* We have completed current work; So get next work */ + uint work_index; int valid_work = get_next_work(kg, &work_index, ray_index); if(!valid_work) { /* If work is invalid, this means no more work is available and the thread may exit */ @@ -135,32 +112,33 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { kernel_split_state.work_array[ray_index] = work_index; /* Get the sample associated with the current work */ - sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; + uint sample = get_work_sample(kg, work_index, ray_index) + 
kernel_split_params.start_sample; /* Get pixel and tile position associated with current work */ + uint tile_x, tile_y, pixel_x, pixel_y; get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index); /* Remap rng_state according to the current work */ - rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride; + ccl_global uint *rng_state = kernel_split_params.rng_state; + rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride; + /* Remap buffer according to the current work */ + ccl_global float *buffer = kernel_split_params.buffer; buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; /* Initialize random numbers and ray. */ - kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, &rng, ray); + uint rng_hash; + kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, &rng_hash, ray); if(ray->t != 0.0f) { - /* Initialize throughput, L_transparent, Ray, PathState; + /* Initialize throughput, path radiance, Ray, PathState; * These rays proceed with path-iteration. */ *throughput = make_float3(1.0f, 1.0f, 1.0f); - *L_transparent = 0.0f; path_radiance_init(L, kernel_data.film.use_light_pass); - path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &rng, sample, ray); + path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, rng_hash, sample, ray); #ifdef __SUBSURFACE__ kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]); #endif -#ifdef __KERNEL_DEBUG__ - debug_data_init(debug_data); -#endif ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); enqueue_flag = 1; } @@ -169,13 +147,11 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* Accumulate result in output buffer. 
*/ kernel_write_pass_float4(buffer, sample, L_rad); - path_rng_end(kg, rng_state, rng); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); } } } - kernel_split_state.rng[ray_index] = rng; #ifndef __COMPUTE_DEVICE_GPU__ } diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index e4545d66eff..2c042dfde6f 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ b/intern/cycles/kernel/split/kernel_data_init.h @@ -52,9 +52,7 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( ccl_global uint *rng_state, #ifdef __KERNEL_OPENCL__ -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, #endif int start_sample, @@ -100,9 +98,8 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state); #ifdef __KERNEL_OPENCL__ -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); #endif int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); @@ -127,14 +124,25 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( /* zero the tiles pixels and initialize rng_state if this is the first sample */ if(start_sample == 0) { - parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) { - int pixel = i / kernel_data.film.pass_stride; - int pass = i % kernel_data.film.pass_stride; + int pass_stride = kernel_data.film.pass_stride; + +#ifdef __KERNEL_CPU__ + for(int y = sy; y < sy + sh; y++) { + int index = offset + y * stride; + memset(buffer + (sx + index) * pass_stride, 0, sizeof(float) * pass_stride * sw); + for(int x = sx; x < sx + sw; x++) { + rng_state[index + x] = hash_int_2d(x, y); + } + } +#else + parallel_for(kg, i, sw * sh * pass_stride) { + int pixel = i / pass_stride; + int pass = i % pass_stride; int x = sx + pixel % sw; int y = sy + pixel / sw; - 
int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass; + int index = (offset + x + y*stride) * pass_stride + pass; *(buffer + index) = 0.0f; } @@ -146,6 +154,7 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( int index = (offset + x + y*stride); *(rng_state + index) = hash_int_2d(x, y); } +#endif } #endif /* KERENL_STUB */ diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h index 3336c968a44..8e3f7555550 100644 --- a/intern/cycles/kernel/split/kernel_direct_lighting.h +++ b/intern/cycles/kernel/split/kernel_direct_lighting.h @@ -62,8 +62,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, /* direct lighting */ #ifdef __EMISSION__ - RNG rng = kernel_split_state.rng[ray_index]; - bool flag = (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)); @@ -83,10 +81,10 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, if(flag) { /* Sample illumination from lights to find path contribution. 
*/ - float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT); + float light_t = path_state_rng_1D(kg, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, &rng, state, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_state_rng_light_termination(kg, &rng, state); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_state_rng_light_termination(kg, state); LightSample ls; if(light_sample(kg, @@ -115,7 +113,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, } } } - kernel_split_state.rng[ray_index] = rng; #endif /* __EMISSION__ */ } diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h index 694b777f429..478d83d633e 100644 --- a/intern/cycles/kernel/split/kernel_do_volume.h +++ b/intern/cycles/kernel/split/kernel_do_volume.h @@ -30,7 +30,6 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; @@ -58,15 +57,15 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K /* integrate along volume segment with distance sampling */ VolumeIntegrateResult result = kernel_volume_integrate( - kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous); + kg, ps, sd, &volume_ray, L, tp, heterogeneous); # ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* direct lighting */ - kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L); + kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L); /* indirect light bounce */ - if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) { + 
if(!kernel_path_volume_bounce(kg, sd, tp, ps, L, pray)) { continue; } @@ -75,11 +74,30 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K branched_state->next_sample = j+1; branched_state->num_samples = num_samples; + /* Attempting to share too many samples is slow for volumes as it causes us to + * loop here more and have many calls to kernel_volume_integrate which evaluates + * shaders. The many expensive shader evaluations cause the work load to become + * unbalanced and many threads to become idle in this kernel. Limiting the + * number of shared samples here helps quite a lot. + */ + if(branched_state->shared_sample_count < 2) { + if(kernel_split_branched_indirect_start_shared(kg, ray_index)) { + continue; + } + } + return true; } # endif } + branched_state->next_sample = num_samples; + + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + kernel_split_branched_path_indirect_loop_end(kg, ray_index); /* todo: avoid this calculation using decoupled ray marching */ @@ -122,7 +140,6 @@ ccl_device void kernel_do_volume(KernelGlobals *kg) IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; ccl_global Intersection *isect = &kernel_split_state.isect[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; @@ -146,15 +163,15 @@ ccl_device void kernel_do_volume(KernelGlobals *kg) { /* integrate along volume segment with distance sampling */ VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous); + kg, state, sd, &volume_ray, L, throughput, heterogeneous); # ifdef __VOLUME_SCATTER__ if(result == 
VOLUME_PATH_SCATTERED) { /* direct lighting */ - kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L); + kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L); /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) { + if(kernel_path_volume_bounce(kg, sd, throughput, state, L, ray)) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); } else { @@ -175,8 +192,6 @@ ccl_device void kernel_do_volume(KernelGlobals *kg) } # endif /* __BRANCHED_PATH__ */ } - - kernel_split_state.rng[ray_index] = rng; } # ifdef __BRANCHED_PATH__ diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h new file mode 100644 index 00000000000..496355bbc3a --- /dev/null +++ b/intern/cycles/kernel/split/kernel_enqueue_inactive.h @@ -0,0 +1,46 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_enqueue_inactive(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ +#ifdef __BRANCHED_PATH__ + /* Enqeueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. 
*/ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + char enqueue_flag = 0; + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) { + enqueue_flag = 1; + } + + enqueue_ray_index_local(ray_index, + QUEUE_INACTIVE_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif /* __BRANCHED_PATH__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 670a557f084..253b78526e7 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -92,29 +92,19 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( int stride = kernel_split_params.stride; - unsigned int work_index; - unsigned int pixel_x; - unsigned int pixel_y; - - unsigned int tile_x; - unsigned int tile_y; - unsigned int sample; - - RNG rng = kernel_split_state.rng[ray_index]; ccl_global PathState *state = 0x0; float3 throughput; + uint sample; ccl_global char *ray_state = kernel_split_state.ray_state; ShaderData *sd = &kernel_split_state.sd[ray_index]; ccl_global float *buffer = kernel_split_params.buffer; if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - - throughput = kernel_split_state.throughput[ray_index]; - state = &kernel_split_state.path_state[ray_index]; - - work_index = kernel_split_state.work_array[ray_index]; + uint work_index = kernel_split_state.work_array[ray_index]; sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; + + uint pixel_x, pixel_y, tile_x, tile_y; get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, 
&tile_x, &tile_y, work_index, @@ -122,20 +112,31 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride; + throughput = kernel_split_state.throughput[ray_index]; + state = &kernel_split_state.path_state[ray_index]; + #ifdef __SHADOW_TRICKS__ if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { if(state->flag & PATH_RAY_CAMERA) { - state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO); - state->catcher_object = sd->object; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + state->flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_STORE_SHADOW_INFO); if(!kernel_data.background.transparent) { - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - L->shadow_color = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray); + L->shadow_background_color = indirect_background( + kg, + &kernel_split_state.sd_DL_shadow[ray_index], + state, + ray); } + L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L); + L->shadow_throughput = average(throughput); } } - else { - state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY; + else if(state->flag & PATH_RAY_SHADOW_CATCHER) { + /* Only update transparency after shadow catcher bounce. 
*/ + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + L->shadow_transparency *= average(shader_bsdf_transparency(kg, sd)); } #endif /* __SHADOW_TRICKS__ */ @@ -154,7 +155,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( holdout_weight = shader_holdout_eval(kg, sd); } /* any throughput is ok, should all be identical here */ - kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput); + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + L->transparent += average(holdout_weight*throughput); } if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { kernel_split_path_end(kg, ray_index); @@ -216,19 +218,19 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( * shader evaluations, only need emission if we are going to terminate. */ #ifndef __BRANCHED_PATH__ - float probability = path_state_terminate_probability(kg, state, throughput); + float probability = path_state_continuation_probability(kg, state, throughput); #else float probability = 1.0f; if(!kernel_data.integrator.branched) { - probability = path_state_terminate_probability(kg, state, throughput); + probability = path_state_continuation_probability(kg, state, throughput); } else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { int num_samples = kernel_split_state.branched_state[ray_index].num_samples; - probability = path_state_terminate_probability(kg, state, throughput*num_samples); + probability = path_state_continuation_probability(kg, state, throughput*num_samples); } else if(state->flag & PATH_RAY_TRANSPARENT) { - probability = path_state_terminate_probability(kg, state, throughput); + probability = path_state_continuation_probability(kg, state, throughput); } #endif @@ -238,7 +240,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, &rng, state, 
PRNG_TERMINATE); + float terminate = path_state_rng_1D_for_decision(kg, state, PRNG_TERMINATE); if(terminate >= probability) { kernel_split_path_end(kg, ray_index); } @@ -260,8 +262,6 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( } #endif /* __AO__ */ - kernel_split_state.rng[ray_index] = rng; - #ifndef __COMPUTE_DEVICE_GPU__ } #endif diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h index f0ebb90f60a..04d5769ef0d 100644 --- a/intern/cycles/kernel/split/kernel_indirect_background.h +++ b/intern/cycles/kernel/split/kernel_indirect_background.h @@ -54,12 +54,11 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index]; if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { /* eval background shader if nothing hit */ if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) { - *L_transparent = (*L_transparent) + average((*throughput)); + L->transparent += average((*throughput)); #ifdef __PASSES__ if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) #endif diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h index 71017fed19e..4e0c966cca9 100644 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -126,7 +126,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, if(active) { ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; 
ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; @@ -135,7 +134,7 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { #endif /* Compute direct lighting and next bounce. */ - if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) { + if(!kernel_path_surface_bounce(kg, sd, throughput, state, L, ray)) { kernel_split_path_end(kg, ray_index); } #ifdef __BRANCHED_PATH__ @@ -147,6 +146,7 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, ray_index, 1.0f, &kernel_split_state.branched_state[ray_index].sd, + true, true)) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); @@ -156,8 +156,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, } } #endif /* __BRANCHED_PATH__ */ - - kernel_split_state.rng[ray_index] = rng; } /* Enqueue RAY_UPDATE_BUFFER rays. */ @@ -193,6 +191,7 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, ray_index, 1.0f, &kernel_split_state.branched_state[ray_index].sd, + true, true)) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h index a7ecde7c80d..c75931855b2 100644 --- a/intern/cycles/kernel/split/kernel_path_init.h +++ b/intern/cycles/kernel/split/kernel_path_init.h @@ -29,13 +29,7 @@ ccl_device void kernel_path_init(KernelGlobals *kg) { */ kernel_split_state.ray_state[ray_index] = RAY_ACTIVE; - unsigned int my_sample; - unsigned int pixel_x; - unsigned int pixel_y; - unsigned int tile_x; - unsigned int tile_y; - - unsigned int work_index = 0; + uint work_index = 0; /* Get work. 
*/ if(!get_next_work(kg, &work_index, ray_index)) { /* No more work, mark ray as inactive */ @@ -45,9 +39,10 @@ ccl_device void kernel_path_init(KernelGlobals *kg) { } /* Get the sample associated with the work. */ - my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; + uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; /* Get pixel and tile position associated with the work. */ + uint pixel_x, pixel_y, tile_x, tile_y; get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, @@ -60,46 +55,38 @@ ccl_device void kernel_path_init(KernelGlobals *kg) { ccl_global float *buffer = kernel_split_params.buffer; buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride; - RNG rng = kernel_split_state.rng[ray_index]; - /* Initialize random numbers and ray. */ + uint rng_hash; kernel_path_trace_setup(kg, rng_state, - my_sample, + sample, pixel_x, pixel_y, - &rng, + &rng_hash, &kernel_split_state.ray[ray_index]); if(kernel_split_state.ray[ray_index].t != 0.0f) { - /* Initialize throughput, L_transparent, Ray, PathState; + /* Initialize throughput, path radiance, Ray, PathState; * These rays proceed with path-iteration. */ kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f); - kernel_split_state.L_transparent[ray_index] = 0.0f; path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass); path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], &kernel_split_state.path_state[ray_index], - &rng, - my_sample, + rng_hash, + sample, &kernel_split_state.ray[ray_index]); #ifdef __SUBSURFACE__ kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]); #endif - -#ifdef __KERNEL_DEBUG__ - debug_data_init(&kernel_split_state.debug_data[ray_index]); -#endif } else { /* These rays do not participate in path-iteration. 
*/ float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* Accumulate result in output buffer. */ - kernel_write_pass_float4(buffer, my_sample, L_rad); - path_rng_end(kg, rng_state, kernel_split_state.rng[ray_index]); + kernel_write_pass_float4(buffer, sample, L_rad); ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE); } - kernel_split_state.rng[ray_index] = rng; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h index e2e841f36d3..66ce2dfb6f1 100644 --- a/intern/cycles/kernel/split/kernel_queue_enqueue.h +++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h @@ -51,7 +51,8 @@ ccl_device void kernel_queue_enqueue(KernelGlobals *kg, int queue_number = -1; if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER)) { + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) { queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; } else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h index 5dc94caec85..d0afd39ef29 100644 --- a/intern/cycles/kernel/split/kernel_scene_intersect.h +++ b/intern/cycles/kernel/split/kernel_scene_intersect.h @@ -43,15 +43,22 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg) } /* All regenerated rays become active here */ - if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) { +#ifdef __BRANCHED_PATH__ + if(kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) { + kernel_split_path_end(kg, ray_index); + } + else +#endif /* 
__BRANCHED_PATH__ */ + { + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); + } + } - if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) + if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { return; + } -#ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &kernel_split_state.debug_data[ray_index]; -#endif Intersection isect; PathState state = kernel_split_state.path_state[ray_index]; Ray ray = kernel_split_state.ray[ray_index]; @@ -67,7 +74,6 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg) #ifdef __HAIR__ float difl = 0.0f, extmax = 0.0f; uint lcg_state = 0; - RNG rng = kernel_split_state.rng[ray_index]; if(kernel_data.bvh.have_curves) { if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { @@ -77,7 +83,7 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg) } extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(&rng, state.rng_offset, state.sample, 0x51633e2d); + lcg_state = lcg_state_init(&state, 0x51633e2d); } bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax); @@ -87,12 +93,14 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg) kernel_split_state.isect[ray_index] = isect; #ifdef __KERNEL_DEBUG__ + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + if(state.flag & PATH_RAY_CAMERA) { - debug_data->num_bvh_traversed_nodes += isect.num_traversed_nodes; - debug_data->num_bvh_traversed_instances += isect.num_traversed_instances; - debug_data->num_bvh_intersections += isect.num_intersections; + L->debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes; + L->debug_data.num_bvh_traversed_instances += isect.num_traversed_instances; + L->debug_data.num_bvh_intersections += isect.num_intersections; } - debug_data->num_ray_bounces++; + L->debug_data.num_ray_bounces++; #endif if(!hit) { diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h index 
2801b32f285..eac29dcd0d1 100644 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -48,30 +48,22 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg) ccl_global char *ray_state = kernel_split_state.ray_state; if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - RNG rng = kernel_split_state.rng[ray_index]; ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; #ifndef __BRANCHED_PATH__ - float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF); - shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN); + float rbsdf = path_state_rng_1D_for_decision(kg, state, PRNG_BSDF); + shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, rbsdf, state->flag); #else - ShaderContext ctx = SHADER_CONTEXT_MAIN; float rbsdf = 0.0f; if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { - rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF); + rbsdf = path_state_rng_1D_for_decision(kg, state, PRNG_BSDF); } - if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { - ctx = SHADER_CONTEXT_INDIRECT; - } - - shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx); + shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, rbsdf, state->flag); shader_merge_closures(&kernel_split_state.sd[ray_index]); #endif /* __BRANCHED_PATH__ */ - - kernel_split_state.rng[ray_index] = rng; } } diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h index 297decb0bc2..5a55b680695 100644 --- a/intern/cycles/kernel/split/kernel_shader_sort.h +++ b/intern/cycles/kernel/split/kernel_shader_sort.h @@ -39,7 +39,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg, ccl_local ushort *local_index = &locals->local_index[0]; /* copy to local memory */ - for (uint i = 0; i < 
SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { uint idx = offset + i + lid; uint add = input + idx; uint value = (~0); @@ -59,9 +59,9 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg, # ifdef __KERNEL_OPENCL__ /* bitonic sort */ - for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { - for (uint inc = length; inc > 0; inc >>= 1) { - for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { + for(uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { + for(uint inc = length; inc > 0; inc >>= 1) { + for(uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { uint i = lid + ii; bool direction = ((i & (length << 1)) != 0); uint j = i ^ inc; @@ -81,7 +81,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg, # endif /* __KERNEL_OPENCL__ */ /* copy to destination */ - for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { uint idx = offset + i + lid; uint lidx = local_index[i + lid]; uint outi = output + idx; diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h index 474286285a9..79aa2c9435b 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h @@ -37,21 +37,18 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg) ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; float3 throughput = kernel_split_state.throughput[ray_index]; #ifdef __BRANCHED_PATH__ if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { 
#endif - kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd)); + kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd)); #ifdef __BRANCHED_PATH__ } else { - kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput); + kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput); } #endif - - kernel_split_state.rng[ray_index] = rng; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h index 386fbbc4d09..b52f9a5eb81 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h @@ -29,6 +29,14 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) kernel_split_state.queue_data, kernel_split_params.queue_size, 1); } +#ifdef __BRANCHED_PATH__ + /* TODO(mai): move this somewhere else? */ + if(thread_index == 0) { + /* Clear QUEUE_INACTIVE_RAYS before next kernel. 
*/ + kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0; + } +#endif /* __BRANCHED_PATH__ */ + if(ray_index == QUEUE_EMPTY_SLOT) return; @@ -37,7 +45,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; float3 throughput = kernel_split_state.throughput[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; @@ -67,7 +74,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) if(use_branched) { kernel_branched_path_surface_connect_light(kg, - &rng, sd, emission_sd, state, @@ -83,10 +89,11 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) float3 shadow; if(!shadow_blocked(kg, - emission_sd, - state, - &ray, - &shadow)) + sd, + emission_sd, + state, + &ray, + &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); @@ -95,8 +102,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) path_radiance_accum_total_light(L, state, throughput, &L_light); } } - - kernel_split_state.rng[ray_index] = rng; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index 57f070d51e0..08f0124b529 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -56,7 +56,20 @@ ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index) ccl_global char *ray_state = kernel_split_state.ray_state; #ifdef __BRANCHED_PATH__ - if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) { + if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) { + int orig_ray = kernel_split_state.branched_state[ray_index].original_ray; + + PathRadiance *L = 
&kernel_split_state.path_radiance[ray_index]; + PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray]; + + path_radiance_sum_indirect(L); + path_radiance_accum_sample(orig_ray_L, L, 1); + + atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER); } else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) { diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h index bb1aca2acbf..3eae884d479 100644 --- a/intern/cycles/kernel/split/kernel_split_data_types.h +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -56,14 +56,6 @@ typedef struct SplitParams { /* SPLIT_DATA_ENTRY(type, name, num) */ -#if defined(WITH_CYCLES_DEBUG) || defined(__KERNEL_DEBUG__) -/* DebugData memory */ -# define SPLIT_DATA_DEBUG_ENTRIES \ - SPLIT_DATA_ENTRY(DebugData, debug_data, 1) -#else -# define SPLIT_DATA_DEBUG_ENTRIES -#endif /* DEBUG */ - #ifdef __BRANCHED_PATH__ typedef ccl_global struct SplitBranchedState { @@ -95,6 +87,10 @@ typedef ccl_global struct SplitBranchedState { VolumeStack volume_stack[VOLUME_STACK_SIZE]; # endif /* __VOLUME__ */ #endif /*__SUBSURFACE__ */ + + int shared_sample_count; /* number of branched samples shared with other threads */ + int original_ray; /* index of original ray when sharing branched samples */ + bool waiting_on_shared_samples; } SplitBranchedState; #define SPLIT_DATA_BRANCHED_ENTRIES \ @@ -118,9 +114,7 @@ typedef ccl_global struct SplitBranchedState { #endif /* __VOLUME__ */ #define SPLIT_DATA_ENTRIES \ - SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \ SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ - SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \ SPLIT_DATA_ENTRY(PathRadiance, 
path_radiance, 1) \ SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ @@ -135,7 +129,22 @@ typedef ccl_global struct SplitBranchedState { SPLIT_DATA_SUBSURFACE_ENTRIES \ SPLIT_DATA_VOLUME_ENTRIES \ SPLIT_DATA_BRANCHED_ENTRIES \ - SPLIT_DATA_DEBUG_ENTRIES \ + +/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) */ +#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \ + SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ + SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ + SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ + SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ + SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \ + SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_BRANCHED_ENTRIES \ /* struct that holds pointers to data in the shared state buffer */ typedef struct SplitData { diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h index 1dffe1b179e..a487e53df5c 100644 --- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h +++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h @@ -38,7 +38,6 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; ShaderData *sd = &branched_state->sd; - RNG rng = kernel_split_state.rng[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; @@ -52,14 +51,12 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it if(branched_state->ss_next_sample == 0 && 
branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) { - branched_state->lcg_state = lcg_state_init(&rng, - branched_state->path_state.rng_offset, - branched_state->path_state.sample, - 0x68bc21eb); + branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state, + 0x68bc21eb); } int num_samples = kernel_data.integrator.subsurface_samples; float num_samples_inv = 1.0f/num_samples; - RNG bssrdf_rng = cmj_hash(rng, i); + uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i); /* do subsurface scatter step with copy of shader data, this will * replace the BSSRDF with a diffuse BSDF closure */ @@ -67,7 +64,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect; float bssrdf_u, bssrdf_v; path_branched_rng_2D(kg, - &bssrdf_rng, + bssrdf_rng_hash, &branched_state->path_state, j, num_samples, @@ -77,7 +74,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it /* intersection is expensive so avoid doing multiple times for the same input */ if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) { - RNG lcg_state = branched_state->lcg_state; + uint lcg_state = branched_state->lcg_state; SubsurfaceIntersection ss_isect_private; branched_state->num_hits = subsurface_scatter_multi_intersect(kg, @@ -152,7 +149,6 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it int all = (kernel_data.integrator.sample_all_lights_direct) || (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light(kg, - &rng, bssrdf_sd, emission_sd, hit_state, @@ -169,6 +165,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it ray_index, num_samples_inv, bssrdf_sd, + false, false)) { branched_state->ss_next_closure = i; @@ -187,6 
+184,13 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it branched_state->ss_next_sample = 0; } + branched_state->ss_next_closure = sd->num_closure; + + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + kernel_split_branched_path_indirect_loop_end(kg, ray_index); return false; @@ -221,7 +225,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; @@ -238,7 +241,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) emission_sd, L, state, - &rng, ray, throughput, ss_indirect)) @@ -256,22 +258,20 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) /* do bssrdf scatter step if we picked a bssrdf closure */ if(sc) { - uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb); - + uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb); float bssrdf_u, bssrdf_v; path_state_rng_2D(kg, - &rng, - state, - PRNG_BSDF_U, - &bssrdf_u, &bssrdf_v); + state, + PRNG_BSDF_U, + &bssrdf_u, &bssrdf_v); subsurface_scatter_step(kg, - sd, - state, - state->flag, - sc, - &lcg_state, - bssrdf_u, bssrdf_v, - false); + sd, + state, + state->flag, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + false); } } else { @@ -283,7 +283,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) } #endif } - kernel_split_state.rng[ray_index] = rng; } # ifdef __BRANCHED_PATH__ diff --git a/intern/cycles/kernel/svm/svm_closure.h 
b/intern/cycles/kernel/svm/svm_closure.h index f04f765686e..4268813b263 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -79,13 +79,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * #ifdef __PRINCIPLED__ case CLOSURE_BSDF_PRINCIPLED_ID: { uint specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset, sheen_offset, - sheen_tint_offset, clearcoat_offset, clearcoat_gloss_offset, eta_offset, transmission_offset, + sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset, eta_offset, transmission_offset, anisotropic_rotation_offset, transmission_roughness_offset; uint4 data_node2 = read_node(kg, offset); float3 T = stack_load_float3(stack, data_node.y); decode_node_uchar4(data_node.z, &specular_offset, &roughness_offset, &specular_tint_offset, &anisotropic_offset); - decode_node_uchar4(data_node.w, &sheen_offset, &sheen_tint_offset, &clearcoat_offset, &clearcoat_gloss_offset); + decode_node_uchar4(data_node.w, &sheen_offset, &sheen_tint_offset, &clearcoat_offset, &clearcoat_roughness_offset); decode_node_uchar4(data_node2.x, &eta_offset, &transmission_offset, &anisotropic_rotation_offset, &transmission_roughness_offset); // get Disney principled parameters @@ -98,7 +98,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float sheen = stack_load_float(stack, sheen_offset); float sheen_tint = stack_load_float(stack, sheen_tint_offset); float clearcoat = stack_load_float(stack, clearcoat_offset); - float clearcoat_gloss = stack_load_float(stack, clearcoat_gloss_offset); + float clearcoat_roughness = stack_load_float(stack, clearcoat_roughness_offset); float transmission = stack_load_float(stack, transmission_offset); float anisotropic_rotation = stack_load_float(stack, anisotropic_rotation_offset); float transmission_roughness = stack_load_float(stack, transmission_roughness_offset); @@ -141,8 +141,8 @@ ccl_device void 
svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float3 weight = sd->svm_closure_weight * mix_weight; #ifdef __SUBSURFACE__ - float3 albedo = subsurface_color * subsurface + base_color * (1.0f - subsurface); - float3 subsurf_weight = weight * albedo * diffuse_weight; + float3 mixed_ss_base_color = subsurface_color * subsurface + base_color * (1.0f - subsurface); + float3 subsurf_weight = weight * mixed_ss_base_color * diffuse_weight; float subsurf_sample_weight = fabsf(average(subsurf_weight)); /* disable in case of diffuse ancestor, can't see it well then and @@ -154,12 +154,12 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * /* need to set the base color in this case such that the * rays get the correctly mixed color after transmitting * the object */ - base_color = albedo; + base_color = mixed_ss_base_color; } /* diffuse */ - if(fabsf(average(base_color)) > CLOSURE_WEIGHT_CUTOFF) { - if(subsurface < CLOSURE_WEIGHT_CUTOFF && diffuse_weight > CLOSURE_WEIGHT_CUTOFF) { + if(fabsf(average(mixed_ss_base_color)) > CLOSURE_WEIGHT_CUTOFF) { + if(subsurface <= CLOSURE_WEIGHT_CUTOFF && diffuse_weight > CLOSURE_WEIGHT_CUTOFF) { float3 diff_weight = weight * base_color * diffuse_weight; PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight); @@ -186,7 +186,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->sample_weight = subsurf_sample_weight; bssrdf->radius = radius.x; bssrdf->texture_blur = texture_blur; - bssrdf->albedo = albedo.x; + bssrdf->albedo = subsurface_color.x; bssrdf->sharpness = sharpness; bssrdf->N = N; bssrdf->roughness = roughness; @@ -200,7 +200,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->sample_weight = subsurf_sample_weight; bssrdf->radius = radius.y; bssrdf->texture_blur = texture_blur; - bssrdf->albedo = albedo.y; + bssrdf->albedo = subsurface_color.y; 
bssrdf->sharpness = sharpness; bssrdf->N = N; bssrdf->roughness = roughness; @@ -214,7 +214,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->sample_weight = subsurf_sample_weight; bssrdf->radius = radius.z; bssrdf->texture_blur = texture_blur; - bssrdf->albedo = albedo.z; + bssrdf->albedo = subsurface_color.z; bssrdf->sharpness = sharpness; bssrdf->N = N; bssrdf->roughness = roughness; @@ -280,8 +280,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float aspect = safe_sqrtf(1.0f - anisotropic * 0.9f); float r2 = roughness * roughness; - bsdf->alpha_x = fmaxf(0.001f, r2 / aspect); - bsdf->alpha_y = fmaxf(0.001f, r2 * aspect); + bsdf->alpha_x = r2 / aspect; + bsdf->alpha_y = r2 * aspect; float m_cdlum = 0.3f * base_color.x + 0.6f * base_color.y + 0.1f * base_color.z; // luminance approx. float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(0.0f, 0.0f, 0.0f); // normalize lum. to isolate hue+sat @@ -292,9 +292,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * /* setup bsdf */ if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */ - sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd); else /* use multi-scatter GGX */ - sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd); } } #ifdef __CAUSTICS_TRICKS__ @@ -332,7 +332,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->extra->cspec0 = cspec0; /* setup bsdf */ - sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd); } } @@ -377,7 +377,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->extra->cspec0 = cspec0; /* setup bsdf */ - sd->flag |= 
bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd); } } } @@ -398,14 +398,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->ior = 1.5f; bsdf->extra = extra; - bsdf->alpha_x = 0.1f * (1.0f - clearcoat_gloss) + 0.001f * clearcoat_gloss; - bsdf->alpha_y = 0.1f * (1.0f - clearcoat_gloss) + 0.001f * clearcoat_gloss; + bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness; + bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness; bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f); bsdf->extra->clearcoat = clearcoat; /* setup bsdf */ - sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd); } } #ifdef __CAUSTICS_TRICKS__ @@ -725,6 +725,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * HairBsdf *bsdf = (HairBsdf*)bsdf_alloc(sd, sizeof(HairBsdf), weight); if(bsdf) { + bsdf->N = N; bsdf->roughness1 = param1; bsdf->roughness2 = param2; bsdf->offset = -stack_load_float(stack, data_node.z); diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 8e45dbfa5ff..6d6e92e73f6 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -16,19 +16,6 @@ CCL_NAMESPACE_BEGIN -/* Float4 textures on various devices. 
*/ -#if defined(__KERNEL_CPU__) -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CPU -#elif defined(__KERNEL_CUDA__) -# if __CUDA_ARCH__ < 300 -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA -# else -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA_KEPLER -# endif -#else -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_OPENCL -#endif - ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha) { #ifdef __KERNEL_CPU__ @@ -50,94 +37,94 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, switch(id) { case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break; - case 1: r = kernel_tex_image_interp(__tex_image_float4_001, x, y); break; - case 2: r = kernel_tex_image_interp(__tex_image_float4_002, x, y); break; - case 3: r = kernel_tex_image_interp(__tex_image_float4_003, x, y); break; - case 4: r = kernel_tex_image_interp(__tex_image_float4_004, x, y); break; - case 5: r = kernel_tex_image_interp(__tex_image_byte4_005, x, y); break; - case 6: r = kernel_tex_image_interp(__tex_image_byte4_006, x, y); break; - case 7: r = kernel_tex_image_interp(__tex_image_byte4_007, x, y); break; - case 8: r = kernel_tex_image_interp(__tex_image_byte4_008, x, y); break; + case 8: r = kernel_tex_image_interp(__tex_image_float4_008, x, y); break; + case 16: r = kernel_tex_image_interp(__tex_image_float4_016, x, y); break; + case 24: r = kernel_tex_image_interp(__tex_image_float4_024, x, y); break; + case 32: r = kernel_tex_image_interp(__tex_image_float4_032, x, y); break; + case 1: r = kernel_tex_image_interp(__tex_image_byte4_001, x, y); break; case 9: r = kernel_tex_image_interp(__tex_image_byte4_009, x, y); break; - case 10: r = kernel_tex_image_interp(__tex_image_byte4_010, x, y); break; - case 11: r = kernel_tex_image_interp(__tex_image_byte4_011, x, y); break; - case 12: r = kernel_tex_image_interp(__tex_image_byte4_012, x, y); break; - case 13: r = 
kernel_tex_image_interp(__tex_image_byte4_013, x, y); break; - case 14: r = kernel_tex_image_interp(__tex_image_byte4_014, x, y); break; - case 15: r = kernel_tex_image_interp(__tex_image_byte4_015, x, y); break; - case 16: r = kernel_tex_image_interp(__tex_image_byte4_016, x, y); break; case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break; - case 18: r = kernel_tex_image_interp(__tex_image_byte4_018, x, y); break; - case 19: r = kernel_tex_image_interp(__tex_image_byte4_019, x, y); break; - case 20: r = kernel_tex_image_interp(__tex_image_byte4_020, x, y); break; - case 21: r = kernel_tex_image_interp(__tex_image_byte4_021, x, y); break; - case 22: r = kernel_tex_image_interp(__tex_image_byte4_022, x, y); break; - case 23: r = kernel_tex_image_interp(__tex_image_byte4_023, x, y); break; - case 24: r = kernel_tex_image_interp(__tex_image_byte4_024, x, y); break; case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break; - case 26: r = kernel_tex_image_interp(__tex_image_byte4_026, x, y); break; - case 27: r = kernel_tex_image_interp(__tex_image_byte4_027, x, y); break; - case 28: r = kernel_tex_image_interp(__tex_image_byte4_028, x, y); break; - case 29: r = kernel_tex_image_interp(__tex_image_byte4_029, x, y); break; - case 30: r = kernel_tex_image_interp(__tex_image_byte4_030, x, y); break; - case 31: r = kernel_tex_image_interp(__tex_image_byte4_031, x, y); break; - case 32: r = kernel_tex_image_interp(__tex_image_byte4_032, x, y); break; case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break; - case 34: r = kernel_tex_image_interp(__tex_image_byte4_034, x, y); break; - case 35: r = kernel_tex_image_interp(__tex_image_byte4_035, x, y); break; - case 36: r = kernel_tex_image_interp(__tex_image_byte4_036, x, y); break; - case 37: r = kernel_tex_image_interp(__tex_image_byte4_037, x, y); break; - case 38: r = kernel_tex_image_interp(__tex_image_byte4_038, x, y); break; - case 39: r = 
kernel_tex_image_interp(__tex_image_byte4_039, x, y); break; - case 40: r = kernel_tex_image_interp(__tex_image_byte4_040, x, y); break; case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break; - case 42: r = kernel_tex_image_interp(__tex_image_byte4_042, x, y); break; - case 43: r = kernel_tex_image_interp(__tex_image_byte4_043, x, y); break; - case 44: r = kernel_tex_image_interp(__tex_image_byte4_044, x, y); break; - case 45: r = kernel_tex_image_interp(__tex_image_byte4_045, x, y); break; - case 46: r = kernel_tex_image_interp(__tex_image_byte4_046, x, y); break; - case 47: r = kernel_tex_image_interp(__tex_image_byte4_047, x, y); break; - case 48: r = kernel_tex_image_interp(__tex_image_byte4_048, x, y); break; case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break; - case 50: r = kernel_tex_image_interp(__tex_image_byte4_050, x, y); break; - case 51: r = kernel_tex_image_interp(__tex_image_byte4_051, x, y); break; - case 52: r = kernel_tex_image_interp(__tex_image_byte4_052, x, y); break; - case 53: r = kernel_tex_image_interp(__tex_image_byte4_053, x, y); break; - case 54: r = kernel_tex_image_interp(__tex_image_byte4_054, x, y); break; - case 55: r = kernel_tex_image_interp(__tex_image_byte4_055, x, y); break; - case 56: r = kernel_tex_image_interp(__tex_image_byte4_056, x, y); break; case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break; - case 58: r = kernel_tex_image_interp(__tex_image_byte4_058, x, y); break; - case 59: r = kernel_tex_image_interp(__tex_image_byte4_059, x, y); break; - case 60: r = kernel_tex_image_interp(__tex_image_byte4_060, x, y); break; - case 61: r = kernel_tex_image_interp(__tex_image_byte4_061, x, y); break; - case 62: r = kernel_tex_image_interp(__tex_image_byte4_062, x, y); break; - case 63: r = kernel_tex_image_interp(__tex_image_byte4_063, x, y); break; - case 64: r = kernel_tex_image_interp(__tex_image_byte4_064, x, y); break; case 65: r = 
kernel_tex_image_interp(__tex_image_byte4_065, x, y); break; - case 66: r = kernel_tex_image_interp(__tex_image_byte4_066, x, y); break; - case 67: r = kernel_tex_image_interp(__tex_image_byte4_067, x, y); break; - case 68: r = kernel_tex_image_interp(__tex_image_byte4_068, x, y); break; - case 69: r = kernel_tex_image_interp(__tex_image_byte4_069, x, y); break; - case 70: r = kernel_tex_image_interp(__tex_image_byte4_070, x, y); break; - case 71: r = kernel_tex_image_interp(__tex_image_byte4_071, x, y); break; - case 72: r = kernel_tex_image_interp(__tex_image_byte4_072, x, y); break; case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break; - case 74: r = kernel_tex_image_interp(__tex_image_byte4_074, x, y); break; - case 75: r = kernel_tex_image_interp(__tex_image_byte4_075, x, y); break; - case 76: r = kernel_tex_image_interp(__tex_image_byte4_076, x, y); break; - case 77: r = kernel_tex_image_interp(__tex_image_byte4_077, x, y); break; - case 78: r = kernel_tex_image_interp(__tex_image_byte4_078, x, y); break; - case 79: r = kernel_tex_image_interp(__tex_image_byte4_079, x, y); break; - case 80: r = kernel_tex_image_interp(__tex_image_byte4_080, x, y); break; case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break; - case 82: r = kernel_tex_image_interp(__tex_image_byte4_082, x, y); break; - case 83: r = kernel_tex_image_interp(__tex_image_byte4_083, x, y); break; - case 84: r = kernel_tex_image_interp(__tex_image_byte4_084, x, y); break; - case 85: r = kernel_tex_image_interp(__tex_image_byte4_085, x, y); break; - case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break; - case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break; - case 88: r = kernel_tex_image_interp(__tex_image_byte4_088, x, y); break; + case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break; + case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break; + case 105: r = 
kernel_tex_image_interp(__tex_image_byte4_105, x, y); break; + case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break; + case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break; + case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break; + case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break; + case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break; + case 153: r = kernel_tex_image_interp(__tex_image_byte4_153, x, y); break; + case 161: r = kernel_tex_image_interp(__tex_image_byte4_161, x, y); break; + case 169: r = kernel_tex_image_interp(__tex_image_byte4_169, x, y); break; + case 177: r = kernel_tex_image_interp(__tex_image_byte4_177, x, y); break; + case 185: r = kernel_tex_image_interp(__tex_image_byte4_185, x, y); break; + case 193: r = kernel_tex_image_interp(__tex_image_byte4_193, x, y); break; + case 201: r = kernel_tex_image_interp(__tex_image_byte4_201, x, y); break; + case 209: r = kernel_tex_image_interp(__tex_image_byte4_209, x, y); break; + case 217: r = kernel_tex_image_interp(__tex_image_byte4_217, x, y); break; + case 225: r = kernel_tex_image_interp(__tex_image_byte4_225, x, y); break; + case 233: r = kernel_tex_image_interp(__tex_image_byte4_233, x, y); break; + case 241: r = kernel_tex_image_interp(__tex_image_byte4_241, x, y); break; + case 249: r = kernel_tex_image_interp(__tex_image_byte4_249, x, y); break; + case 257: r = kernel_tex_image_interp(__tex_image_byte4_257, x, y); break; + case 265: r = kernel_tex_image_interp(__tex_image_byte4_265, x, y); break; + case 273: r = kernel_tex_image_interp(__tex_image_byte4_273, x, y); break; + case 281: r = kernel_tex_image_interp(__tex_image_byte4_281, x, y); break; + case 289: r = kernel_tex_image_interp(__tex_image_byte4_289, x, y); break; + case 297: r = kernel_tex_image_interp(__tex_image_byte4_297, x, y); break; + case 305: r = kernel_tex_image_interp(__tex_image_byte4_305, x, y); break; + case 313: r 
= kernel_tex_image_interp(__tex_image_byte4_313, x, y); break; + case 321: r = kernel_tex_image_interp(__tex_image_byte4_321, x, y); break; + case 329: r = kernel_tex_image_interp(__tex_image_byte4_329, x, y); break; + case 337: r = kernel_tex_image_interp(__tex_image_byte4_337, x, y); break; + case 345: r = kernel_tex_image_interp(__tex_image_byte4_345, x, y); break; + case 353: r = kernel_tex_image_interp(__tex_image_byte4_353, x, y); break; + case 361: r = kernel_tex_image_interp(__tex_image_byte4_361, x, y); break; + case 369: r = kernel_tex_image_interp(__tex_image_byte4_369, x, y); break; + case 377: r = kernel_tex_image_interp(__tex_image_byte4_377, x, y); break; + case 385: r = kernel_tex_image_interp(__tex_image_byte4_385, x, y); break; + case 393: r = kernel_tex_image_interp(__tex_image_byte4_393, x, y); break; + case 401: r = kernel_tex_image_interp(__tex_image_byte4_401, x, y); break; + case 409: r = kernel_tex_image_interp(__tex_image_byte4_409, x, y); break; + case 417: r = kernel_tex_image_interp(__tex_image_byte4_417, x, y); break; + case 425: r = kernel_tex_image_interp(__tex_image_byte4_425, x, y); break; + case 433: r = kernel_tex_image_interp(__tex_image_byte4_433, x, y); break; + case 441: r = kernel_tex_image_interp(__tex_image_byte4_441, x, y); break; + case 449: r = kernel_tex_image_interp(__tex_image_byte4_449, x, y); break; + case 457: r = kernel_tex_image_interp(__tex_image_byte4_457, x, y); break; + case 465: r = kernel_tex_image_interp(__tex_image_byte4_465, x, y); break; + case 473: r = kernel_tex_image_interp(__tex_image_byte4_473, x, y); break; + case 481: r = kernel_tex_image_interp(__tex_image_byte4_481, x, y); break; + case 489: r = kernel_tex_image_interp(__tex_image_byte4_489, x, y); break; + case 497: r = kernel_tex_image_interp(__tex_image_byte4_497, x, y); break; + case 505: r = kernel_tex_image_interp(__tex_image_byte4_505, x, y); break; + case 513: r = kernel_tex_image_interp(__tex_image_byte4_513, x, y); break; + case 521: 
r = kernel_tex_image_interp(__tex_image_byte4_521, x, y); break; + case 529: r = kernel_tex_image_interp(__tex_image_byte4_529, x, y); break; + case 537: r = kernel_tex_image_interp(__tex_image_byte4_537, x, y); break; + case 545: r = kernel_tex_image_interp(__tex_image_byte4_545, x, y); break; + case 553: r = kernel_tex_image_interp(__tex_image_byte4_553, x, y); break; + case 561: r = kernel_tex_image_interp(__tex_image_byte4_561, x, y); break; + case 569: r = kernel_tex_image_interp(__tex_image_byte4_569, x, y); break; + case 577: r = kernel_tex_image_interp(__tex_image_byte4_577, x, y); break; + case 585: r = kernel_tex_image_interp(__tex_image_byte4_585, x, y); break; + case 593: r = kernel_tex_image_interp(__tex_image_byte4_593, x, y); break; + case 601: r = kernel_tex_image_interp(__tex_image_byte4_601, x, y); break; + case 609: r = kernel_tex_image_interp(__tex_image_byte4_609, x, y); break; + case 617: r = kernel_tex_image_interp(__tex_image_byte4_617, x, y); break; + case 625: r = kernel_tex_image_interp(__tex_image_byte4_625, x, y); break; + case 633: r = kernel_tex_image_interp(__tex_image_byte4_633, x, y); break; + case 641: r = kernel_tex_image_interp(__tex_image_byte4_641, x, y); break; + case 649: r = kernel_tex_image_interp(__tex_image_byte4_649, x, y); break; + case 657: r = kernel_tex_image_interp(__tex_image_byte4_657, x, y); break; + case 665: r = kernel_tex_image_interp(__tex_image_byte4_665, x, y); break; default: kernel_assert(0); return make_float4(0.0f, 0.0f, 0.0f, 0.0f); @@ -224,6 +211,8 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float object_inverse_normal_transform(kg, sd, &N); /* project from direction vector to barycentric coordinates in triangles */ + float3 signed_N = N; + N.x = fabsf(N.x); N.y = fabsf(N.y); N.z = fabsf(N.z); @@ -293,12 +282,19 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float float4 f = make_float4(0.0f, 0.0f, 0.0f, 0.0f); uint use_alpha = 
stack_valid(alpha_offset); - if(weight.x > 0.0f) - f += weight.x*svm_image_texture(kg, id, co.y, co.z, srgb, use_alpha); - if(weight.y > 0.0f) - f += weight.y*svm_image_texture(kg, id, co.x, co.z, srgb, use_alpha); - if(weight.z > 0.0f) - f += weight.z*svm_image_texture(kg, id, co.y, co.x, srgb, use_alpha); + /* Map so that no textures are flipped, rotation is somewhat arbitrary. */ + if(weight.x > 0.0f) { + float2 uv = make_float2((signed_N.x < 0.0f)? 1.0f - co.y: co.y, co.z); + f += weight.x*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha); + } + if(weight.y > 0.0f) { + float2 uv = make_float2((signed_N.y > 0.0f)? 1.0f - co.x: co.x, co.z); + f += weight.y*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha); + } + if(weight.z > 0.0f) { + float2 uv = make_float2((signed_N.z > 0.0f)? 1.0f - co.y: co.y, co.x); + f += weight.z*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha); + } if(stack_valid(out_offset)) stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z)); |