From 84470a1190b28cd37491e5002aea4695e4f26f44 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sat, 29 Mar 2014 13:03:45 +0100 Subject: Cycles code refactor: move geometry related kernel files into own directory. --- intern/cycles/kernel/CMakeLists.txt | 20 +- intern/cycles/kernel/SConscript | 5 +- intern/cycles/kernel/geom/geom_bvh.h | 1322 +++++++++++++++++++++++ intern/cycles/kernel/geom/geom_bvh_subsurface.h | 294 +++++ intern/cycles/kernel/geom/geom_bvh_traversal.h | 354 ++++++ intern/cycles/kernel/geom/geom_curve.h | 137 +++ intern/cycles/kernel/geom/geom_object.h | 300 +++++ intern/cycles/kernel/geom/geom_triangle.h | 180 +++ intern/cycles/kernel/kernel_bvh.h | 1318 ---------------------- intern/cycles/kernel/kernel_bvh_subsurface.h | 294 ----- intern/cycles/kernel/kernel_bvh_traversal.h | 354 ------ intern/cycles/kernel/kernel_curve.h | 137 --- intern/cycles/kernel/kernel_object.h | 300 ----- intern/cycles/kernel/kernel_path.h | 9 +- intern/cycles/kernel/kernel_triangle.h | 180 --- intern/cycles/kernel/osl/osl_services.cpp | 9 +- intern/cycles/kernel/osl/osl_shader.cpp | 3 +- 17 files changed, 2612 insertions(+), 2604 deletions(-) create mode 100644 intern/cycles/kernel/geom/geom_bvh.h create mode 100644 intern/cycles/kernel/geom/geom_bvh_subsurface.h create mode 100644 intern/cycles/kernel/geom/geom_bvh_traversal.h create mode 100644 intern/cycles/kernel/geom/geom_curve.h create mode 100644 intern/cycles/kernel/geom/geom_object.h create mode 100644 intern/cycles/kernel/geom/geom_triangle.h delete mode 100644 intern/cycles/kernel/kernel_bvh.h delete mode 100644 intern/cycles/kernel/kernel_bvh_subsurface.h delete mode 100644 intern/cycles/kernel/kernel_bvh_traversal.h delete mode 100644 intern/cycles/kernel/kernel_curve.h delete mode 100644 intern/cycles/kernel/kernel_object.h delete mode 100644 intern/cycles/kernel/kernel_triangle.h (limited to 'intern/cycles/kernel') diff --git a/intern/cycles/kernel/CMakeLists.txt 
b/intern/cycles/kernel/CMakeLists.txt index ebeebe20c0f..ccefb314894 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -19,14 +19,10 @@ set(SRC set(SRC_HEADERS kernel.h kernel_accumulate.h - kernel_bvh.h - kernel_bvh_subsurface.h - kernel_bvh_traversal.h kernel_camera.h kernel_compat_cpu.h kernel_compat_cuda.h kernel_compat_opencl.h - kernel_curve.h kernel_differential.h kernel_displace.h kernel_emission.h @@ -36,7 +32,6 @@ set(SRC_HEADERS kernel_light.h kernel_math.h kernel_montecarlo.h - kernel_object.h kernel_passes.h kernel_path.h kernel_path_state.h @@ -47,7 +42,6 @@ set(SRC_HEADERS kernel_shadow.h kernel_subsurface.h kernel_textures.h - kernel_triangle.h kernel_types.h kernel_volume.h ) @@ -114,6 +108,15 @@ set(SRC_SVM_HEADERS svm/svm_wave.h ) +set(SRC_GEOM_HEADERS + geom/geom_bvh.h + geom/geom_bvh_subsurface.h + geom/geom_bvh_traversal.h + geom/geom_curve.h + geom/geom_object.h + geom/geom_triangle.h +) + set(SRC_UTIL_HEADERS ../util/util_color.h ../util/util_half.h @@ -146,7 +149,7 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() # build for each arch - set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS}) + set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS}) set(cuda_cubins) foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) @@ -222,7 +225,7 @@ if(CXX_HAS_SSE) endif() -add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS}) +add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS}) if(WITH_CYCLES_CUDA) add_dependencies(cycles_kernel cycles_kernel_cuda) @@ -243,5 +246,6 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/k delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" 
${CYCLES_INSTALL_PATH}/kernel/closure) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript index 5077d8c96b0..b2eafe6a83d 100644 --- a/intern/cycles/kernel/SConscript +++ b/intern/cycles/kernel/SConscript @@ -60,6 +60,7 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']: kernel_file = os.path.join(source_dir, "kernel.cu") util_dir = os.path.join(source_dir, "../util") svm_dir = os.path.join(source_dir, "../svm") + geom_dir = os.path.join(source_dir, "../geom") closure_dir = os.path.join(source_dir, "../closure") # get CUDA version @@ -76,10 +77,10 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']: nvcc_flags += " --cubin --ptxas-options=\"-v\"" nvcc_flags += " -D__KERNEL_CUDA_VERSION__=%d" % (cuda_version) nvcc_flags += " -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC" - nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, closure_dir) + nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, geom_dir, closure_dir) # dependencies - dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('closure/*.h') + dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h') last_cubin_file = None # add command for each cuda architecture diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h new file mode 100644 index 00000000000..0272dff5115 --- /dev/null +++ b/intern/cycles/kernel/geom/geom_bvh.h @@ -0,0 +1,1322 @@ +/* + * Adapted from code Copyright 2009-2010 NVIDIA Corporation + * Modifications Copyright 2011, Blender Foundation. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * "Persistent while-while kernel" used in: + * + * "Understanding the Efficiency of Ray Traversal on GPUs", + * Timo Aila and Samuli Laine, + * Proc. High-Performance Graphics 2009 + */ + +/* bottom-most stack entry, indicating the end of traversal */ +#define ENTRYPOINT_SENTINEL 0x76543210 + +/* 64 object BVH + 64 mesh BVH + 64 object node splitting */ +#define BVH_STACK_SIZE 192 +#define BVH_NODE_SIZE 4 +#define TRI_NODE_SIZE 3 + +/* silly workaround for float extended precision that happens when compiling + * without sse support on x86, it results in different results for float ops + * that you would otherwise expect to compare correctly */ +#if !defined(__i386__) || defined(__SSE__) +#define NO_EXTENDED_PRECISION +#else +#define NO_EXTENDED_PRECISION volatile +#endif + +#include "geom_object.h" +#include "geom_curve.h" +#include "geom_triangle.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_inline float3 bvh_inverse_direction(float3 dir) +{ + /* avoid divide by zero (ooeps = exp2f(-80.0f)) */ + float ooeps = 0.00000000000000000000000082718061255302767487140869206996285356581211090087890625f; + float3 idir; + + idir.x = 1.0f/((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x)); + idir.y = 1.0f/((fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y)); + idir.z = 1.0f/((fabsf(dir.z) > ooeps)? 
dir.z: copysignf(ooeps, dir.z)); + + return idir; +} +/* Enter an instance: map the ray through the object's OBJECT_INVERSE_TRANSFORM. Writes the transformed origin to P and the inverse of the normalized transformed direction to idir; when t is not FLT_MAX it is multiplied by the direction's length before normalization. tmax is not read. */ +ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax) +{ + Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); + + *P = transform_point(&tfm, ray->P); + + float3 dir = transform_direction(&tfm, ray->D); + + float len; + dir = normalize_len(dir, &len); + + *idir = bvh_inverse_direction(dir); + + if(*t != FLT_MAX) + *t *= len; +} +/* Leave an instance: when t is not FLT_MAX, scale it by the length of the reciprocal of idir transformed with the object's OBJECT_TRANSFORM, then reset P and idir from the original ray. tmax is not read. */ +ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax) +{ + if(*t != FLT_MAX) { + Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); + *t *= len(transform_direction(&tfm, 1.0f/(*idir))); + } + + *P = ray->P; + *idir = bvh_inverse_direction(ray->D); +} +/* Object-motion variant of bvh_instance_push: the transform comes from object_fetch_transform_motion_test() evaluated at ray->time; its return value is stored in tfm and the transform it writes to itfm is the one applied to the ray. */ +#ifdef __OBJECT_MOTION__ +ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax) +{ + Transform itfm; + *tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm); + + *P = transform_point(&itfm, ray->P); + + float3 dir = transform_direction(&itfm, ray->D); + + float len; + dir = normalize_len(dir, &len); + + *idir = bvh_inverse_direction(dir); + + if(*t != FLT_MAX) + *t *= len; +} +/* Object-motion variant of bvh_instance_pop: uses the tfm passed in by the caller instead of fetching the object transform. */ +ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax) +{ + if(*t != FLT_MAX) + *t *= len(transform_direction(tfm, 1.0f/(*idir))); + + *P = ray->P; + *idir = bvh_inverse_direction(ray->D); +} +#endif + +/* Sven Woop's algorithm */ +ccl_device_inline bool bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect, + float3 P, float3 idir, uint visibility, int object, int triAddr) +{ + /* compute and check intersection t-value */ + float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0); + float4 v11 = 
kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1); + float3 dir = 1.0f/idir; + + float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z; + float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z); + float t = Oz * invDz; + + if(t > 0.0f && t < isect->t) { + /* compute and check barycentric u */ + float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z; + float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z; + float u = Ox + t*Dx; + + if(u >= 0.0f) { + /* compute and check barycentric v */ + float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2); + float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z; + float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z; + float v = Oy + t*Dy; + + if(v >= 0.0f && u + v <= 1.0f) { +#ifdef __VISIBILITY_FLAG__ + /* visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags */ + if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility) +#endif + { + /* record intersection */ + isect->prim = triAddr; + isect->object = object; + isect->u = u; + isect->v = v; + isect->t = t; + return true; + } + } + } + } + + return false; +} + +#ifdef __HAIR__ +ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta, float *extrema, float *extremtb, float *extremb, float p0, float p1, float p2, float p3) +{ + float halfdiscroot = (p2 * p2 - 3 * p3 * p1); + float ta = -1.0f; + float tb = -1.0f; + *extremta = -1.0f; + *extremtb = -1.0f; + *upper = p0; + *lower = p0 + p1 + p2 + p3; + *extrema = *upper; + *extremb = *lower; + if(*lower >= *upper) { + *upper = *lower; + *lower = p0; + } + + if(halfdiscroot >= 0) { + halfdiscroot = sqrt(halfdiscroot); + ta = (-p2 - halfdiscroot) / (3 * p3); + tb = (-p2 + halfdiscroot) / (3 * p3); + } + + float t2; + float t3; + if(ta > 0.0f && ta < 1.0f) { + t2 = ta * ta; + t3 = t2 * ta; + *extremta = ta; + *extrema = p3 * t3 + p2 * t2 + p1 * ta + p0; + if(*extrema > *upper) { + *upper = *extrema; + } + if(*extrema < *lower) { + *lower = 
*extrema; + } + } + if(tb > 0.0f && tb < 1.0f) { + t2 = tb * tb; + t3 = t2 * tb; + *extremtb = tb; + *extremb = p3 * t3 + p2 * t2 + p1 * tb + p0; + if(*extremb >= *upper) { + *upper = *extremb; + } + if(*extremb <= *lower) { + *lower = *extremb; + } + } +} + +#ifdef __KERNEL_SSE2__ +ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a) +{ + return fma(broadcast<0>(a), t[0], fma(broadcast<1>(a), t[1], _mm_mul_ps(broadcast<2>(a), t[2]))); +} +#endif + +#ifdef __KERNEL_SSE2__ +/* Pass P and idir by reference to aligned vector */ +ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, + const float3 &P, const float3 &idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax) +#else +ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, + float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax) +#endif +{ + float epsilon = 0.0f; + float r_st, r_en; + + int depth = kernel_data.curve.subdivisions; + int flags = kernel_data.curve.curveflags; + int prim = kernel_tex_fetch(__prim_index, curveAddr); + +#ifdef __KERNEL_SSE2__ + __m128 vdir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir)); + __m128 vcurve_coef[4]; + const float3 *curve_coef = (float3 *)vcurve_coef; + + { + __m128 dtmp = _mm_mul_ps(vdir, vdir); + __m128 d_ss = _mm_sqrt_ss(_mm_add_ss(dtmp, broadcast<2>(dtmp))); + __m128 rd_ss = _mm_div_ss(_mm_set_ss(1.0f), d_ss); + + __m128i v00vec = _mm_load_si128((__m128i *)&kg->__curves.data[prim]); + int2 &v00 = (int2 &)v00vec; + + int k0 = v00.x + segment; + int k1 = k0 + 1; + int ka = max(k0 - 1, v00.x); + int kb = min(k1 + 1, v00.x + v00.y - 1); + + __m128 P0 = _mm_load_ps(&kg->__curve_keys.data[ka].x); + __m128 P1 = _mm_load_ps(&kg->__curve_keys.data[k0].x); + __m128 P2 = _mm_load_ps(&kg->__curve_keys.data[k1].x); + __m128 P3 = 
_mm_load_ps(&kg->__curve_keys.data[kb].x); + + __m128 rd_sgn = set_sign_bit<0, 1, 1, 1>(broadcast<0>(rd_ss)); + __m128 mul_zxxy = _mm_mul_ps(shuffle<2, 0, 0, 1>(vdir), rd_sgn); + __m128 mul_yz = _mm_mul_ps(shuffle<1, 2, 1, 2>(vdir), mul_zxxy); + __m128 mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); + __m128 vdir0 = _mm_and_ps(vdir, _mm_castsi128_ps(_mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0))); + + __m128 htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); + __m128 htfm1 = shuffle<1, 0, 1, 3>(_mm_set_ss(_mm_cvtss_f32(d_ss)), vdir0); + __m128 htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); + + __m128 htfm[] = { htfm0, htfm1, htfm2 }; + __m128 vP = load_m128(P); + __m128 p0 = transform_point_T3(htfm, _mm_sub_ps(P0, vP)); + __m128 p1 = transform_point_T3(htfm, _mm_sub_ps(P1, vP)); + __m128 p2 = transform_point_T3(htfm, _mm_sub_ps(P2, vP)); + __m128 p3 = transform_point_T3(htfm, _mm_sub_ps(P3, vP)); + + float fc = 0.71f; + __m128 vfc = _mm_set1_ps(fc); + __m128 vfcxp3 = _mm_mul_ps(vfc, p3); + + vcurve_coef[0] = p1; + vcurve_coef[1] = _mm_mul_ps(vfc, _mm_sub_ps(p2, p0)); + vcurve_coef[2] = fma(_mm_set1_ps(fc * 2.0f), p0, fma(_mm_set1_ps(fc - 3.0f), p1, fms(_mm_set1_ps(3.0f - 2.0f * fc), p2, vfcxp3))); + vcurve_coef[3] = fms(_mm_set1_ps(fc - 2.0f), _mm_sub_ps(p2, p1), fms(vfc, p0, vfcxp3)); + + r_st = ((float4 &)P1).w; + r_en = ((float4 &)P2).w; + } +#else + float3 curve_coef[4]; + + /* curve Intersection check */ + float3 dir = 1.0f/idir; + + /* obtain curve parameters */ + { + /* ray transform created - this should be created at beginning of intersection loop */ + Transform htfm; + float d = sqrtf(dir.x * dir.x + dir.z * dir.z); + htfm = make_transform( + dir.z / d, 0, -dir.x /d, 0, + -dir.x * dir.y /d, d, -dir.y * dir.z /d, 0, + dir.x, dir.y, dir.z, 0, + 0, 0, 0, 1); + + float4 v00 = kernel_tex_fetch(__curves, prim); + + int k0 = __float_as_int(v00.x) + segment; + int k1 = k0 + 1; + + int ka = max(k0 - 1,__float_as_int(v00.x)); + int kb = min(k1 + 
1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); + + float4 P0 = kernel_tex_fetch(__curve_keys, ka); + float4 P1 = kernel_tex_fetch(__curve_keys, k0); + float4 P2 = kernel_tex_fetch(__curve_keys, k1); + float4 P3 = kernel_tex_fetch(__curve_keys, kb); + + float3 p0 = transform_point(&htfm, float4_to_float3(P0) - P); + float3 p1 = transform_point(&htfm, float4_to_float3(P1) - P); + float3 p2 = transform_point(&htfm, float4_to_float3(P2) - P); + float3 p3 = transform_point(&htfm, float4_to_float3(P3) - P); + + float fc = 0.71f; + curve_coef[0] = p1; + curve_coef[1] = -fc*p0 + fc*p2; + curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3; + curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3; + r_st = P1.w; + r_en = P2.w; + } +#endif + + float r_curr = max(r_st, r_en); + + if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING)) + epsilon = 2 * r_curr; + + /* find bounds - this is slow for cubic curves */ + float upper, lower; + + float zextrem[4]; + curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z); + if(lower - r_curr > isect->t || upper + r_curr < epsilon) + return false; + + /* minimum width extension */ + float mw_extension = min(difl * fabsf(upper), extmax); + float r_ext = mw_extension + r_curr; + + float xextrem[4]; + curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x); + if(lower > r_ext || upper < -r_ext) + return false; + + float yextrem[4]; + curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y); + if(lower > r_ext || upper < -r_ext) + return false; + + /* setup recurrent loop */ + int level = 1 << depth; + int tree = 0; + float resol = 1.0f / (float)level; + bool hit = false; + + /* begin loop */ + 
while(!(tree >> (depth))) { + float i_st = tree * resol; + float i_en = i_st + (level * resol); +#ifdef __KERNEL_SSE2__ + __m128 vi_st = _mm_set1_ps(i_st), vi_en = _mm_set1_ps(i_en); + __m128 vp_st = fma(fma(fma(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); + __m128 vp_en = fma(fma(fma(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]); + + __m128 vbmin = _mm_min_ps(vp_st, vp_en); + __m128 vbmax = _mm_max_ps(vp_st, vp_en); + + float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax; + float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z; + float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z; + float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en; +#else + float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0]; + float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0]; + + float bminx = min(p_st.x, p_en.x); + float bmaxx = max(p_st.x, p_en.x); + float bminy = min(p_st.y, p_en.y); + float bmaxy = max(p_st.y, p_en.y); + float bminz = min(p_st.z, p_en.z); + float bmaxz = max(p_st.z, p_en.z); +#endif + + if(xextrem[0] >= i_st && xextrem[0] <= i_en) { + bminx = min(bminx,xextrem[1]); + bmaxx = max(bmaxx,xextrem[1]); + } + if(xextrem[2] >= i_st && xextrem[2] <= i_en) { + bminx = min(bminx,xextrem[3]); + bmaxx = max(bmaxx,xextrem[3]); + } + if(yextrem[0] >= i_st && yextrem[0] <= i_en) { + bminy = min(bminy,yextrem[1]); + bmaxy = max(bmaxy,yextrem[1]); + } + if(yextrem[2] >= i_st && yextrem[2] <= i_en) { + bminy = min(bminy,yextrem[3]); + bmaxy = max(bmaxy,yextrem[3]); + } + if(zextrem[0] >= i_st && zextrem[0] <= i_en) { + bminz = min(bminz,zextrem[1]); + bmaxz = max(bmaxz,zextrem[1]); + } + if(zextrem[2] >= i_st && zextrem[2] <= i_en) { + bminz = min(bminz,zextrem[3]); + bmaxz = max(bmaxz,zextrem[3]); + } + + float r1 = r_st + (r_en - r_st) * i_st; + float r2 = r_st + (r_en - r_st) * i_en; + 
r_curr = max(r1, r2); + + mw_extension = min(difl * fabsf(bmaxz), extmax); + float r_ext = mw_extension + r_curr; + float coverage = 1.0f; + + if (bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) { + /* the bounding box does not overlap the square centered at O */ + tree += level; + level = tree & -tree; + } + else if (level == 1) { + + /* the maximum recursion depth is reached. + * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. + * dP* is reversed if necessary.*/ + float t = isect->t; + float u = 0.0f; + if(flags & CURVE_KN_RIBBONS) { + float3 tg = (p_en - p_st); + float w = tg.x * tg.x + tg.y * tg.y; + if (w == 0) { + tree++; + level = tree & -tree; + continue; + } + w = -(p_st.x * tg.x + p_st.y * tg.y) / w; + w = clamp((float)w, 0.0f, 1.0f); + + /* compute u on the curve segment */ + u = i_st * (1 - w) + i_en * w; + r_curr = r_st + (r_en - r_st) * u; + /* compare x-y distances */ + float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0]; + + float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; + if (dot(tg, dp_st)< 0) + dp_st *= -1; + if (dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) { + tree++; + level = tree & -tree; + continue; + } + float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; + if (dot(tg, dp_en) < 0) + dp_en *= -1; + if (dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) { + tree++; + level = tree & -tree; + continue; + } + + /* compute coverage */ + float r_ext = r_curr; + coverage = 1.0f; + if(difl != 0.0f) { + mw_extension = min(difl * fabsf(bmaxz), extmax); + r_ext = mw_extension + r_curr; + float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y); + float d0 = d - r_curr; + float d1 = d + r_curr; + if (d0 >= 0) + coverage = (min(d1 / mw_extension, 1.0f) - min(d0 / mw_extension, 1.0f)) * 0.5f; + else // inside + coverage = (min(d1 / mw_extension, 1.0f) + min(-d0 / mw_extension, 
1.0f)) * 0.5f; + } + + if (p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) { + tree++; + level = tree & -tree; + continue; + } + + t = p_curr.z; + } + else { + float l = len(p_en - p_st); + /* minimum width extension */ + float or1 = r1; + float or2 = r2; + if(difl != 0.0f) { + mw_extension = min(len(p_st - P) * difl, extmax); + or1 = r1 < mw_extension ? mw_extension : r1; + mw_extension = min(len(p_en - P) * difl, extmax); + or2 = r2 < mw_extension ? mw_extension : r2; + } + /* --- */ + float3 tg = (p_en - p_st) / l; + float gd = (or2 - or1) / l; + float difz = -dot(p_st,tg); + float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd)); + float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1))); + float tcentre = -halfb/cyla; + float zcentre = difz + (tg.z * tcentre); + float3 tdif = - p_st; + tdif.z += tcentre; + float tdifz = dot(tdif,tg); + float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1))); + float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd; + float td = tb*tb - 4*cyla*tc; + if (td < 0.0f) { + tree++; + level = tree & -tree; + continue; + } + + float rootd = sqrtf(td); + float correction = ((-tb - rootd)/(2*cyla)); + t = tcentre + correction; + + float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; + if (dot(tg, dp_st)< 0) + dp_st *= -1; + float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; + if (dot(tg, dp_en) < 0) + dp_en *= -1; + + if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) { + correction = ((-tb + rootd)/(2*cyla)); + t = tcentre + correction; + } + + if (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) { + tree++; + level = tree & -tree; + continue; + } + + float w = (zcentre + (tg.z * correction))/l; + w = clamp((float)w, 0.0f, 1.0f); + /* compute u 
on the curve segment */ + u = i_st * (1 - w) + i_en * w; + r_curr = r1 + (r2 - r1) * w; + r_ext = or1 + (or2 - or1) * w; + coverage = r_curr/r_ext; + + } + /* we found a new intersection */ + + /* stochastic fade from minimum width */ + if(lcg_state && coverage != 1.0f) { + if(lcg_step_float(lcg_state) > coverage) + return hit; + } + +#ifdef __VISIBILITY_FLAG__ + /* visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags */ + if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) +#endif + { + /* record intersection */ + isect->prim = curveAddr; + isect->segment = segment; + isect->object = object; + isect->u = u; + isect->v = 0.0f; + /*isect->v = 1.0f - coverage; */ + isect->t = t; + hit = true; + } + + tree++; + level = tree & -tree; + } + else { + /* split the curve into two curves and process */ + level = level >> 1; + } + } + + return hit; +} + +ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, + float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax) +{ + /* define few macros to minimize code duplication for SSE */ +#ifndef __KERNEL_SSE2__ +#define len3_squared(x) len_squared(x) +#define len3(x) len(x) +#define dot3(x, y) dot(x, y) +#endif + + /* curve Intersection check */ + int flags = kernel_data.curve.curveflags; + + int prim = kernel_tex_fetch(__prim_index, curveAddr); + float4 v00 = kernel_tex_fetch(__curves, prim); + + int cnum = __float_as_int(v00.x); + int k0 = cnum + segment; + int k1 = k0 + 1; + +#ifndef __KERNEL_SSE2__ + float4 P1 = kernel_tex_fetch(__curve_keys, k0); + float4 P2 = kernel_tex_fetch(__curve_keys, k1); + + float or1 = P1.w; + float or2 = P2.w; + float3 p1 = float4_to_float3(P1); + float3 p2 = float4_to_float3(P2); + + /* minimum width extension */ + float r1 = or1; + float r2 = or2; + float3 dif = P - p1; + float3 dif_second = P - p2; + if(difl != 0.0f) { + float pixelsize = 
min(len3(dif) * difl, extmax); + r1 = or1 < pixelsize ? pixelsize : or1; + pixelsize = min(len3(dif_second) * difl, extmax); + r2 = or2 < pixelsize ? pixelsize : or2; + } + /* --- */ + + float3 dir = 1.0f / idir; + float3 p21_diff = p2 - p1; + float3 sphere_dif1 = (dif + dif_second) * 0.5f; + float sphere_b_tmp = dot3(dir, sphere_dif1); + float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; +#else + const __m128 p1 = _mm_load_ps(&kg->__curve_keys.data[k0].x); + const __m128 p2 = _mm_load_ps(&kg->__curve_keys.data[k1].x); + const __m128 or12 = shuffle<3, 3, 3, 3>(p1, p2); + + __m128 r12 = or12; + const __m128 vP = load_m128(P); + const __m128 dif = _mm_sub_ps(vP, p1); + const __m128 dif_second = _mm_sub_ps(vP, p2); + if(difl != 0.0f) { + const __m128 len1_sq = len3_squared_splat(dif); + const __m128 len2_sq = len3_squared_splat(dif_second); + const __m128 len12 = _mm_sqrt_ps(shuffle<0, 0, 0, 0>(len1_sq, len2_sq)); + const __m128 pixelsize12 = _mm_min_ps(_mm_mul_ps(len12, _mm_set1_ps(difl)), _mm_set1_ps(extmax)); + r12 = _mm_max_ps(or12, pixelsize12); + } + float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12)); + float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12)); + + const __m128 dir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir)); + const __m128 p21_diff = _mm_sub_ps(p2, p1); + const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f)); + const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1); + const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1); +#endif + + float mr = max(r1, r2); + float l = len3(p21_diff); + float invl = 1.0f / l; + float sp_r = mr + 0.5f * l; + + float sphere_b = dot3(dir, sphere_dif2); + float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r; + + if(sdisc < 0.0f) + return false; + + /* obtain parameters and test midpoint distance for suitable modes */ +#ifndef __KERNEL_SSE2__ + float3 tg = p21_diff * invl; +#else + const __m128 tg = 
_mm_mul_ps(p21_diff, _mm_set1_ps(invl)); +#endif + float gd = (r2 - r1) * invl; + + float dirz = dot3(dir, tg); + float difz = dot3(dif, tg); + + float a = 1.0f - (dirz*dirz*(1 + gd*gd)); + + float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1)); + + float tcentre = -halfb/a; + float zcentre = difz + (dirz * tcentre); + + if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE)) + return false; + if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION)) + return false; + + /* test minimum separation */ +#ifndef __KERNEL_SSE2__ + float3 cprod = cross(tg, dir); + float cprod2sq = len3_squared(cross(tg, dif)); +#else + const __m128 cprod = cross(tg, dir); + float cprod2sq = len3_squared(cross_zxy(tg, dif)); +#endif + float cprodsq = len3_squared(cprod); + float distscaled = dot3(cprod, dif); + + if(cprodsq == 0) + distscaled = cprod2sq; + else + distscaled = (distscaled*distscaled)/cprodsq; + + if(distscaled > mr*mr) + return false; + + /* calculate true intersection */ +#ifndef __KERNEL_SSE2__ + float3 tdif = dif + tcentre * dir; +#else + const __m128 tdif = fma(_mm_set1_ps(tcentre), dir, dif); +#endif + float tdifz = dot3(tdif, tg); + float tdifma = tdifz*gd + r1; + float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma)); + float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma; + float td = tb*tb - 4*a*tc; + + if (td < 0.0f) + return false; + + float rootd = 0.0f; + float correction = 0.0f; + if(flags & CURVE_KN_ACCURATE) { + rootd = sqrtf(td); + correction = ((-tb - rootd)/(2*a)); + } + + float t = tcentre + correction; + + if(t < isect->t) { + + if(flags & CURVE_KN_INTERSECTCORRECTION) { + rootd = sqrtf(td); + correction = ((-tb - rootd)/(2*a)); + t = tcentre + correction; + } + + float z = zcentre + (dirz * correction); + bool backface = false; + + if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) { + backface = true; + correction = ((-tb + rootd)/(2*a)); + t = tcentre + correction; + z 
= zcentre + (dirz * correction); + } + + /* stochastic fade from minimum width */ + float adjradius = or1 + z * (or2 - or1) * invl; + adjradius = adjradius / (r1 + z * gd); + if(lcg_state && adjradius != 1.0f) { + if(lcg_step_float(lcg_state) > adjradius) + return false; + } + /* --- */ + + if(t > 0.0f && t < isect->t && z >= 0 && z <= l) { + + if (flags & CURVE_KN_ENCLOSEFILTER) { + float enc_ratio = 1.01f; + if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) { + float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio)); + float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio; + if(a2*c2 < 0.0f) + return false; + } + } + +#ifdef __VISIBILITY_FLAG__ + /* visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags */ + if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) +#endif + { + /* record intersection */ + isect->prim = curveAddr; + isect->segment = segment; + isect->object = object; + isect->u = z*invl; + isect->v = td/(4*a*a); + /*isect->v = 1.0f - adjradius;*/ + isect->t = t; + + if(backface) + isect->u = -isect->u; + + return true; + } + } + } + + return false; + +#ifndef __KERNEL_SSE2__ +#undef len3_squared +#undef len3 +#undef dot3 +#endif +} +#endif + +#ifdef __SUBSURFACE__ +/* Special ray intersection routines for subsurface scattering. In that case we + * only want to intersect with primitives in the same object, and in case of + * multiple hits we pick a single random primitive as the intersection point. 
*/ + +ccl_device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array, + float3 P, float3 idir, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits) +{ + /* compute and check intersection t-value */ + float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0); + float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1); + float3 dir = 1.0f/idir; + + float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z; + float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z); + float t = Oz * invDz; + + if(t > 0.0f && t < tmax) { + /* compute and check barycentric u */ + float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z; + float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z; + float u = Ox + t*Dx; + + if(u >= 0.0f) { + /* compute and check barycentric v */ + float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2); + float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z; + float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z; + float v = Oy + t*Dy; + + if(v >= 0.0f && u + v <= 1.0f) { + (*num_hits)++; + + int hit; + + if(*num_hits <= max_hits) { + hit = *num_hits - 1; + } + else { + /* reservoir sampling: if we are at the maximum number of + * hits, randomly replace element or skip it */ + hit = lcg_step_uint(lcg_state) % *num_hits; + + if(hit >= max_hits) + return; + } + + /* record intersection */ + Intersection *isect = &isect_array[hit]; + isect->prim = triAddr; + isect->object = object; + isect->u = u; + isect->v = v; + isect->t = t; + } + } + } +} +#endif + +/* BVH intersection function variations */ + +#define BVH_INSTANCING 1 +#define BVH_MOTION 2 +#define BVH_HAIR 4 +#define BVH_HAIR_MINIMUM_WIDTH 8 + +#define BVH_FUNCTION_NAME bvh_intersect +#define BVH_FUNCTION_FEATURES 0 +#include "geom_bvh_traversal.h" + +#if defined(__INSTANCING__) +#define BVH_FUNCTION_NAME bvh_intersect_instancing +#define BVH_FUNCTION_FEATURES BVH_INSTANCING +#include "geom_bvh_traversal.h" +#endif 
+ +#if defined(__HAIR__) +#define BVH_FUNCTION_NAME bvh_intersect_hair +#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH +#include "geom_bvh_traversal.h" +#endif + +#if defined(__OBJECT_MOTION__) +#define BVH_FUNCTION_NAME bvh_intersect_motion +#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION +#include "geom_bvh_traversal.h" +#endif + +#if defined(__HAIR__) && defined(__OBJECT_MOTION__) +#define BVH_FUNCTION_NAME bvh_intersect_hair_motion +#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION +#include "geom_bvh_traversal.h" +#endif + +#if defined(__SUBSURFACE__) +#define BVH_FUNCTION_NAME bvh_intersect_subsurface +#define BVH_FUNCTION_FEATURES 0 +#include "geom_bvh_subsurface.h" +#endif + +#if defined(__SUBSURFACE__) && defined(__INSTANCING__) +#define BVH_FUNCTION_NAME bvh_intersect_subsurface_instancing +#define BVH_FUNCTION_FEATURES BVH_INSTANCING +#include "geom_bvh_subsurface.h" +#endif + +#if defined(__SUBSURFACE__) && defined(__HAIR__) +#define BVH_FUNCTION_NAME bvh_intersect_subsurface_hair +#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR +#include "geom_bvh_subsurface.h" +#endif + +#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__) +#define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion +#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION +#include "geom_bvh_subsurface.h" +#endif + +#if defined(__SUBSURFACE__) && defined(__HAIR__) && defined(__OBJECT_MOTION__) +#define BVH_FUNCTION_NAME bvh_intersect_subsurface_hair_motion +#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION +#include "geom_bvh_subsurface.h" +#endif + +/* to work around titan bug when using arrays instead of textures */ +#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__) +ccl_device_inline +#else +ccl_device_noinline +#endif +#ifdef __HAIR__ +bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, 
float extmax) +#else +bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect) +#endif +{ +#ifdef __OBJECT_MOTION__ + if(kernel_data.bvh.have_motion) { +#ifdef __HAIR__ + if(kernel_data.bvh.have_curves) + return bvh_intersect_hair_motion(kg, ray, isect, visibility, lcg_state, difl, extmax); +#endif /* __HAIR__ */ + + return bvh_intersect_motion(kg, ray, isect, visibility); + } +#endif /* __OBJECT_MOTION__ */ + +#ifdef __HAIR__ + if(kernel_data.bvh.have_curves) + return bvh_intersect_hair(kg, ray, isect, visibility, lcg_state, difl, extmax); +#endif /* __HAIR__ */ + +#ifdef __KERNEL_CPU__ + +#ifdef __INSTANCING__ + if(kernel_data.bvh.have_instancing) + return bvh_intersect_instancing(kg, ray, isect, visibility); +#endif /* __INSTANCING__ */ + + return bvh_intersect(kg, ray, isect, visibility); +#else /* __KERNEL_CPU__ */ + +#ifdef __INSTANCING__ + return bvh_intersect_instancing(kg, ray, isect, visibility); +#else + return bvh_intersect(kg, ray, isect, visibility); +#endif /* __INSTANCING__ */ + +#endif /* __KERNEL_CPU__ */ +} + +/* to work around titan bug when using arrays instead of textures */ +#ifdef __SUBSURFACE__ +#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__) +ccl_device_inline +#else +ccl_device_noinline +#endif +uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits) +{ +#ifdef __OBJECT_MOTION__ + if(kernel_data.bvh.have_motion) { +#ifdef __HAIR__ + if(kernel_data.bvh.have_curves) + return bvh_intersect_subsurface_hair_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits); +#endif /* __HAIR__ */ + + return bvh_intersect_subsurface_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits); + } +#endif /* __OBJECT_MOTION__ */ + +#ifdef __HAIR__ + if(kernel_data.bvh.have_curves) + return bvh_intersect_subsurface_hair(kg, ray, isect, subsurface_object, lcg_state, max_hits); +#endif /* __HAIR__ */ 
+ +#ifdef __KERNEL_CPU__ + +#ifdef __INSTANCING__ + if(kernel_data.bvh.have_instancing) + return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits); +#endif /* __INSTANCING__ */ + + return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits); +#else /* __KERNEL_CPU__ */ + +#ifdef __INSTANCING__ + return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits); +#else + return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits); +#endif /* __INSTANCING__ */ + +#endif /* __KERNEL_CPU__ */ +} +#endif + +/* Ray offset to avoid self intersection */ + +ccl_device_inline float3 ray_offset(float3 P, float3 Ng) +{ +#ifdef __INTERSECTION_REFINE__ + const float epsilon_f = 1e-5f; + /* ideally this should match epsilon_f, but instancing/mblur + * precision makes it problematic */ + const float epsilon_test = 1.0f; + const int epsilon_i = 32; + + float3 res; + + /* x component */ + if(fabsf(P.x) < epsilon_test) { + res.x = P.x + Ng.x*epsilon_f; + } + else { + uint ix = __float_as_uint(P.x); + ix += ((ix ^ __float_as_uint(Ng.x)) >> 31)? -epsilon_i: epsilon_i; + res.x = __uint_as_float(ix); + } + + /* y component */ + if(fabsf(P.y) < epsilon_test) { + res.y = P.y + Ng.y*epsilon_f; + } + else { + uint iy = __float_as_uint(P.y); + iy += ((iy ^ __float_as_uint(Ng.y)) >> 31)? -epsilon_i: epsilon_i; + res.y = __uint_as_float(iy); + } + + /* z component */ + if(fabsf(P.z) < epsilon_test) { + res.z = P.z + Ng.z*epsilon_f; + } + else { + uint iz = __float_as_uint(P.z); + iz += ((iz ^ __float_as_uint(Ng.z)) >> 31)? -epsilon_i: epsilon_i; + res.z = __uint_as_float(iz); + } + + return res; +#else + const float epsilon_f = 1e-4f; + return P + epsilon_f*Ng; +#endif +} + +/* Refine triangle intersection to more precise hit point. For rays that travel + * far the precision is often not so good, this reintersects the primitive from + * a closer distance. 
*/ + +ccl_device_inline float3 bvh_triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray) +{ + float3 P = ray->P; + float3 D = ray->D; + float t = isect->t; + +#ifdef __INTERSECTION_REFINE__ + if(isect->object != ~0) { +#ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_itfm; +#else + Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); +#endif + + P = transform_point(&tfm, P); + D = transform_direction(&tfm, D*t); + D = normalize_len(D, &t); + } + + P = P + D*t; + + float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0); + float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z; + float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z); + float rt = Oz * invDz; + + P = P + D*rt; + + if(isect->object != ~0) { +#ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_tfm; +#else + Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); +#endif + + P = transform_point(&tfm, P); + } + + return P; +#else + return P + D*t; +#endif +} + +/* same as above, except that isect->t is assumed to be in object space for instancing */ +ccl_device_inline float3 bvh_triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray) +{ + float3 P = ray->P; + float3 D = ray->D; + float t = isect->t; + +#ifdef __INTERSECTION_REFINE__ + if(isect->object != ~0) { +#ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_itfm; +#else + Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); +#endif + + P = transform_point(&tfm, P); + D = transform_direction(&tfm, D); + D = normalize(D); + } + + P = P + D*t; + + float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0); + float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z; + float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z); + float rt = Oz * invDz; + + P = P + D*rt; + + if(isect->object != ~0) { +#ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_tfm; +#else + 
Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); +#endif + + P = transform_point(&tfm, P); + } + + return P; +#else + return P + D*t; +#endif +} + +#ifdef __HAIR__ + +ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3) +{ + float fc = 0.71f; + float data[4]; + float t2 = t * t; + data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc; + data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t; + data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc; + data[3] = 3.0f * fc * t2 - 2.0f * fc * t; + return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; +} + +ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3) +{ + float data[4]; + float fc = 0.71f; + float t2 = t * t; + float t3 = t2 * t; + data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t; + data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f; + data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t; + data[3] = fc * t3 - fc * t2; + return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; +} + +ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray) +{ + int flag = kernel_data.curve.curveflags; + float t = isect->t; + float3 P = ray->P; + float3 D = ray->D; + + if(isect->object != ~0) { +#ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_itfm; +#else + Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); +#endif + + P = transform_point(&tfm, P); + D = transform_direction(&tfm, D*t); + D = normalize_len(D, &t); + } + + int prim = kernel_tex_fetch(__prim_index, isect->prim); + float4 v00 = kernel_tex_fetch(__curves, prim); + + int k0 = __float_as_int(v00.x) + isect->segment; + int k1 = k0 + 1; + + float4 P1 = kernel_tex_fetch(__curve_keys, k0); + float4 P2 = kernel_tex_fetch(__curve_keys, k1); + float l = 1.0f; + float3 tg = normalize_len(float4_to_float3(P2 - P1), &l); + float r1 = P1.w; + 
float r2 = P2.w; + float gd = ((r2 - r1)/l); + + P = P + D*t; + + if(flag & CURVE_KN_INTERPOLATE) { + int ka = max(k0 - 1,__float_as_int(v00.x)); + int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); + + float4 P0 = kernel_tex_fetch(__curve_keys, ka); + float4 P3 = kernel_tex_fetch(__curve_keys, kb); + + float3 p[4]; + p[0] = float4_to_float3(P0); + p[1] = float4_to_float3(P1); + p[2] = float4_to_float3(P2); + p[3] = float4_to_float3(P3); + +#ifdef __UV__ + sd->u = isect->u; + sd->v = 0.0f; +#endif + + tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); + + if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) + sd->Ng = normalize(-(D - tg * (dot(tg, D)))); + else { + float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); + sd->Ng = normalize(P - p_curr); + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); + } + sd->N = sd->Ng; + } + else { + float3 dif = P - float4_to_float3(P1); + +#ifdef __UV__ + sd->u = dot(dif,tg)/l; + sd->v = 0.0f; +#endif + + if (flag & CURVE_KN_TRUETANGENTGNORMAL) { + sd->Ng = -(D - tg * dot(tg, D)); + sd->Ng = normalize(sd->Ng); + } + else { + sd->Ng = (dif - tg * sd->u * l) / (P1.w + sd->u * l * gd); + if (gd != 0.0f) { + sd->Ng = sd->Ng - gd * tg ; + sd->Ng = normalize(sd->Ng); + } + } + + sd->N = sd->Ng; + } + +#ifdef __DPDU__ + /* dPdu/dPdv */ + sd->dPdu = tg; + sd->dPdv = cross(tg, sd->Ng); +#endif + + /*add fading parameter for minimum pixel width with transparency bsdf*/ + /*sd->curve_transparency = isect->v;*/ + /*sd->curve_radius = sd->u * gd * l + r1;*/ + + if(isect->object != ~0) { +#ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_tfm; +#else + Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); +#endif + + P = transform_point(&tfm, P); + } + + return P; +} +#endif + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h new file mode 100644 index 00000000000..40683a2da57 --- 
/dev/null +++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h @@ -0,0 +1,294 @@ +/* + * Adapted from code Copyright 2009-2010 NVIDIA Corporation, + * and code copyright 2009-2012 Intel Corporation + * + * Modifications Copyright 2011-2013, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function for subsurface scattering, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. 
+ * + * BVH_INSTANCING: object instancing + * BVH_MOTION: motion blur rendering + * + */ + +#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0) + +ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, + int subsurface_object, uint *lcg_state, int max_hits) +{ + /* todo: + * - test if pushing distance on the stack helps (for non shadow rays) + * - separate version for shadow rays + * - likely and unlikely for if() statements + * - SSE for hair + * - test restrict attribute for pointers + */ + + /* traversal stack in CUDA thread-local memory */ + int traversalStack[BVH_STACK_SIZE]; + traversalStack[0] = ENTRYPOINT_SENTINEL; + + /* traversal variables in registers */ + int stackPtr = 0; + int nodeAddr = kernel_data.bvh.root; + + /* ray parameters in registers */ + const float tmax = ray->t; + float3 P = ray->P; + float3 idir = bvh_inverse_direction(ray->D); + int object = ~0; + float isect_t = tmax; + + const uint visibility = ~0; + uint num_hits = 0; + +#if FEATURE(BVH_MOTION) + Transform ob_tfm; +#endif + +#if defined(__KERNEL_SSE2__) + const shuffle_swap_t shuf_identity = shuffle_swap_identity(); + const shuffle_swap_t shuf_swap = shuffle_swap_swap(); + + const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0)); + __m128 Psplat[3], idirsplat[3]; + shuffle_swap_t shufflexyz[3]; + + Psplat[0] = _mm_set_ps1(P.x); + Psplat[1] = _mm_set_ps1(P.y); + Psplat[2] = _mm_set_ps1(P.z); + + __m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + /* traversal loop */ + do { + do + { + /* traverse internal nodes */ + while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) + { + bool traverseChild0, traverseChild1; + int nodeAddrChild1; + +#if !defined(__KERNEL_SSE2__) + /* Intersect two child bounding boxes, non-SSE version */ + float t = isect_t; + + /* fetch node data */ + float4 node0 = 
kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); + float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); + float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); + + /* intersect ray against child nodes */ + NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; + NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; + NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; + NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; + NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); + NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + + NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; + NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; + NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); + NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + + /* decide which nodes to traverse next */ +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? 
*/ + traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility); +#else + traverseChild0 = (c0max >= c0min); + traverseChild1 = (c1max >= c1min); +#endif + +#else // __KERNEL_SSE2__ + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + + /* fetch node data */ + const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; + const float4 cnodes = ((float4*)bvh_nodes)[3]; + + /* intersect ray against child nodes */ + const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]); + const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]); + const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]); + + const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn); + const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax)); + + /* decide which nodes to traverse next */ +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? 
*/ + traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); +#else + traverseChild0 = (_mm_movemask_ps(lrhit) & 1); + traverseChild1 = (_mm_movemask_ps(lrhit) & 2); +#endif +#endif // __KERNEL_SSE2__ + + nodeAddr = __float_as_int(cnodes.x); + nodeAddrChild1 = __float_as_int(cnodes.y); + + if(traverseChild0 && traverseChild1) { + /* both children were intersected, push the farther one */ +#if !defined(__KERNEL_SSE2__) + bool closestChild1 = (c1min < c0min); +#else + union { __m128 m128; float v[4]; } uminmax; + uminmax.m128 = tminmax; + bool closestChild1 = uminmax.v[1] < uminmax.v[0]; +#endif + + if(closestChild1) { + int tmp = nodeAddr; + nodeAddr = nodeAddrChild1; + nodeAddrChild1 = tmp; + } + + ++stackPtr; + traversalStack[stackPtr] = nodeAddrChild1; + } + else { + /* one child was intersected */ + if(traverseChild1) { + nodeAddr = nodeAddrChild1; + } + else if(!traverseChild0) { + /* neither child was intersected */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + } + } + + /* if node is leaf, fetch triangle list */ + if(nodeAddr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1)); + int primAddr = __float_as_int(leaf.x); + +#if FEATURE(BVH_INSTANCING) + if(primAddr >= 0) { +#endif + int primAddr2 = __float_as_int(leaf.y); + + /* pop */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + + /* primitive intersection */ + for(; primAddr < primAddr2; primAddr++) { +#if FEATURE(BVH_HAIR) + uint segment = kernel_tex_fetch(__prim_segment, primAddr); + if(segment != ~0) + continue; +#endif + + /* only primitives from the same object */ + uint tri_object = (object == ~0)? 
kernel_tex_fetch(__prim_object, primAddr): object; + + if(tri_object == subsurface_object) { + + /* intersect ray against primitive */ + bvh_triangle_intersect_subsurface(kg, isect_array, P, idir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); + } + } + } +#if FEATURE(BVH_INSTANCING) + else { + /* instance push */ + if(subsurface_object == kernel_tex_fetch(__prim_object, -primAddr-1)) { + object = subsurface_object; + +#if FEATURE(BVH_MOTION) + bvh_instance_motion_push(kg, object, ray, &P, &idir, &isect_t, &ob_tfm, tmax); +#else + bvh_instance_push(kg, object, ray, &P, &idir, &isect_t, tmax); +#endif + +#if defined(__KERNEL_SSE2__) + Psplat[0] = _mm_set_ps1(P.x); + Psplat[1] = _mm_set_ps1(P.y); + Psplat[2] = _mm_set_ps1(P.z); + + tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + ++stackPtr; + traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + + nodeAddr = kernel_tex_fetch(__object_node, object); + } + else { + /* pop */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + } + } +#endif + } while(nodeAddr != ENTRYPOINT_SENTINEL); + +#if FEATURE(BVH_INSTANCING) + if(stackPtr >= 0) { + kernel_assert(object != ~0); + + /* instance pop */ +#if FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &idir, &isect_t, &ob_tfm, tmax); +#else + bvh_instance_pop(kg, object, ray, &P, &idir, &isect_t, tmax); +#endif + +#if defined(__KERNEL_SSE2__) + Psplat[0] = _mm_set_ps1(P.x); + Psplat[1] = _mm_set_ps1(P.y); + Psplat[2] = _mm_set_ps1(P.z); + + tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + object = ~0; + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } +#endif + } while(nodeAddr != ENTRYPOINT_SENTINEL); + + return num_hits; +} + +#undef FEATURE +#undef BVH_FUNCTION_NAME +#undef BVH_FUNCTION_FEATURES + diff --git 
a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h new file mode 100644 index 00000000000..0515a9e0fa7 --- /dev/null +++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h @@ -0,0 +1,354 @@ +/* + * Adapted from code Copyright 2009-2010 NVIDIA Corporation, + * and code copyright 2009-2012 Intel Corporation + * + * Modifications Copyright 2011-2013, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function, where various features can be + * enabled/disabled. This way we can compile optimized versions for each case + * without new features slowing things down. 
+ * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width + * BVH_MOTION: motion blur rendering + * + */ + +#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0) + +ccl_device bool BVH_FUNCTION_NAME +(KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility +#if FEATURE(BVH_HAIR_MINIMUM_WIDTH) +, uint *lcg_state, float difl, float extmax +#endif +) +{ + /* todo: + * - test if pushing distance on the stack helps (for non shadow rays) + * - separate version for shadow rays + * - likely and unlikely for if() statements + * - SSE for hair + * - test restrict attribute for pointers + */ + + /* traversal stack in CUDA thread-local memory */ + int traversalStack[BVH_STACK_SIZE]; + traversalStack[0] = ENTRYPOINT_SENTINEL; + + /* traversal variables in registers */ + int stackPtr = 0; + int nodeAddr = kernel_data.bvh.root; + + /* ray parameters in registers */ + const float tmax = ray->t; + float3 P = ray->P; + float3 idir = bvh_inverse_direction(ray->D); + int object = ~0; + +#if FEATURE(BVH_MOTION) + Transform ob_tfm; +#endif + + isect->t = tmax; + isect->object = ~0; + isect->prim = ~0; + isect->u = 0.0f; + isect->v = 0.0f; + +#if defined(__KERNEL_SSE2__) + const shuffle_swap_t shuf_identity = shuffle_swap_identity(); + const shuffle_swap_t shuf_swap = shuffle_swap_swap(); + + const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0)); + __m128 Psplat[3], idirsplat[3]; + shuffle_swap_t shufflexyz[3]; + + Psplat[0] = _mm_set_ps1(P.x); + Psplat[1] = _mm_set_ps1(P.y); + Psplat[2] = _mm_set_ps1(P.z); + + __m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + /* traversal loop */ + do { + do + { + /* traverse internal nodes */ + while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) + { + bool traverseChild0, traverseChild1; + int 
nodeAddrChild1; + +#if !defined(__KERNEL_SSE2__) + /* Intersect two child bounding boxes, non-SSE version */ + float t = isect->t; + + /* fetch node data */ + float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); + float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); + float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); + + /* intersect ray against child nodes */ + NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; + NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; + NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; + NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; + NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); + NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + + NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; + NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; + NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); + NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + +#if FEATURE(BVH_HAIR_MINIMUM_WIDTH) + if(difl != 0.0f) { + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if(__float_as_int(cnodes.w) & 
PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + extmax); + } + } +#endif + + /* decide which nodes to traverse next */ +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility); +#else + traverseChild0 = (c0max >= c0min); + traverseChild1 = (c1max >= c1min); +#endif + +#else // __KERNEL_SSE2__ + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + + /* fetch node data */ + const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; + const float4 cnodes = ((float4*)bvh_nodes)[3]; + + /* intersect ray against child nodes */ + const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]); + const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]); + const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]); + + /* calculate { c0min, c1min, -c0max, -c1max} */ + __m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)); + const __m128 tminmax = _mm_xor_ps(minmax, pn); + +#if FEATURE(BVH_HAIR_MINIMUM_WIDTH) + if(difl != 0.0f) { + float4 *tminmaxview = (float4*)&tminmax; + float &c0min = tminmaxview->x, &c1min = tminmaxview->y; + float &c0max = tminmaxview->z, &c1max = tminmaxview->w; + + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + extmax); + } + } +#endif + + const __m128 lrhit = 
_mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax)); + + /* decide which nodes to traverse next */ +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); +#else + traverseChild0 = (_mm_movemask_ps(lrhit) & 1); + traverseChild1 = (_mm_movemask_ps(lrhit) & 2); +#endif +#endif // __KERNEL_SSE2__ + + nodeAddr = __float_as_int(cnodes.x); + nodeAddrChild1 = __float_as_int(cnodes.y); + + if(traverseChild0 && traverseChild1) { + /* both children were intersected, push the farther one */ +#if !defined(__KERNEL_SSE2__) + bool closestChild1 = (c1min < c0min); +#else + union { __m128 m128; float v[4]; } uminmax; + uminmax.m128 = tminmax; + bool closestChild1 = uminmax.v[1] < uminmax.v[0]; +#endif + + if(closestChild1) { + int tmp = nodeAddr; + nodeAddr = nodeAddrChild1; + nodeAddrChild1 = tmp; + } + + ++stackPtr; + traversalStack[stackPtr] = nodeAddrChild1; + } + else { + /* one child was intersected */ + if(traverseChild1) { + nodeAddr = nodeAddrChild1; + } + else if(!traverseChild0) { + /* neither child was intersected */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + } + } + + /* if node is leaf, fetch triangle list */ + if(nodeAddr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1)); + int primAddr = __float_as_int(leaf.x); + +#if FEATURE(BVH_INSTANCING) + if(primAddr >= 0) { +#endif + int primAddr2 = __float_as_int(leaf.y); + + /* pop */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + + /* primitive intersection */ + while(primAddr < primAddr2) { + bool hit; + + /* intersect ray against primitive */ +#if FEATURE(BVH_HAIR) + uint segment = kernel_tex_fetch(__prim_segment, primAddr); + if(segment != ~0) { + + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) +#if 
FEATURE(BVH_HAIR_MINIMUM_WIDTH) + hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax); + else + hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax); +#else + hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment); + else + hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment); +#endif + } + else +#endif + hit = bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr); + + /* shadow ray early termination */ +#if defined(__KERNEL_SSE2__) + if(hit) { + if(visibility == PATH_RAY_SHADOW_OPAQUE) + return true; + + tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + } +#else + if(hit && visibility == PATH_RAY_SHADOW_OPAQUE) + return true; +#endif + + primAddr++; + } + } +#if FEATURE(BVH_INSTANCING) + else { + /* instance push */ + object = kernel_tex_fetch(__prim_object, -primAddr-1); + +#if FEATURE(BVH_MOTION) + bvh_instance_motion_push(kg, object, ray, &P, &idir, &isect->t, &ob_tfm, tmax); +#else + bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax); +#endif + +#if defined(__KERNEL_SSE2__) + Psplat[0] = _mm_set_ps1(P.x); + Psplat[1] = _mm_set_ps1(P.y); + Psplat[2] = _mm_set_ps1(P.z); + + tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + ++stackPtr; + traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + + nodeAddr = kernel_tex_fetch(__object_node, object); + } + } +#endif + } while(nodeAddr != ENTRYPOINT_SENTINEL); + +#if FEATURE(BVH_INSTANCING) + if(stackPtr >= 0) { + kernel_assert(object != ~0); + + /* instance pop */ +#if FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &idir, &isect->t, &ob_tfm, tmax); +#else + bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax); +#endif + +#if defined(__KERNEL_SSE2__) + Psplat[0] = 
_mm_set_ps1(P.x); + Psplat[1] = _mm_set_ps1(P.y); + Psplat[2] = _mm_set_ps1(P.z); + + tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + object = ~0; + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } +#endif + } while(nodeAddr != ENTRYPOINT_SENTINEL); + + return (isect->prim != ~0); +} + +#undef FEATURE +#undef BVH_FUNCTION_NAME +#undef BVH_FUNCTION_FEATURES + diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h new file mode 100644 index 00000000000..821ac50eaa9 --- /dev/null +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -0,0 +1,137 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __HAIR__
+
+/* curve attributes */
+/* Read a float attribute for the curve primitive referenced by sd.
+ *
+ * elem picks the lookup: ATTR_ELEMENT_CURVE fetches one value per curve at
+ * offset + sd->prim, ATTR_ELEMENT_CURVE_KEY linearly blends the values stored
+ * for the two keys of segment sd->segment using sd->u, and every other
+ * element yields 0.0f.
+ *
+ * dx and dy are optional outputs (NULL is accepted). They are only written
+ * when __RAY_DIFFERENTIALS__ is defined; otherwise they are left untouched. */
+ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
+{
+	if(elem == ATTR_ELEMENT_CURVE) {
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = 0.0f; /* one value for the whole curve, so no variation */
+		if(dy) *dy = 0.0f;
+#endif
+
+		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+	}
+	else if(elem == ATTR_ELEMENT_CURVE_KEY) {
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + sd->segment; /* curvedata.x is an int stored as float bits; presumably the curve's first key index - TODO confirm */
+		int k1 = k0 + 1; /* key at the other end of the segment */
+
+		float f0 = kernel_tex_fetch(__attributes_float, offset + k0);
+		float f1 = kernel_tex_fetch(__attributes_float, offset + k1);
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*(f1 - f0); /* (f1 - f0) is the slope of the blend below with respect to u */
+		if(dy) *dy = 0.0f; /* NOTE(review): sd->du.dy is not used, dy is always reported as zero */
+#endif
+
+		return (1.0f - sd->u)*f0 + sd->u*f1;
+	}
+	else {
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = 0.0f;
+		if(dy) *dy = 0.0f;
+#endif
+
+		return 0.0f;
+	}
+}
+/* float3 counterpart of curve_attribute_float(): same element handling and
+ * the same rules for the optional dx/dy outputs, but values come from
+ * __attributes_float3 (fetched as float4 and truncated to float3). */
+ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
+{
+	if(elem == ATTR_ELEMENT_CURVE) {
+		/* idea: we can't derive any useful differentials here, but for tiled
+		 * mipmap image caching it would be useful to avoid reading the highest
+		 * detail level always. maybe a derivative based on the hair density
+		 * could be computed somehow? */
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
+		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
+#endif
+
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+	}
+	else if(elem == ATTR_ELEMENT_CURVE_KEY) {
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + sd->segment; /* same key addressing as in curve_attribute_float() */
+		int k1 = k0 + 1;
+
+		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k0));
+		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k1));
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*(f1 - f0);
+		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
+#endif
+
+		return (1.0f - sd->u)*f0 + sd->u*f1;
+	}
+	else {
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
+		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
+#endif
+
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+}
+
+/* hair info node functions */
+/* Returns twice the value obtained by blending the .w component of the two
+ * curve keys of the current segment with sd->u (presumably .w is the key
+ * radius, which would make the result a diameter - TODO confirm).
+ * Returns 0.0f when sd->segment is ~0. */
+ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
+{
+	float r = 0.0f;
+
+	if(sd->segment != ~0) {
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + sd->segment;
+		int k1 = k0 + 1;
+
+		float4 P1 = kernel_tex_fetch(__curve_keys, k0);
+		float4 P2 = kernel_tex_fetch(__curve_keys, k1);
+		r = (P2.w - P1.w) * sd->u + P1.w; /* linear blend of the two key .w values */
+	}
+
+	return r*2.0f;
+}
+/* Returns a unit vector built from sd->I and sd->dPdu: the projection of
+ * -sd->I onto sd->dPdu is subtracted from -sd->I, and the result is negated
+ * and normalized, which leaves a vector perpendicular to sd->dPdu.
+ * Returns the zero vector when sd->segment is ~0. */
+ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
+{
+	float3 tgN = make_float3(0.0f,0.0f,0.0f);
+
+	if(sd->segment != ~0) {
+
+		tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu)));
+		tgN = normalize(tgN);
+
+		/* need to find suitable scaled gd for corrected normal.
+		 * the correction below is compiled out; gd is not defined in this
+		 * function. */
+#if 0
+		tgN = normalize(tgN - gd * sd->dPdu);
+#endif
+	}
+
+	return tgN;
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
new file mode 100644
index 00000000000..a66277e10cd
--- /dev/null
+++
b/intern/cycles/kernel/geom/geom_object.h @@ -0,0 +1,300 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +CCL_NAMESPACE_BEGIN + +enum ObjectTransform { + OBJECT_TRANSFORM = 0, + OBJECT_TRANSFORM_MOTION_PRE = 0, + OBJECT_INVERSE_TRANSFORM = 4, + OBJECT_TRANSFORM_MOTION_POST = 4, + OBJECT_PROPERTIES = 8, + OBJECT_DUPLI = 9 +}; + +enum ObjectVectorTransform { + OBJECT_VECTOR_MOTION_PRE = 0, + OBJECT_VECTOR_MOTION_POST = 3 +}; + +ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, int object, enum ObjectTransform type) +{ + int offset = object*OBJECT_SIZE + (int)type; + + Transform tfm; + tfm.x = kernel_tex_fetch(__objects, offset + 0); + tfm.y = kernel_tex_fetch(__objects, offset + 1); + tfm.z = kernel_tex_fetch(__objects, offset + 2); + tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); + + return tfm; +} + +ccl_device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int object, enum ObjectVectorTransform type) +{ + int offset = object*OBJECT_VECTOR_SIZE + (int)type; + + Transform tfm; + tfm.x = kernel_tex_fetch(__objects_vector, offset + 0); + tfm.y = kernel_tex_fetch(__objects_vector, offset + 1); + tfm.z = kernel_tex_fetch(__objects_vector, offset + 2); + tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); + + return tfm; +} + +#ifdef __OBJECT_MOTION__ +ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int object, float time) +{ + DecompMotionTransform 
motion; + + int offset = object*OBJECT_SIZE + (int)OBJECT_TRANSFORM_MOTION_PRE; + + motion.mid.x = kernel_tex_fetch(__objects, offset + 0); + motion.mid.y = kernel_tex_fetch(__objects, offset + 1); + motion.mid.z = kernel_tex_fetch(__objects, offset + 2); + motion.mid.w = kernel_tex_fetch(__objects, offset + 3); + + motion.pre_x = kernel_tex_fetch(__objects, offset + 4); + motion.pre_y = kernel_tex_fetch(__objects, offset + 5); + motion.post_x = kernel_tex_fetch(__objects, offset + 6); + motion.post_y = kernel_tex_fetch(__objects, offset + 7); + + Transform tfm; + transform_motion_interpolate(&tfm, &motion, time); + + return tfm; +} + +ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg, int object, float time, Transform *itfm) +{ + int object_flag = kernel_tex_fetch(__object_flag, object); + + if(object_flag & SD_OBJECT_MOTION) { + /* if we do motion blur */ + Transform tfm = object_fetch_transform_motion(kg, object, time); + + if(itfm) + *itfm = transform_quick_inverse(tfm); + + return tfm; + } + else { + Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); + if(itfm) + *itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); + + return tfm; + } +} +#endif + +ccl_device_inline void object_position_transform(KernelGlobals *kg, ShaderData *sd, float3 *P) +{ +#ifdef __OBJECT_MOTION__ + *P = transform_point(&sd->ob_tfm, *P); +#else + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + *P = transform_point(&tfm, *P); +#endif +} + +ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, ShaderData *sd, float3 *P) +{ +#ifdef __OBJECT_MOTION__ + *P = transform_point(&sd->ob_itfm, *P); +#else + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); + *P = transform_point(&tfm, *P); +#endif +} + +ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, ShaderData *sd, float3 *N) +{ +#ifdef __OBJECT_MOTION__ + *N = 
normalize(transform_direction_transposed(&sd->ob_tfm, *N)); +#else + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + *N = normalize(transform_direction_transposed(&tfm, *N)); +#endif +} + +ccl_device_inline void object_normal_transform(KernelGlobals *kg, ShaderData *sd, float3 *N) +{ +#ifdef __OBJECT_MOTION__ + *N = normalize(transform_direction_transposed(&sd->ob_itfm, *N)); +#else + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); + *N = normalize(transform_direction_transposed(&tfm, *N)); +#endif +} + +ccl_device_inline void object_dir_transform(KernelGlobals *kg, ShaderData *sd, float3 *D) +{ +#ifdef __OBJECT_MOTION__ + *D = transform_direction(&sd->ob_tfm, *D); +#else + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + *D = transform_direction(&tfm, *D); +#endif +} + +ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, ShaderData *sd, float3 *D) +{ +#ifdef __OBJECT_MOTION__ + *D = transform_direction(&sd->ob_itfm, *D); +#else + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); + *D = transform_direction(&tfm, *D); +#endif +} + +ccl_device_inline float3 object_location(KernelGlobals *kg, ShaderData *sd) +{ + if(sd->object == ~0) + return make_float3(0.0f, 0.0f, 0.0f); + +#ifdef __OBJECT_MOTION__ + return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w); +#else + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + return make_float3(tfm.x.w, tfm.y.w, tfm.z.w); +#endif +} + +ccl_device_inline float object_surface_area(KernelGlobals *kg, int object) +{ + int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES; + float4 f = kernel_tex_fetch(__objects, offset); + return f.x; +} + +ccl_device_inline float object_pass_id(KernelGlobals *kg, int object) +{ + if(object == ~0) + return 0.0f; + + int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES; + float4 f = kernel_tex_fetch(__objects, offset); + return 
f.y; +} + +ccl_device_inline float object_random_number(KernelGlobals *kg, int object) +{ + if(object == ~0) + return 0.0f; + + int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES; + float4 f = kernel_tex_fetch(__objects, offset); + return f.z; +} + +ccl_device_inline uint object_particle_id(KernelGlobals *kg, int object) +{ + if(object == ~0) + return 0.0f; + + int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES; + float4 f = kernel_tex_fetch(__objects, offset); + return __float_as_uint(f.w); +} + +ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object) +{ + if(object == ~0) + return make_float3(0.0f, 0.0f, 0.0f); + + int offset = object*OBJECT_SIZE + OBJECT_DUPLI; + float4 f = kernel_tex_fetch(__objects, offset); + return make_float3(f.x, f.y, f.z); +} + +ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object) +{ + if(object == ~0) + return make_float3(0.0f, 0.0f, 0.0f); + + int offset = object*OBJECT_SIZE + OBJECT_DUPLI; + float4 f = kernel_tex_fetch(__objects, offset + 1); + return make_float3(f.x, f.y, 0.0f); +} + + +ccl_device int shader_pass_id(KernelGlobals *kg, ShaderData *sd) +{ + return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2 + 1); +} + +ccl_device_inline float particle_index(KernelGlobals *kg, int particle) +{ + int offset = particle*PARTICLE_SIZE; + float4 f = kernel_tex_fetch(__particles, offset + 0); + return f.x; +} + +ccl_device float particle_age(KernelGlobals *kg, int particle) +{ + int offset = particle*PARTICLE_SIZE; + float4 f = kernel_tex_fetch(__particles, offset + 0); + return f.y; +} + +ccl_device float particle_lifetime(KernelGlobals *kg, int particle) +{ + int offset = particle*PARTICLE_SIZE; + float4 f = kernel_tex_fetch(__particles, offset + 0); + return f.z; +} + +ccl_device float particle_size(KernelGlobals *kg, int particle) +{ + int offset = particle*PARTICLE_SIZE; + float4 f = kernel_tex_fetch(__particles, offset + 0); + return f.w; +} + +ccl_device float4 
particle_rotation(KernelGlobals *kg, int particle) +{ + int offset = particle*PARTICLE_SIZE; + float4 f = kernel_tex_fetch(__particles, offset + 1); + return f; +} + +ccl_device float3 particle_location(KernelGlobals *kg, int particle) +{ + int offset = particle*PARTICLE_SIZE; + float4 f = kernel_tex_fetch(__particles, offset + 2); + return make_float3(f.x, f.y, f.z); +} + +ccl_device float3 particle_velocity(KernelGlobals *kg, int particle) +{ + int offset = particle*PARTICLE_SIZE; + float4 f2 = kernel_tex_fetch(__particles, offset + 2); + float4 f3 = kernel_tex_fetch(__particles, offset + 3); + return make_float3(f2.w, f3.x, f3.y); +} + +ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle) +{ + int offset = particle*PARTICLE_SIZE; + float4 f3 = kernel_tex_fetch(__particles, offset + 3); + float4 f4 = kernel_tex_fetch(__particles, offset + 4); + return make_float3(f3.z, f3.w, f4.x); +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h new file mode 100644 index 00000000000..0455df85961 --- /dev/null +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -0,0 +1,180 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Point on triangle for Moller-Trumbore triangles
+ *
+ * Returns u*v0 + v*v1 + (1 - u - v)*v2, where v0..v2 are the vertices of
+ * triangle tri_index looked up through __tri_vindex and __tri_verts. */
+ccl_device_inline float3 triangle_point_MT(KernelGlobals *kg, int tri_index, float u, float v)
+{
+	/* load triangle vertices; the indices are ints stored as float bits */
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
+
+	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
+	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
+	float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
+
+	/* compute point */
+	float t = 1.0f - u - v;
+	return (u*v0 + v*v1 + t*v2);
+}
+
+/* Normal for Moller-Trumbore triangles
+ *
+ * The active branch reads the normal from __tri_normal (xyz) and writes the
+ * .w component, reinterpreted as an int, to *shader. The "#if 0" branch that
+ * recomputes the normal from the vertices is compiled out; note that it
+ * would not write *shader. */
+ccl_device_inline float3 triangle_normal_MT(KernelGlobals *kg, int tri_index, int *shader)
+{
+#if 0
+	/* load triangle vertices */
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
+
+	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
+	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
+	float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
+
+	/* compute normal */
+	return normalize(cross(v2 - v0, v1 - v0));
+#else
+	float4 Nm = kernel_tex_fetch(__tri_normal, tri_index);
+	*shader = __float_as_int(Nm.w);
+	return make_float3(Nm.x, Nm.y, Nm.z);
+#endif
+}
+
+/* Return 3 triangle vertex locations
+ *
+ * P must point to at least three float3; P[0..2] receive the vertices in
+ * the order stored in __tri_vindex. */
+ccl_device_inline void triangle_vertices(KernelGlobals *kg, int tri_index, float3 P[3])
+{
+	/* load triangle vertices */
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
+
+	P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
+	P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
+	P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
+}
+/* Blend the three vertex normals from __tri_vnormal with the same weights
+ * triangle_point_MT() uses for positions (u, v, 1 - u - v) and normalize
+ * the result. */
+ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int tri_index, float u, float v)
+{
+	/* load triangle vertices */
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
+
+	float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x)));
+	float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y)));
+	float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.z)));
+
+	return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+}
+/* Write the position derivatives of triangle `tri` to *dPdu and *dPdv.
+ * With P = u*p0 + v*p1 + (1 - u - v)*p2 these are p0 - p2 and p1 - p2.
+ * Both output pointers are dereferenced unconditionally. */
+ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, float3 *dPdu, float3 *dPdv, int tri)
+{
+	/* fetch triangle vertex coordinates */
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri));
+
+	float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
+	float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
+	float3 p2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
+
+	/* compute derivatives of P w.r.t. uv */
+	*dPdu = (p0 - p2);
+	*dPdv = (p1 - p2);
+}
+
+/* attributes */
+/* Read a float attribute for the triangle sd->prim.
+ *
+ * elem picks the lookup: ATTR_ELEMENT_FACE fetches one value per triangle,
+ * ATTR_ELEMENT_VERTEX fetches one value per vertex through __tri_vindex,
+ * ATTR_ELEMENT_CORNER fetches three consecutive values at
+ * offset + sd->prim*3; vertex and corner values are blended with sd->u,
+ * sd->v and 1 - u - v. Any other element yields 0.0f.
+ *
+ * dx and dy are optional outputs (NULL is accepted). The face and fallback
+ * branches always zero them; the vertex and corner branches only write
+ * them when __RAY_DIFFERENTIALS__ is defined. */
+ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
+{
+	if(elem == ATTR_ELEMENT_FACE) {
+		if(dx) *dx = 0.0f;
+		if(dy) *dy = 0.0f;
+
+		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+	}
+	else if(elem == ATTR_ELEMENT_VERTEX) {
+		float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+
+		float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x));
+		float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y));
+		float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z));
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; /* the blend below with u, v replaced by their x derivatives */
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; /* same with the y derivatives */
+#endif
+
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+	}
+	else if(elem == ATTR_ELEMENT_CORNER) {
+		int tri = offset + sd->prim*3; /* index of the first of this triangle's three corner values */
+		float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
+		float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
+		float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+#endif
+
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+	}
+	else {
+		if(dx) *dx = 0.0f;
+		if(dy) *dy = 0.0f;
+
+		return 0.0f;
+	}
+}
+/* float3 counterpart of triangle_attribute_float(): same element handling
+ * and the same rules for the optional dx/dy outputs, but values come from
+ * __attributes_float3 (fetched as float4 and truncated to float3). */
+ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
+{
+	if(elem == ATTR_ELEMENT_FACE) {
+		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
+		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
+
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+	}
+	else if(elem == ATTR_ELEMENT_VERTEX) {
+		float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+
+		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x)));
+		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y)));
+		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z)));
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+#endif
+
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+	}
+	else if(elem == ATTR_ELEMENT_CORNER) {
+		int tri = offset + sd->prim*3; /* index of the first of this triangle's three corner values */
+		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
+		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
+		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+#endif
+
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+	}
+	else {
+		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
+		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
+
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h
deleted file mode 100644
index 942c7abce65..00000000000
--- a/intern/cycles/kernel/kernel_bvh.h
+++ /dev/null
@@ -1,1318 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* - * "Persistent while-while kernel" used in: - * - * "Understanding the Efficiency of Ray Traversal on GPUs", - * Timo Aila and Samuli Laine, - * Proc. High-Performance Graphics 2009 - */ - -/* bottom-most stack entry, indicating the end of traversal */ -#define ENTRYPOINT_SENTINEL 0x76543210 - -/* 64 object BVH + 64 mesh BVH + 64 object node splitting */ -#define BVH_STACK_SIZE 192 -#define BVH_NODE_SIZE 4 -#define TRI_NODE_SIZE 3 - -/* silly workaround for float extended precision that happens when compiling - * without sse support on x86, it results in different results for float ops - * that you would otherwise expect to compare correctly */ -#if !defined(__i386__) || defined(__SSE__) -#define NO_EXTENDED_PRECISION -#else -#define NO_EXTENDED_PRECISION volatile -#endif - -ccl_device_inline float3 bvh_inverse_direction(float3 dir) -{ - /* avoid divide by zero (ooeps = exp2f(-80.0f)) */ - float ooeps = 0.00000000000000000000000082718061255302767487140869206996285356581211090087890625f; - float3 idir; - - idir.x = 1.0f/((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x)); - idir.y = 1.0f/((fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y)); - idir.z = 1.0f/((fabsf(dir.z) > ooeps)? 
dir.z: copysignf(ooeps, dir.z)); - - return idir; -} - -ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax) -{ - Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); - - *P = transform_point(&tfm, ray->P); - - float3 dir = transform_direction(&tfm, ray->D); - - float len; - dir = normalize_len(dir, &len); - - *idir = bvh_inverse_direction(dir); - - if(*t != FLT_MAX) - *t *= len; -} - -ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax) -{ - if(*t != FLT_MAX) { - Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); - *t *= len(transform_direction(&tfm, 1.0f/(*idir))); - } - - *P = ray->P; - *idir = bvh_inverse_direction(ray->D); -} - -#ifdef __OBJECT_MOTION__ -ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax) -{ - Transform itfm; - *tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm); - - *P = transform_point(&itfm, ray->P); - - float3 dir = transform_direction(&itfm, ray->D); - - float len; - dir = normalize_len(dir, &len); - - *idir = bvh_inverse_direction(dir); - - if(*t != FLT_MAX) - *t *= len; -} - -ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax) -{ - if(*t != FLT_MAX) - *t *= len(transform_direction(tfm, 1.0f/(*idir))); - - *P = ray->P; - *idir = bvh_inverse_direction(ray->D); -} -#endif - -/* Sven Woop's algorithm */ -ccl_device_inline bool bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 idir, uint visibility, int object, int triAddr) -{ - /* compute and check intersection t-value */ - float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0); - float4 v11 = 
kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1); - float3 dir = 1.0f/idir; - - float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z; - float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z); - float t = Oz * invDz; - - if(t > 0.0f && t < isect->t) { - /* compute and check barycentric u */ - float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z; - float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z; - float u = Ox + t*Dx; - - if(u >= 0.0f) { - /* compute and check barycentric v */ - float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2); - float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z; - float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z; - float v = Oy + t*Dy; - - if(v >= 0.0f && u + v <= 1.0f) { -#ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility) -#endif - { - /* record intersection */ - isect->prim = triAddr; - isect->object = object; - isect->u = u; - isect->v = v; - isect->t = t; - return true; - } - } - } - } - - return false; -} - -#ifdef __HAIR__ -ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta, float *extrema, float *extremtb, float *extremb, float p0, float p1, float p2, float p3) -{ - float halfdiscroot = (p2 * p2 - 3 * p3 * p1); - float ta = -1.0f; - float tb = -1.0f; - *extremta = -1.0f; - *extremtb = -1.0f; - *upper = p0; - *lower = p0 + p1 + p2 + p3; - *extrema = *upper; - *extremb = *lower; - if(*lower >= *upper) { - *upper = *lower; - *lower = p0; - } - - if(halfdiscroot >= 0) { - halfdiscroot = sqrt(halfdiscroot); - ta = (-p2 - halfdiscroot) / (3 * p3); - tb = (-p2 + halfdiscroot) / (3 * p3); - } - - float t2; - float t3; - if(ta > 0.0f && ta < 1.0f) { - t2 = ta * ta; - t3 = t2 * ta; - *extremta = ta; - *extrema = p3 * t3 + p2 * t2 + p1 * ta + p0; - if(*extrema > *upper) { - *upper = *extrema; - } - if(*extrema < *lower) { - *lower = 
*extrema; - } - } - if(tb > 0.0f && tb < 1.0f) { - t2 = tb * tb; - t3 = t2 * tb; - *extremtb = tb; - *extremb = p3 * t3 + p2 * t2 + p1 * tb + p0; - if(*extremb >= *upper) { - *upper = *extremb; - } - if(*extremb <= *lower) { - *lower = *extremb; - } - } -} - -#ifdef __KERNEL_SSE2__ -ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a) -{ - return fma(broadcast<0>(a), t[0], fma(broadcast<1>(a), t[1], _mm_mul_ps(broadcast<2>(a), t[2]))); -} -#endif - -#ifdef __KERNEL_SSE2__ -/* Pass P and idir by reference to aligned vector */ -ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, - const float3 &P, const float3 &idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax) -#else -ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax) -#endif -{ - float epsilon = 0.0f; - float r_st, r_en; - - int depth = kernel_data.curve.subdivisions; - int flags = kernel_data.curve.curveflags; - int prim = kernel_tex_fetch(__prim_index, curveAddr); - -#ifdef __KERNEL_SSE2__ - __m128 vdir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir)); - __m128 vcurve_coef[4]; - const float3 *curve_coef = (float3 *)vcurve_coef; - - { - __m128 dtmp = _mm_mul_ps(vdir, vdir); - __m128 d_ss = _mm_sqrt_ss(_mm_add_ss(dtmp, broadcast<2>(dtmp))); - __m128 rd_ss = _mm_div_ss(_mm_set_ss(1.0f), d_ss); - - __m128i v00vec = _mm_load_si128((__m128i *)&kg->__curves.data[prim]); - int2 &v00 = (int2 &)v00vec; - - int k0 = v00.x + segment; - int k1 = k0 + 1; - int ka = max(k0 - 1, v00.x); - int kb = min(k1 + 1, v00.x + v00.y - 1); - - __m128 P0 = _mm_load_ps(&kg->__curve_keys.data[ka].x); - __m128 P1 = _mm_load_ps(&kg->__curve_keys.data[k0].x); - __m128 P2 = _mm_load_ps(&kg->__curve_keys.data[k1].x); - __m128 P3 = 
_mm_load_ps(&kg->__curve_keys.data[kb].x); - - __m128 rd_sgn = set_sign_bit<0, 1, 1, 1>(broadcast<0>(rd_ss)); - __m128 mul_zxxy = _mm_mul_ps(shuffle<2, 0, 0, 1>(vdir), rd_sgn); - __m128 mul_yz = _mm_mul_ps(shuffle<1, 2, 1, 2>(vdir), mul_zxxy); - __m128 mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); - __m128 vdir0 = _mm_and_ps(vdir, _mm_castsi128_ps(_mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0))); - - __m128 htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); - __m128 htfm1 = shuffle<1, 0, 1, 3>(_mm_set_ss(_mm_cvtss_f32(d_ss)), vdir0); - __m128 htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); - - __m128 htfm[] = { htfm0, htfm1, htfm2 }; - __m128 vP = load_m128(P); - __m128 p0 = transform_point_T3(htfm, _mm_sub_ps(P0, vP)); - __m128 p1 = transform_point_T3(htfm, _mm_sub_ps(P1, vP)); - __m128 p2 = transform_point_T3(htfm, _mm_sub_ps(P2, vP)); - __m128 p3 = transform_point_T3(htfm, _mm_sub_ps(P3, vP)); - - float fc = 0.71f; - __m128 vfc = _mm_set1_ps(fc); - __m128 vfcxp3 = _mm_mul_ps(vfc, p3); - - vcurve_coef[0] = p1; - vcurve_coef[1] = _mm_mul_ps(vfc, _mm_sub_ps(p2, p0)); - vcurve_coef[2] = fma(_mm_set1_ps(fc * 2.0f), p0, fma(_mm_set1_ps(fc - 3.0f), p1, fms(_mm_set1_ps(3.0f - 2.0f * fc), p2, vfcxp3))); - vcurve_coef[3] = fms(_mm_set1_ps(fc - 2.0f), _mm_sub_ps(p2, p1), fms(vfc, p0, vfcxp3)); - - r_st = ((float4 &)P1).w; - r_en = ((float4 &)P2).w; - } -#else - float3 curve_coef[4]; - - /* curve Intersection check */ - float3 dir = 1.0f/idir; - - /* obtain curve parameters */ - { - /* ray transform created - this should be created at beginning of intersection loop */ - Transform htfm; - float d = sqrtf(dir.x * dir.x + dir.z * dir.z); - htfm = make_transform( - dir.z / d, 0, -dir.x /d, 0, - -dir.x * dir.y /d, d, -dir.y * dir.z /d, 0, - dir.x, dir.y, dir.z, 0, - 0, 0, 0, 1); - - float4 v00 = kernel_tex_fetch(__curves, prim); - - int k0 = __float_as_int(v00.x) + segment; - int k1 = k0 + 1; - - int ka = max(k0 - 1,__float_as_int(v00.x)); - int kb = min(k1 + 
1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); - - float4 P0 = kernel_tex_fetch(__curve_keys, ka); - float4 P1 = kernel_tex_fetch(__curve_keys, k0); - float4 P2 = kernel_tex_fetch(__curve_keys, k1); - float4 P3 = kernel_tex_fetch(__curve_keys, kb); - - float3 p0 = transform_point(&htfm, float4_to_float3(P0) - P); - float3 p1 = transform_point(&htfm, float4_to_float3(P1) - P); - float3 p2 = transform_point(&htfm, float4_to_float3(P2) - P); - float3 p3 = transform_point(&htfm, float4_to_float3(P3) - P); - - float fc = 0.71f; - curve_coef[0] = p1; - curve_coef[1] = -fc*p0 + fc*p2; - curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3; - curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3; - r_st = P1.w; - r_en = P2.w; - } -#endif - - float r_curr = max(r_st, r_en); - - if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING)) - epsilon = 2 * r_curr; - - /* find bounds - this is slow for cubic curves */ - float upper, lower; - - float zextrem[4]; - curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z); - if(lower - r_curr > isect->t || upper + r_curr < epsilon) - return false; - - /* minimum width extension */ - float mw_extension = min(difl * fabsf(upper), extmax); - float r_ext = mw_extension + r_curr; - - float xextrem[4]; - curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x); - if(lower > r_ext || upper < -r_ext) - return false; - - float yextrem[4]; - curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y); - if(lower > r_ext || upper < -r_ext) - return false; - - /* setup recurrent loop */ - int level = 1 << depth; - int tree = 0; - float resol = 1.0f / (float)level; - bool hit = false; - - /* begin loop */ - 
while(!(tree >> (depth))) { - float i_st = tree * resol; - float i_en = i_st + (level * resol); -#ifdef __KERNEL_SSE2__ - __m128 vi_st = _mm_set1_ps(i_st), vi_en = _mm_set1_ps(i_en); - __m128 vp_st = fma(fma(fma(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); - __m128 vp_en = fma(fma(fma(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]); - - __m128 vbmin = _mm_min_ps(vp_st, vp_en); - __m128 vbmax = _mm_max_ps(vp_st, vp_en); - - float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax; - float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z; - float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z; - float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en; -#else - float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0]; - float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0]; - - float bminx = min(p_st.x, p_en.x); - float bmaxx = max(p_st.x, p_en.x); - float bminy = min(p_st.y, p_en.y); - float bmaxy = max(p_st.y, p_en.y); - float bminz = min(p_st.z, p_en.z); - float bmaxz = max(p_st.z, p_en.z); -#endif - - if(xextrem[0] >= i_st && xextrem[0] <= i_en) { - bminx = min(bminx,xextrem[1]); - bmaxx = max(bmaxx,xextrem[1]); - } - if(xextrem[2] >= i_st && xextrem[2] <= i_en) { - bminx = min(bminx,xextrem[3]); - bmaxx = max(bmaxx,xextrem[3]); - } - if(yextrem[0] >= i_st && yextrem[0] <= i_en) { - bminy = min(bminy,yextrem[1]); - bmaxy = max(bmaxy,yextrem[1]); - } - if(yextrem[2] >= i_st && yextrem[2] <= i_en) { - bminy = min(bminy,yextrem[3]); - bmaxy = max(bmaxy,yextrem[3]); - } - if(zextrem[0] >= i_st && zextrem[0] <= i_en) { - bminz = min(bminz,zextrem[1]); - bmaxz = max(bmaxz,zextrem[1]); - } - if(zextrem[2] >= i_st && zextrem[2] <= i_en) { - bminz = min(bminz,zextrem[3]); - bmaxz = max(bmaxz,zextrem[3]); - } - - float r1 = r_st + (r_en - r_st) * i_st; - float r2 = r_st + (r_en - r_st) * i_en; - 
r_curr = max(r1, r2); - - mw_extension = min(difl * fabsf(bmaxz), extmax); - float r_ext = mw_extension + r_curr; - float coverage = 1.0f; - - if (bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) { - /* the bounding box does not overlap the square centered at O */ - tree += level; - level = tree & -tree; - } - else if (level == 1) { - - /* the maximum recursion depth is reached. - * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. - * dP* is reversed if necessary.*/ - float t = isect->t; - float u = 0.0f; - if(flags & CURVE_KN_RIBBONS) { - float3 tg = (p_en - p_st); - float w = tg.x * tg.x + tg.y * tg.y; - if (w == 0) { - tree++; - level = tree & -tree; - continue; - } - w = -(p_st.x * tg.x + p_st.y * tg.y) / w; - w = clamp((float)w, 0.0f, 1.0f); - - /* compute u on the curve segment */ - u = i_st * (1 - w) + i_en * w; - r_curr = r_st + (r_en - r_st) * u; - /* compare x-y distances */ - float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0]; - - float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; - if (dot(tg, dp_st)< 0) - dp_st *= -1; - if (dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) { - tree++; - level = tree & -tree; - continue; - } - float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; - if (dot(tg, dp_en) < 0) - dp_en *= -1; - if (dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) { - tree++; - level = tree & -tree; - continue; - } - - /* compute coverage */ - float r_ext = r_curr; - coverage = 1.0f; - if(difl != 0.0f) { - mw_extension = min(difl * fabsf(bmaxz), extmax); - r_ext = mw_extension + r_curr; - float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y); - float d0 = d - r_curr; - float d1 = d + r_curr; - if (d0 >= 0) - coverage = (min(d1 / mw_extension, 1.0f) - min(d0 / mw_extension, 1.0f)) * 0.5f; - else // inside - coverage = (min(d1 / mw_extension, 1.0f) + min(-d0 / mw_extension, 
1.0f)) * 0.5f; - } - - if (p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) { - tree++; - level = tree & -tree; - continue; - } - - t = p_curr.z; - } - else { - float l = len(p_en - p_st); - /* minimum width extension */ - float or1 = r1; - float or2 = r2; - if(difl != 0.0f) { - mw_extension = min(len(p_st - P) * difl, extmax); - or1 = r1 < mw_extension ? mw_extension : r1; - mw_extension = min(len(p_en - P) * difl, extmax); - or2 = r2 < mw_extension ? mw_extension : r2; - } - /* --- */ - float3 tg = (p_en - p_st) / l; - float gd = (or2 - or1) / l; - float difz = -dot(p_st,tg); - float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd)); - float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1))); - float tcentre = -halfb/cyla; - float zcentre = difz + (tg.z * tcentre); - float3 tdif = - p_st; - tdif.z += tcentre; - float tdifz = dot(tdif,tg); - float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1))); - float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd; - float td = tb*tb - 4*cyla*tc; - if (td < 0.0f) { - tree++; - level = tree & -tree; - continue; - } - - float rootd = sqrtf(td); - float correction = ((-tb - rootd)/(2*cyla)); - t = tcentre + correction; - - float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; - if (dot(tg, dp_st)< 0) - dp_st *= -1; - float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; - if (dot(tg, dp_en) < 0) - dp_en *= -1; - - if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) { - correction = ((-tb + rootd)/(2*cyla)); - t = tcentre + correction; - } - - if (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) { - tree++; - level = tree & -tree; - continue; - } - - float w = (zcentre + (tg.z * correction))/l; - w = clamp((float)w, 0.0f, 1.0f); - /* compute u 
on the curve segment */ - u = i_st * (1 - w) + i_en * w; - r_curr = r1 + (r2 - r1) * w; - r_ext = or1 + (or2 - or1) * w; - coverage = r_curr/r_ext; - - } - /* we found a new intersection */ - - /* stochastic fade from minimum width */ - if(lcg_state && coverage != 1.0f) { - if(lcg_step_float(lcg_state) > coverage) - return hit; - } - -#ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) -#endif - { - /* record intersection */ - isect->prim = curveAddr; - isect->segment = segment; - isect->object = object; - isect->u = u; - isect->v = 0.0f; - /*isect->v = 1.0f - coverage; */ - isect->t = t; - hit = true; - } - - tree++; - level = tree & -tree; - } - else { - /* split the curve into two curves and process */ - level = level >> 1; - } - } - - return hit; -} - -ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax) -{ - /* define few macros to minimize code duplication for SSE */ -#ifndef __KERNEL_SSE2__ -#define len3_squared(x) len_squared(x) -#define len3(x) len(x) -#define dot3(x, y) dot(x, y) -#endif - - /* curve Intersection check */ - int flags = kernel_data.curve.curveflags; - - int prim = kernel_tex_fetch(__prim_index, curveAddr); - float4 v00 = kernel_tex_fetch(__curves, prim); - - int cnum = __float_as_int(v00.x); - int k0 = cnum + segment; - int k1 = k0 + 1; - -#ifndef __KERNEL_SSE2__ - float4 P1 = kernel_tex_fetch(__curve_keys, k0); - float4 P2 = kernel_tex_fetch(__curve_keys, k1); - - float or1 = P1.w; - float or2 = P2.w; - float3 p1 = float4_to_float3(P1); - float3 p2 = float4_to_float3(P2); - - /* minimum width extension */ - float r1 = or1; - float r2 = or2; - float3 dif = P - p1; - float3 dif_second = P - p2; - if(difl != 0.0f) { - float pixelsize = 
min(len3(dif) * difl, extmax); - r1 = or1 < pixelsize ? pixelsize : or1; - pixelsize = min(len3(dif_second) * difl, extmax); - r2 = or2 < pixelsize ? pixelsize : or2; - } - /* --- */ - - float3 dir = 1.0f / idir; - float3 p21_diff = p2 - p1; - float3 sphere_dif1 = (dif + dif_second) * 0.5f; - float sphere_b_tmp = dot3(dir, sphere_dif1); - float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; -#else - const __m128 p1 = _mm_load_ps(&kg->__curve_keys.data[k0].x); - const __m128 p2 = _mm_load_ps(&kg->__curve_keys.data[k1].x); - const __m128 or12 = shuffle<3, 3, 3, 3>(p1, p2); - - __m128 r12 = or12; - const __m128 vP = load_m128(P); - const __m128 dif = _mm_sub_ps(vP, p1); - const __m128 dif_second = _mm_sub_ps(vP, p2); - if(difl != 0.0f) { - const __m128 len1_sq = len3_squared_splat(dif); - const __m128 len2_sq = len3_squared_splat(dif_second); - const __m128 len12 = _mm_sqrt_ps(shuffle<0, 0, 0, 0>(len1_sq, len2_sq)); - const __m128 pixelsize12 = _mm_min_ps(_mm_mul_ps(len12, _mm_set1_ps(difl)), _mm_set1_ps(extmax)); - r12 = _mm_max_ps(or12, pixelsize12); - } - float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12)); - float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12)); - - const __m128 dir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir)); - const __m128 p21_diff = _mm_sub_ps(p2, p1); - const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f)); - const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1); - const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1); -#endif - - float mr = max(r1, r2); - float l = len3(p21_diff); - float invl = 1.0f / l; - float sp_r = mr + 0.5f * l; - - float sphere_b = dot3(dir, sphere_dif2); - float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r; - - if(sdisc < 0.0f) - return false; - - /* obtain parameters and test midpoint distance for suitable modes */ -#ifndef __KERNEL_SSE2__ - float3 tg = p21_diff * invl; -#else - const __m128 tg = 
_mm_mul_ps(p21_diff, _mm_set1_ps(invl)); -#endif - float gd = (r2 - r1) * invl; - - float dirz = dot3(dir, tg); - float difz = dot3(dif, tg); - - float a = 1.0f - (dirz*dirz*(1 + gd*gd)); - - float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1)); - - float tcentre = -halfb/a; - float zcentre = difz + (dirz * tcentre); - - if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE)) - return false; - if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION)) - return false; - - /* test minimum separation */ -#ifndef __KERNEL_SSE2__ - float3 cprod = cross(tg, dir); - float cprod2sq = len3_squared(cross(tg, dif)); -#else - const __m128 cprod = cross(tg, dir); - float cprod2sq = len3_squared(cross_zxy(tg, dif)); -#endif - float cprodsq = len3_squared(cprod); - float distscaled = dot3(cprod, dif); - - if(cprodsq == 0) - distscaled = cprod2sq; - else - distscaled = (distscaled*distscaled)/cprodsq; - - if(distscaled > mr*mr) - return false; - - /* calculate true intersection */ -#ifndef __KERNEL_SSE2__ - float3 tdif = dif + tcentre * dir; -#else - const __m128 tdif = fma(_mm_set1_ps(tcentre), dir, dif); -#endif - float tdifz = dot3(tdif, tg); - float tdifma = tdifz*gd + r1; - float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma)); - float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma; - float td = tb*tb - 4*a*tc; - - if (td < 0.0f) - return false; - - float rootd = 0.0f; - float correction = 0.0f; - if(flags & CURVE_KN_ACCURATE) { - rootd = sqrtf(td); - correction = ((-tb - rootd)/(2*a)); - } - - float t = tcentre + correction; - - if(t < isect->t) { - - if(flags & CURVE_KN_INTERSECTCORRECTION) { - rootd = sqrtf(td); - correction = ((-tb - rootd)/(2*a)); - t = tcentre + correction; - } - - float z = zcentre + (dirz * correction); - bool backface = false; - - if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) { - backface = true; - correction = ((-tb + rootd)/(2*a)); - t = tcentre + correction; - z 
= zcentre + (dirz * correction); - } - - /* stochastic fade from minimum width */ - float adjradius = or1 + z * (or2 - or1) * invl; - adjradius = adjradius / (r1 + z * gd); - if(lcg_state && adjradius != 1.0f) { - if(lcg_step_float(lcg_state) > adjradius) - return false; - } - /* --- */ - - if(t > 0.0f && t < isect->t && z >= 0 && z <= l) { - - if (flags & CURVE_KN_ENCLOSEFILTER) { - float enc_ratio = 1.01f; - if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) { - float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio)); - float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio; - if(a2*c2 < 0.0f) - return false; - } - } - -#ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) -#endif - { - /* record intersection */ - isect->prim = curveAddr; - isect->segment = segment; - isect->object = object; - isect->u = z*invl; - isect->v = td/(4*a*a); - /*isect->v = 1.0f - adjradius;*/ - isect->t = t; - - if(backface) - isect->u = -isect->u; - - return true; - } - } - } - - return false; - -#ifndef __KERNEL_SSE2__ -#undef len3_squared -#undef len3 -#undef dot3 -#endif -} -#endif - -#ifdef __SUBSURFACE__ -/* Special ray intersection routines for subsurface scattering. In that case we - * only want to intersect with primitives in the same object, and if case of - * multiple hits we pick a single random primitive as the intersection point. 
*/ - -ccl_device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array, - float3 P, float3 idir, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits) -{ - /* compute and check intersection t-value */ - float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0); - float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1); - float3 dir = 1.0f/idir; - - float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z; - float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z); - float t = Oz * invDz; - - if(t > 0.0f && t < tmax) { - /* compute and check barycentric u */ - float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z; - float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z; - float u = Ox + t*Dx; - - if(u >= 0.0f) { - /* compute and check barycentric v */ - float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2); - float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z; - float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z; - float v = Oy + t*Dy; - - if(v >= 0.0f && u + v <= 1.0f) { - (*num_hits)++; - - int hit; - - if(*num_hits <= max_hits) { - hit = *num_hits - 1; - } - else { - /* reservoir sampling: if we are at the maximum number of - * hits, randomly replace element or skip it */ - hit = lcg_step_uint(lcg_state) % *num_hits; - - if(hit >= max_hits) - return; - } - - /* record intersection */ - Intersection *isect = &isect_array[hit]; - isect->prim = triAddr; - isect->object = object; - isect->u = u; - isect->v = v; - isect->t = t; - } - } - } -} -#endif - -/* BVH intersection function variations */ - -#define BVH_INSTANCING 1 -#define BVH_MOTION 2 -#define BVH_HAIR 4 -#define BVH_HAIR_MINIMUM_WIDTH 8 - -#define BVH_FUNCTION_NAME bvh_intersect -#define BVH_FUNCTION_FEATURES 0 -#include "kernel_bvh_traversal.h" - -#if defined(__INSTANCING__) -#define BVH_FUNCTION_NAME bvh_intersect_instancing -#define BVH_FUNCTION_FEATURES BVH_INSTANCING -#include "kernel_bvh_traversal.h" 
-#endif - -#if defined(__HAIR__) -#define BVH_FUNCTION_NAME bvh_intersect_hair -#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH -#include "kernel_bvh_traversal.h" -#endif - -#if defined(__OBJECT_MOTION__) -#define BVH_FUNCTION_NAME bvh_intersect_motion -#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -#include "kernel_bvh_traversal.h" -#endif - -#if defined(__HAIR__) && defined(__OBJECT_MOTION__) -#define BVH_FUNCTION_NAME bvh_intersect_hair_motion -#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION -#include "kernel_bvh_traversal.h" -#endif - -#if defined(__SUBSURFACE__) -#define BVH_FUNCTION_NAME bvh_intersect_subsurface -#define BVH_FUNCTION_FEATURES 0 -#include "kernel_bvh_subsurface.h" -#endif - -#if defined(__SUBSURFACE__) && defined(__INSTANCING__) -#define BVH_FUNCTION_NAME bvh_intersect_subsurface_instancing -#define BVH_FUNCTION_FEATURES BVH_INSTANCING -#include "kernel_bvh_subsurface.h" -#endif - -#if defined(__SUBSURFACE__) && defined(__HAIR__) -#define BVH_FUNCTION_NAME bvh_intersect_subsurface_hair -#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR -#include "kernel_bvh_subsurface.h" -#endif - -#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__) -#define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion -#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -#include "kernel_bvh_subsurface.h" -#endif - -#if defined(__SUBSURFACE__) && defined(__HAIR__) && defined(__OBJECT_MOTION__) -#define BVH_FUNCTION_NAME bvh_intersect_subsurface_hair_motion -#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION -#include "kernel_bvh_subsurface.h" -#endif - -/* to work around titan bug when using arrays instead of textures */ -#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__) -ccl_device_inline -#else -ccl_device_noinline -#endif -#ifdef __HAIR__ -bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint 
*lcg_state, float difl, float extmax) -#else -bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect) -#endif -{ -#ifdef __OBJECT_MOTION__ - if(kernel_data.bvh.have_motion) { -#ifdef __HAIR__ - if(kernel_data.bvh.have_curves) - return bvh_intersect_hair_motion(kg, ray, isect, visibility, lcg_state, difl, extmax); -#endif /* __HAIR__ */ - - return bvh_intersect_motion(kg, ray, isect, visibility); - } -#endif /* __OBJECT_MOTION__ */ - -#ifdef __HAIR__ - if(kernel_data.bvh.have_curves) - return bvh_intersect_hair(kg, ray, isect, visibility, lcg_state, difl, extmax); -#endif /* __HAIR__ */ - -#ifdef __KERNEL_CPU__ - -#ifdef __INSTANCING__ - if(kernel_data.bvh.have_instancing) - return bvh_intersect_instancing(kg, ray, isect, visibility); -#endif /* __INSTANCING__ */ - - return bvh_intersect(kg, ray, isect, visibility); -#else /* __KERNEL_CPU__ */ - -#ifdef __INSTANCING__ - return bvh_intersect_instancing(kg, ray, isect, visibility); -#else - return bvh_intersect(kg, ray, isect, visibility); -#endif /* __INSTANCING__ */ - -#endif /* __KERNEL_CPU__ */ -} - -/* to work around titan bug when using arrays instead of textures */ -#ifdef __SUBSURFACE__ -#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__) -ccl_device_inline -#else -ccl_device_noinline -#endif -uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits) -{ -#ifdef __OBJECT_MOTION__ - if(kernel_data.bvh.have_motion) { -#ifdef __HAIR__ - if(kernel_data.bvh.have_curves) - return bvh_intersect_subsurface_hair_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits); -#endif /* __HAIR__ */ - - return bvh_intersect_subsurface_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits); - } -#endif /* __OBJECT_MOTION__ */ - -#ifdef __HAIR__ - if(kernel_data.bvh.have_curves) - return bvh_intersect_subsurface_hair(kg, ray, isect, subsurface_object, lcg_state, max_hits); 
-#endif /* __HAIR__ */ - -#ifdef __KERNEL_CPU__ - -#ifdef __INSTANCING__ - if(kernel_data.bvh.have_instancing) - return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits); -#endif /* __INSTANCING__ */ - - return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits); -#else /* __KERNEL_CPU__ */ - -#ifdef __INSTANCING__ - return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits); -#else - return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits); -#endif /* __INSTANCING__ */ - -#endif /* __KERNEL_CPU__ */ -} -#endif - -/* Ray offset to avoid self intersection */ - -ccl_device_inline float3 ray_offset(float3 P, float3 Ng) -{ -#ifdef __INTERSECTION_REFINE__ - const float epsilon_f = 1e-5f; - /* ideally this should match epsilon_f, but instancing/mblur - * precision makes it problematic */ - const float epsilon_test = 1.0f; - const int epsilon_i = 32; - - float3 res; - - /* x component */ - if(fabsf(P.x) < epsilon_test) { - res.x = P.x + Ng.x*epsilon_f; - } - else { - uint ix = __float_as_uint(P.x); - ix += ((ix ^ __float_as_uint(Ng.x)) >> 31)? -epsilon_i: epsilon_i; - res.x = __uint_as_float(ix); - } - - /* y component */ - if(fabsf(P.y) < epsilon_test) { - res.y = P.y + Ng.y*epsilon_f; - } - else { - uint iy = __float_as_uint(P.y); - iy += ((iy ^ __float_as_uint(Ng.y)) >> 31)? -epsilon_i: epsilon_i; - res.y = __uint_as_float(iy); - } - - /* z component */ - if(fabsf(P.z) < epsilon_test) { - res.z = P.z + Ng.z*epsilon_f; - } - else { - uint iz = __float_as_uint(P.z); - iz += ((iz ^ __float_as_uint(Ng.z)) >> 31)? -epsilon_i: epsilon_i; - res.z = __uint_as_float(iz); - } - - return res; -#else - const float epsilon_f = 1e-4f; - return P + epsilon_f*Ng; -#endif -} - -/* Refine triangle intersection to more precise hit point. 
For rays that travel - * far the precision is often not so good, this reintersects the primitive from - * a closer distance. */ - -ccl_device_inline float3 bvh_triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray) -{ - float3 P = ray->P; - float3 D = ray->D; - float t = isect->t; - -#ifdef __INTERSECTION_REFINE__ - if(isect->object != ~0) { -#ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -#else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -#endif - - P = transform_point(&tfm, P); - D = transform_direction(&tfm, D*t); - D = normalize_len(D, &t); - } - - P = P + D*t; - - float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0); - float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z; - float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z); - float rt = Oz * invDz; - - P = P + D*rt; - - if(isect->object != ~0) { -#ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -#else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -#endif - - P = transform_point(&tfm, P); - } - - return P; -#else - return P + D*t; -#endif -} - -/* same as above, except that isect->t is assumed to be in object space for instancing */ -ccl_device_inline float3 bvh_triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray) -{ - float3 P = ray->P; - float3 D = ray->D; - float t = isect->t; - -#ifdef __INTERSECTION_REFINE__ - if(isect->object != ~0) { -#ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -#else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -#endif - - P = transform_point(&tfm, P); - D = transform_direction(&tfm, D); - D = normalize(D); - } - - P = P + D*t; - - float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0); - float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z; - float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z); - float rt = Oz * 
invDz; - - P = P + D*rt; - - if(isect->object != ~0) { -#ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -#else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -#endif - - P = transform_point(&tfm, P); - } - - return P; -#else - return P + D*t; -#endif -} - -#ifdef __HAIR__ - -ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3) -{ - float fc = 0.71f; - float data[4]; - float t2 = t * t; - data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc; - data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t; - data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc; - data[3] = 3.0f * fc * t2 - 2.0f * fc * t; - return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; -} - -ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3) -{ - float data[4]; - float fc = 0.71f; - float t2 = t * t; - float t3 = t2 * t; - data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t; - data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f; - data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t; - data[3] = fc * t3 - fc * t2; - return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; -} - -ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray) -{ - int flag = kernel_data.curve.curveflags; - float t = isect->t; - float3 P = ray->P; - float3 D = ray->D; - - if(isect->object != ~0) { -#ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -#else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -#endif - - P = transform_point(&tfm, P); - D = transform_direction(&tfm, D*t); - D = normalize_len(D, &t); - } - - int prim = kernel_tex_fetch(__prim_index, isect->prim); - float4 v00 = kernel_tex_fetch(__curves, prim); - - int k0 = __float_as_int(v00.x) + isect->segment; - int k1 = k0 + 1; - - float4 P1 = kernel_tex_fetch(__curve_keys, k0); - float4 P2 = 
kernel_tex_fetch(__curve_keys, k1); - float l = 1.0f; - float3 tg = normalize_len(float4_to_float3(P2 - P1), &l); - float r1 = P1.w; - float r2 = P2.w; - float gd = ((r2 - r1)/l); - - P = P + D*t; - - if(flag & CURVE_KN_INTERPOLATE) { - int ka = max(k0 - 1,__float_as_int(v00.x)); - int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); - - float4 P0 = kernel_tex_fetch(__curve_keys, ka); - float4 P3 = kernel_tex_fetch(__curve_keys, kb); - - float3 p[4]; - p[0] = float4_to_float3(P0); - p[1] = float4_to_float3(P1); - p[2] = float4_to_float3(P2); - p[3] = float4_to_float3(P3); - -#ifdef __UV__ - sd->u = isect->u; - sd->v = 0.0f; -#endif - - tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); - - if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) - sd->Ng = normalize(-(D - tg * (dot(tg, D)))); - else { - float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); - sd->Ng = normalize(P - p_curr); - sd->Ng = sd->Ng - gd * tg; - sd->Ng = normalize(sd->Ng); - } - sd->N = sd->Ng; - } - else { - float3 dif = P - float4_to_float3(P1); - -#ifdef __UV__ - sd->u = dot(dif,tg)/l; - sd->v = 0.0f; -#endif - - if (flag & CURVE_KN_TRUETANGENTGNORMAL) { - sd->Ng = -(D - tg * dot(tg, D)); - sd->Ng = normalize(sd->Ng); - } - else { - sd->Ng = (dif - tg * sd->u * l) / (P1.w + sd->u * l * gd); - if (gd != 0.0f) { - sd->Ng = sd->Ng - gd * tg ; - sd->Ng = normalize(sd->Ng); - } - } - - sd->N = sd->Ng; - } - -#ifdef __DPDU__ - /* dPdu/dPdv */ - sd->dPdu = tg; - sd->dPdv = cross(tg, sd->Ng); -#endif - - /*add fading parameter for minimum pixel width with transparency bsdf*/ - /*sd->curve_transparency = isect->v;*/ - /*sd->curve_radius = sd->u * gd * l + r1;*/ - - if(isect->object != ~0) { -#ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -#else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -#endif - - P = transform_point(&tfm, P); - } - - return P; -} -#endif - -CCL_NAMESPACE_END - diff --git 
a/intern/cycles/kernel/kernel_bvh_subsurface.h b/intern/cycles/kernel/kernel_bvh_subsurface.h deleted file mode 100644 index 40683a2da57..00000000000 --- a/intern/cycles/kernel/kernel_bvh_subsurface.h +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Adapted from code Copyright 2009-2010 NVIDIA Corporation, - * and code copyright 2009-2012 Intel Corporation - * - * Modifications Copyright 2011-2013, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* This is a template BVH traversal function for subsurface scattering, where - * various features can be enabled/disabled. This way we can compile optimized - * versions for each case without new features slowing things down. 
- * - * BVH_INSTANCING: object instancing - * BVH_MOTION: motion blur rendering - * - */ - -#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0) - -ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, - int subsurface_object, uint *lcg_state, int max_hits) -{ - /* todo: - * - test if pushing distance on the stack helps (for non shadow rays) - * - separate version for shadow rays - * - likely and unlikely for if() statements - * - SSE for hair - * - test restrict attribute for pointers - */ - - /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; - - /* traversal variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; - - /* ray parameters in registers */ - const float tmax = ray->t; - float3 P = ray->P; - float3 idir = bvh_inverse_direction(ray->D); - int object = ~0; - float isect_t = tmax; - - const uint visibility = ~0; - uint num_hits = 0; - -#if FEATURE(BVH_MOTION) - Transform ob_tfm; -#endif - -#if defined(__KERNEL_SSE2__) - const shuffle_swap_t shuf_identity = shuffle_swap_identity(); - const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - - const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0)); - __m128 Psplat[3], idirsplat[3]; - shuffle_swap_t shufflexyz[3]; - - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); - - __m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -#endif - - /* traversal loop */ - do { - do - { - /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) - { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; - -#if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect_t; - - /* fetch node data */ - float4 node0 = 
kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ -#ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? 
*/ - traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility); -#else - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); -#endif - -#else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]); - const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]); - const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]); - - const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn); - const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax)); - - /* decide which nodes to traverse next */ -#ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? 
*/ - traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); -#else - traverseChild0 = (_mm_movemask_ps(lrhit) & 1); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2); -#endif -#endif // __KERNEL_SSE2__ - - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); - - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - union { __m128 m128; float v[4]; } uminmax; - uminmax.m128 = tminmax; - bool closestChild1 = uminmax.v[1] < uminmax.v[0]; -#endif - - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; - } - - ++stackPtr; - traversalStack[stackPtr] = nodeAddrChild1; - } - else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; - } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; - } - } - } - - /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1)); - int primAddr = __float_as_int(leaf.x); - -#if FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { -#endif - int primAddr2 = __float_as_int(leaf.y); - - /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; - - /* primitive intersection */ - for(; primAddr < primAddr2; primAddr++) { -#if FEATURE(BVH_HAIR) - uint segment = kernel_tex_fetch(__prim_segment, primAddr); - if(segment != ~0) - continue; -#endif - - /* only primitives from the same object */ - uint tri_object = (object == ~0)? 
kernel_tex_fetch(__prim_object, primAddr): object; - - if(tri_object == subsurface_object) { - - /* intersect ray against primitive */ - bvh_triangle_intersect_subsurface(kg, isect_array, P, idir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits); - } - } - } -#if FEATURE(BVH_INSTANCING) - else { - /* instance push */ - if(subsurface_object == kernel_tex_fetch(__prim_object, -primAddr-1)) { - object = subsurface_object; - -#if FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &idir, &isect_t, &ob_tfm, tmax); -#else - bvh_instance_push(kg, object, ray, &P, &idir, &isect_t, tmax); -#endif - -#if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); - - tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -#endif - - ++stackPtr; - traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; - - nodeAddr = kernel_tex_fetch(__object_node, object); - } - else { - /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; - } - } - } -#endif - } while(nodeAddr != ENTRYPOINT_SENTINEL); - -#if FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { - kernel_assert(object != ~0); - - /* instance pop */ -#if FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &idir, &isect_t, &ob_tfm, tmax); -#else - bvh_instance_pop(kg, object, ray, &P, &idir, &isect_t, tmax); -#endif - -#if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); - - tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -#endif - - object = ~0; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; - } -#endif - } while(nodeAddr != ENTRYPOINT_SENTINEL); - - return num_hits; -} - -#undef FEATURE -#undef BVH_FUNCTION_NAME -#undef BVH_FUNCTION_FEATURES - diff --git 
a/intern/cycles/kernel/kernel_bvh_traversal.h b/intern/cycles/kernel/kernel_bvh_traversal.h deleted file mode 100644 index 0515a9e0fa7..00000000000 --- a/intern/cycles/kernel/kernel_bvh_traversal.h +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Adapted from code Copyright 2009-2010 NVIDIA Corporation, - * and code copyright 2009-2012 Intel Corporation - * - * Modifications Copyright 2011-2013, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* This is a template BVH traversal function, where various features can be - * enabled/disabled. This way we can compile optimized versions for each case - * without new features slowing things down. 
- * - * BVH_INSTANCING: object instancing - * BVH_HAIR: hair curve rendering - * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width - * BVH_MOTION: motion blur rendering - * - */ - -#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0) - -ccl_device bool BVH_FUNCTION_NAME -(KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility -#if FEATURE(BVH_HAIR_MINIMUM_WIDTH) -, uint *lcg_state, float difl, float extmax -#endif -) -{ - /* todo: - * - test if pushing distance on the stack helps (for non shadow rays) - * - separate version for shadow rays - * - likely and unlikely for if() statements - * - SSE for hair - * - test restrict attribute for pointers - */ - - /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; - - /* traversal variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; - - /* ray parameters in registers */ - const float tmax = ray->t; - float3 P = ray->P; - float3 idir = bvh_inverse_direction(ray->D); - int object = ~0; - -#if FEATURE(BVH_MOTION) - Transform ob_tfm; -#endif - - isect->t = tmax; - isect->object = ~0; - isect->prim = ~0; - isect->u = 0.0f; - isect->v = 0.0f; - -#if defined(__KERNEL_SSE2__) - const shuffle_swap_t shuf_identity = shuffle_swap_identity(); - const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - - const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0)); - __m128 Psplat[3], idirsplat[3]; - shuffle_swap_t shufflexyz[3]; - - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); - - __m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -#endif - - /* traversal loop */ - do { - do - { - /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) - { - bool traverseChild0, traverseChild1; - int 
nodeAddrChild1; - -#if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect->t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - -#if FEATURE(BVH_HAIR_MINIMUM_WIDTH) - if(difl != 0.0f) { - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.w) & 
PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } - } -#endif - - /* decide which nodes to traverse next */ -#ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility); -#else - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); -#endif - -#else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]); - const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]); - const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]); - - /* calculate { c0min, c1min, -c0max, -c1max} */ - __m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)); - const __m128 tminmax = _mm_xor_ps(minmax, pn); - -#if FEATURE(BVH_HAIR_MINIMUM_WIDTH) - if(difl != 0.0f) { - float4 *tminmaxview = (float4*)&tminmax; - float &c0min = tminmaxview->x, &c1min = tminmaxview->y; - float &c0max = tminmaxview->z, &c1max = tminmaxview->w; - - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } - } -#endif - - const __m128 lrhit = 
_mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax)); - - /* decide which nodes to traverse next */ -#ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); -#else - traverseChild0 = (_mm_movemask_ps(lrhit) & 1); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2); -#endif -#endif // __KERNEL_SSE2__ - - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); - - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - union { __m128 m128; float v[4]; } uminmax; - uminmax.m128 = tminmax; - bool closestChild1 = uminmax.v[1] < uminmax.v[0]; -#endif - - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; - } - - ++stackPtr; - traversalStack[stackPtr] = nodeAddrChild1; - } - else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; - } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; - } - } - } - - /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1)); - int primAddr = __float_as_int(leaf.x); - -#if FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { -#endif - int primAddr2 = __float_as_int(leaf.y); - - /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; - - /* primitive intersection */ - while(primAddr < primAddr2) { - bool hit; - - /* intersect ray against primitive */ -#if FEATURE(BVH_HAIR) - uint segment = kernel_tex_fetch(__prim_segment, primAddr); - if(segment != ~0) { - - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) -#if 
FEATURE(BVH_HAIR_MINIMUM_WIDTH) - hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax); - else - hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax); -#else - hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment); - else - hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment); -#endif - } - else -#endif - hit = bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr); - - /* shadow ray early termination */ -#if defined(__KERNEL_SSE2__) - if(hit) { - if(visibility == PATH_RAY_SHADOW_OPAQUE) - return true; - - tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); - } -#else - if(hit && visibility == PATH_RAY_SHADOW_OPAQUE) - return true; -#endif - - primAddr++; - } - } -#if FEATURE(BVH_INSTANCING) - else { - /* instance push */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); - -#if FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &idir, &isect->t, &ob_tfm, tmax); -#else - bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax); -#endif - -#if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); - - tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -#endif - - ++stackPtr; - traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; - - nodeAddr = kernel_tex_fetch(__object_node, object); - } - } -#endif - } while(nodeAddr != ENTRYPOINT_SENTINEL); - -#if FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { - kernel_assert(object != ~0); - - /* instance pop */ -#if FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &idir, &isect->t, &ob_tfm, tmax); -#else - bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax); -#endif - -#if defined(__KERNEL_SSE2__) - Psplat[0] = 
_mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); - - tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -#endif - - object = ~0; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; - } -#endif - } while(nodeAddr != ENTRYPOINT_SENTINEL); - - return (isect->prim != ~0); -} - -#undef FEATURE -#undef BVH_FUNCTION_NAME -#undef BVH_FUNCTION_FEATURES - diff --git a/intern/cycles/kernel/kernel_curve.h b/intern/cycles/kernel/kernel_curve.h deleted file mode 100644 index 821ac50eaa9..00000000000 --- a/intern/cycles/kernel/kernel_curve.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License - */ - -CCL_NAMESPACE_BEGIN - -#ifdef __HAIR__ - -/* curve attributes */ - -ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy) -{ - if(elem == ATTR_ELEMENT_CURVE) { -#ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = 0.0f; - if(dy) *dy = 0.0f; -#endif - - return kernel_tex_fetch(__attributes_float, offset + sd->prim); - } - else if(elem == ATTR_ELEMENT_CURVE_KEY) { - float4 curvedata = kernel_tex_fetch(__curves, sd->prim); - int k0 = __float_as_int(curvedata.x) + sd->segment; - int k1 = k0 + 1; - - float f0 = kernel_tex_fetch(__attributes_float, offset + k0); - float f1 = kernel_tex_fetch(__attributes_float, offset + k1); - -#ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*(f1 - f0); - if(dy) *dy = 0.0f; -#endif - - return (1.0f - sd->u)*f0 + sd->u*f1; - } - else { -#ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = 0.0f; - if(dy) *dy = 0.0f; -#endif - - return 0.0f; - } -} - -ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy) -{ - if(elem == ATTR_ELEMENT_CURVE) { - /* idea: we can't derive any useful differentials here, but for tiled - * mipmap image caching it would be useful to avoid reading the highest - * detail level always. maybe a derivative based on the hair density - * could be computed somehow? 
*/ -#ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); - if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); -#endif - - return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim)); - } - else if(elem == ATTR_ELEMENT_CURVE_KEY) { - float4 curvedata = kernel_tex_fetch(__curves, sd->prim); - int k0 = __float_as_int(curvedata.x) + sd->segment; - int k1 = k0 + 1; - - float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k0)); - float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k1)); - -#ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*(f1 - f0); - if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); -#endif - - return (1.0f - sd->u)*f0 + sd->u*f1; - } - else { -#ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); - if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); -#endif - - return make_float3(0.0f, 0.0f, 0.0f); - } -} - -/* hair info node functions */ - -ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) -{ - float r = 0.0f; - - if(sd->segment != ~0) { - float4 curvedata = kernel_tex_fetch(__curves, sd->prim); - int k0 = __float_as_int(curvedata.x) + sd->segment; - int k1 = k0 + 1; - - float4 P1 = kernel_tex_fetch(__curve_keys, k0); - float4 P2 = kernel_tex_fetch(__curve_keys, k1); - r = (P2.w - P1.w) * sd->u + P1.w; - } - - return r*2.0f; -} - -ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd) -{ - float3 tgN = make_float3(0.0f,0.0f,0.0f); - - if(sd->segment != ~0) { - - tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu))); - tgN = normalize(tgN); - - /* need to find suitable scaled gd for corrected normal */ -#if 0 - tgN = normalize(tgN - gd * sd->dPdu); -#endif - } - - return tgN; -} - -#endif - -CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/kernel_object.h b/intern/cycles/kernel/kernel_object.h deleted file mode 100644 index a66277e10cd..00000000000 --- a/intern/cycles/kernel/kernel_object.h +++ 
/dev/null @@ -1,300 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License - */ - -CCL_NAMESPACE_BEGIN - -enum ObjectTransform { - OBJECT_TRANSFORM = 0, - OBJECT_TRANSFORM_MOTION_PRE = 0, - OBJECT_INVERSE_TRANSFORM = 4, - OBJECT_TRANSFORM_MOTION_POST = 4, - OBJECT_PROPERTIES = 8, - OBJECT_DUPLI = 9 -}; - -enum ObjectVectorTransform { - OBJECT_VECTOR_MOTION_PRE = 0, - OBJECT_VECTOR_MOTION_POST = 3 -}; - -ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, int object, enum ObjectTransform type) -{ - int offset = object*OBJECT_SIZE + (int)type; - - Transform tfm; - tfm.x = kernel_tex_fetch(__objects, offset + 0); - tfm.y = kernel_tex_fetch(__objects, offset + 1); - tfm.z = kernel_tex_fetch(__objects, offset + 2); - tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); - - return tfm; -} - -ccl_device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int object, enum ObjectVectorTransform type) -{ - int offset = object*OBJECT_VECTOR_SIZE + (int)type; - - Transform tfm; - tfm.x = kernel_tex_fetch(__objects_vector, offset + 0); - tfm.y = kernel_tex_fetch(__objects_vector, offset + 1); - tfm.z = kernel_tex_fetch(__objects_vector, offset + 2); - tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); - - return tfm; -} - -#ifdef __OBJECT_MOTION__ -ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int object, float time) -{ - DecompMotionTransform motion; - - int offset = 
object*OBJECT_SIZE + (int)OBJECT_TRANSFORM_MOTION_PRE; - - motion.mid.x = kernel_tex_fetch(__objects, offset + 0); - motion.mid.y = kernel_tex_fetch(__objects, offset + 1); - motion.mid.z = kernel_tex_fetch(__objects, offset + 2); - motion.mid.w = kernel_tex_fetch(__objects, offset + 3); - - motion.pre_x = kernel_tex_fetch(__objects, offset + 4); - motion.pre_y = kernel_tex_fetch(__objects, offset + 5); - motion.post_x = kernel_tex_fetch(__objects, offset + 6); - motion.post_y = kernel_tex_fetch(__objects, offset + 7); - - Transform tfm; - transform_motion_interpolate(&tfm, &motion, time); - - return tfm; -} - -ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg, int object, float time, Transform *itfm) -{ - int object_flag = kernel_tex_fetch(__object_flag, object); - - if(object_flag & SD_OBJECT_MOTION) { - /* if we do motion blur */ - Transform tfm = object_fetch_transform_motion(kg, object, time); - - if(itfm) - *itfm = transform_quick_inverse(tfm); - - return tfm; - } - else { - Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); - if(itfm) - *itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); - - return tfm; - } -} -#endif - -ccl_device_inline void object_position_transform(KernelGlobals *kg, ShaderData *sd, float3 *P) -{ -#ifdef __OBJECT_MOTION__ - *P = transform_point(&sd->ob_tfm, *P); -#else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); - *P = transform_point(&tfm, *P); -#endif -} - -ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, ShaderData *sd, float3 *P) -{ -#ifdef __OBJECT_MOTION__ - *P = transform_point(&sd->ob_itfm, *P); -#else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); - *P = transform_point(&tfm, *P); -#endif -} - -ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, ShaderData *sd, float3 *N) -{ -#ifdef __OBJECT_MOTION__ - *N = 
normalize(transform_direction_transposed(&sd->ob_tfm, *N)); -#else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); - *N = normalize(transform_direction_transposed(&tfm, *N)); -#endif -} - -ccl_device_inline void object_normal_transform(KernelGlobals *kg, ShaderData *sd, float3 *N) -{ -#ifdef __OBJECT_MOTION__ - *N = normalize(transform_direction_transposed(&sd->ob_itfm, *N)); -#else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); - *N = normalize(transform_direction_transposed(&tfm, *N)); -#endif -} - -ccl_device_inline void object_dir_transform(KernelGlobals *kg, ShaderData *sd, float3 *D) -{ -#ifdef __OBJECT_MOTION__ - *D = transform_direction(&sd->ob_tfm, *D); -#else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); - *D = transform_direction(&tfm, *D); -#endif -} - -ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, ShaderData *sd, float3 *D) -{ -#ifdef __OBJECT_MOTION__ - *D = transform_direction(&sd->ob_itfm, *D); -#else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); - *D = transform_direction(&tfm, *D); -#endif -} - -ccl_device_inline float3 object_location(KernelGlobals *kg, ShaderData *sd) -{ - if(sd->object == ~0) - return make_float3(0.0f, 0.0f, 0.0f); - -#ifdef __OBJECT_MOTION__ - return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w); -#else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); - return make_float3(tfm.x.w, tfm.y.w, tfm.z.w); -#endif -} - -ccl_device_inline float object_surface_area(KernelGlobals *kg, int object) -{ - int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES; - float4 f = kernel_tex_fetch(__objects, offset); - return f.x; -} - -ccl_device_inline float object_pass_id(KernelGlobals *kg, int object) -{ - if(object == ~0) - return 0.0f; - - int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES; - float4 f = kernel_tex_fetch(__objects, offset); - return 
f.y; -} - -ccl_device_inline float object_random_number(KernelGlobals *kg, int object) -{ - if(object == ~0) - return 0.0f; - - int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES; - float4 f = kernel_tex_fetch(__objects, offset); - return f.z; -} - -ccl_device_inline uint object_particle_id(KernelGlobals *kg, int object) -{ - if(object == ~0) - return 0.0f; - - int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES; - float4 f = kernel_tex_fetch(__objects, offset); - return __float_as_uint(f.w); -} - -ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object) -{ - if(object == ~0) - return make_float3(0.0f, 0.0f, 0.0f); - - int offset = object*OBJECT_SIZE + OBJECT_DUPLI; - float4 f = kernel_tex_fetch(__objects, offset); - return make_float3(f.x, f.y, f.z); -} - -ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object) -{ - if(object == ~0) - return make_float3(0.0f, 0.0f, 0.0f); - - int offset = object*OBJECT_SIZE + OBJECT_DUPLI; - float4 f = kernel_tex_fetch(__objects, offset + 1); - return make_float3(f.x, f.y, 0.0f); -} - - -ccl_device int shader_pass_id(KernelGlobals *kg, ShaderData *sd) -{ - return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2 + 1); -} - -ccl_device_inline float particle_index(KernelGlobals *kg, int particle) -{ - int offset = particle*PARTICLE_SIZE; - float4 f = kernel_tex_fetch(__particles, offset + 0); - return f.x; -} - -ccl_device float particle_age(KernelGlobals *kg, int particle) -{ - int offset = particle*PARTICLE_SIZE; - float4 f = kernel_tex_fetch(__particles, offset + 0); - return f.y; -} - -ccl_device float particle_lifetime(KernelGlobals *kg, int particle) -{ - int offset = particle*PARTICLE_SIZE; - float4 f = kernel_tex_fetch(__particles, offset + 0); - return f.z; -} - -ccl_device float particle_size(KernelGlobals *kg, int particle) -{ - int offset = particle*PARTICLE_SIZE; - float4 f = kernel_tex_fetch(__particles, offset + 0); - return f.w; -} - -ccl_device float4 
particle_rotation(KernelGlobals *kg, int particle) -{ - int offset = particle*PARTICLE_SIZE; - float4 f = kernel_tex_fetch(__particles, offset + 1); - return f; -} - -ccl_device float3 particle_location(KernelGlobals *kg, int particle) -{ - int offset = particle*PARTICLE_SIZE; - float4 f = kernel_tex_fetch(__particles, offset + 2); - return make_float3(f.x, f.y, f.z); -} - -ccl_device float3 particle_velocity(KernelGlobals *kg, int particle) -{ - int offset = particle*PARTICLE_SIZE; - float4 f2 = kernel_tex_fetch(__particles, offset + 2); - float4 f3 = kernel_tex_fetch(__particles, offset + 3); - return make_float3(f2.w, f3.x, f3.y); -} - -ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle) -{ - int offset = particle*PARTICLE_SIZE; - float4 f3 = kernel_tex_fetch(__particles, offset + 3); - float4 f4 = kernel_tex_fetch(__particles, offset + 4); - return make_float3(f3.z, f3.w, f4.x); -} - -CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 81b61a54a6a..9b3ddbb7557 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -18,16 +18,15 @@ #include "osl_shader.h" #endif +#include "kernel_random.h" + +#include "geom/geom_bvh.h" + #include "kernel_differential.h" #include "kernel_montecarlo.h" #include "kernel_projection.h" -#include "kernel_object.h" -#include "kernel_triangle.h" -#include "kernel_curve.h" #include "kernel_primitive.h" #include "kernel_projection.h" -#include "kernel_random.h" -#include "kernel_bvh.h" #include "kernel_accumulate.h" #include "kernel_camera.h" #include "kernel_shader.h" diff --git a/intern/cycles/kernel/kernel_triangle.h b/intern/cycles/kernel/kernel_triangle.h deleted file mode 100644 index 0455df85961..00000000000 --- a/intern/cycles/kernel/kernel_triangle.h +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may 
not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License - */ - -CCL_NAMESPACE_BEGIN - -/* Point on triangle for Moller-Trumbore triangles */ -ccl_device_inline float3 triangle_point_MT(KernelGlobals *kg, int tri_index, float u, float v) -{ - /* load triangle vertices */ - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index)); - - float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); - - /* compute point */ - float t = 1.0f - u - v; - return (u*v0 + v*v1 + t*v2); -} - -/* Normal for Moller-Trumbore triangles */ -ccl_device_inline float3 triangle_normal_MT(KernelGlobals *kg, int tri_index, int *shader) -{ -#if 0 - /* load triangle vertices */ - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index)); - - float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); - - /* compute normal */ - return normalize(cross(v2 - v0, v1 - v0)); -#else - float4 Nm = kernel_tex_fetch(__tri_normal, tri_index); - *shader = __float_as_int(Nm.w); - return make_float3(Nm.x, Nm.y, Nm.z); -#endif -} - -/* Return 3 triangle vertex locations */ -ccl_device_inline void 
triangle_vertices(KernelGlobals *kg, int tri_index, float3 P[3]) -{ - /* load triangle vertices */ - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index)); - - P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); -} - -ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int tri_index, float u, float v) -{ - /* load triangle vertices */ - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index)); - - float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x))); - float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y))); - float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.z))); - - return normalize((1.0f - u - v)*n2 + u*n0 + v*n1); -} - -ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, float3 *dPdu, float3 *dPdv, int tri) -{ - /* fetch triangle vertex coordinates */ - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri)); - - float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - float3 p2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); - - /* compute derivatives of P w.r.t. 
uv */ - *dPdu = (p0 - p2); - *dPdv = (p1 - p2); -} - -/* attributes */ - -ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy) -{ - if(elem == ATTR_ELEMENT_FACE) { - if(dx) *dx = 0.0f; - if(dy) *dy = 0.0f; - - return kernel_tex_fetch(__attributes_float, offset + sd->prim); - } - else if(elem == ATTR_ELEMENT_VERTEX) { - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim)); - - float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x)); - float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y)); - float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z)); - -#ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; - if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; -#endif - - return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; - } - else if(elem == ATTR_ELEMENT_CORNER) { - int tri = offset + sd->prim*3; - float f0 = kernel_tex_fetch(__attributes_float, tri + 0); - float f1 = kernel_tex_fetch(__attributes_float, tri + 1); - float f2 = kernel_tex_fetch(__attributes_float, tri + 2); - -#ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; - if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; -#endif - - return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; - } - else { - if(dx) *dx = 0.0f; - if(dy) *dy = 0.0f; - - return 0.0f; - } -} - -ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy) -{ - if(elem == ATTR_ELEMENT_FACE) { - if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); - if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); - - return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim)); - } - else if(elem == ATTR_ELEMENT_VERTEX) { - 
float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim)); - - float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x))); - float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y))); - float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z))); - -#ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; - if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; -#endif - - return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; - } - else if(elem == ATTR_ELEMENT_CORNER) { - int tri = offset + sd->prim*3; - float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0)); - float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1)); - float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2)); - -#ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; - if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; -#endif - - return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; - } - else { - if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); - if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); - - return make_float3(0.0f, 0.0f, 0.0f); - } -} - -CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 58858c3766e..a25d2fe03b5 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -30,14 +30,13 @@ #include "kernel_compat_cpu.h" #include "kernel_globals.h" +#include "kernel_random.h" + +#include "geom/geom_bvh.h" + #include "kernel_montecarlo.h" #include "kernel_projection.h" #include "kernel_differential.h" -#include "kernel_object.h" -#include "kernel_random.h" -#include "kernel_bvh.h" -#include "kernel_triangle.h" -#include 
"kernel_curve.h" #include "kernel_primitive.h" #include "kernel_projection.h" #include "kernel_accumulate.h" diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 554f647df7c..34d9ebefdb3 100644 --- a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -18,7 +18,8 @@ #include "kernel_montecarlo.h" #include "kernel_types.h" #include "kernel_globals.h" -#include "kernel_object.h" + +#include "geom/geom_object.h" #include "closure/bsdf_diffuse.h" #include "closure/bssrdf.h" -- cgit v1.2.3