diff options
author | Dalai Felinto <dfelinto@gmail.com> | 2016-10-03 23:54:22 +0300 |
---|---|---|
committer | Dalai Felinto <dfelinto@gmail.com> | 2016-10-03 23:54:22 +0300 |
commit | ae44e24fed4d7e83f750af2b62384a039ddb54f2 (patch) | |
tree | 5054e908a187ae1c6e2c5d419eaa9ef09ece22c3 | |
parent | 55aadccbde3a44f9e9a7046f1478e31e2475f60c (diff) | |
parent | b4f9766ed13b00fbcdbd7c6e051f993caa95828c (diff) |
Merge remote-tracking branch 'origin/master' into blender2.8
24 files changed, 97 insertions, 118 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index e061fab6b52..9331418e94e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -404,7 +404,7 @@ option(WITH_CYCLES_STANDALONE_GUI "Build Cycles standalone with GUI" OFF) option(WITH_CYCLES_OSL "Build Cycles with OSL support" ${_init_CYCLES_OSL}) option(WITH_CYCLES_OPENSUBDIV "Build Cycles with OpenSubdiv support" ${_init_CYCLES_OPENSUBDIV}) option(WITH_CYCLES_CUDA_BINARIES "Build Cycles CUDA binaries" OFF) -set(CYCLES_CUDA_BINARIES_ARCH sm_20 sm_21 sm_30 sm_35 sm_37 sm_50 sm_52 CACHE STRING "CUDA architectures to build binaries for") +set(CYCLES_CUDA_BINARIES_ARCH sm_20 sm_21 sm_30 sm_35 sm_37 sm_50 sm_52 sm_60 sm_61 CACHE STRING "CUDA architectures to build binaries for") mark_as_advanced(CYCLES_CUDA_BINARIES_ARCH) unset(PLATFORM_DEFAULT) option(WITH_CYCLES_LOGGING "Build Cycles with logging support" ON) diff --git a/build_files/buildbot/slave_compile.py b/build_files/buildbot/slave_compile.py index b156e4be327..c2bfd882fde 100644 --- a/build_files/buildbot/slave_compile.py +++ b/build_files/buildbot/slave_compile.py @@ -72,8 +72,7 @@ if 'cmake' in builder: # Set up OSX architecture if builder.endswith('x86_64_10_6_cmake'): cmake_extra_options.append('-DCMAKE_OSX_ARCHITECTURES:STRING=x86_64') - cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-hack/bin/nvcc') - cmake_extra_options.append('-DCUDA_NVCC8_EXECUTABLE=/usr/local/cuda8-hack/bin/nvcc') + cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda8-hack/bin/nvcc') cmake_extra_options.append('-DWITH_CODEC_QUICKTIME=OFF') cmake_extra_options.append('-DCMAKE_OSX_DEPLOYMENT_TARGET=10.6') build_cubins = False @@ -94,8 +93,7 @@ if 'cmake' in builder: elif builder.startswith('win32'): bits = 32 cmake_options.extend(['-G', 'Visual Studio 12 2013']) - cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE:FILEPATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v7.5/bin/nvcc.exe') - cmake_extra_options.append('-DCUDA_NVCC8_EXECUTABLE:FILEPATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin/nvcc.exe') + cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE:FILEPATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin/nvcc.exe') elif builder.startswith('linux'): tokens = builder.split("_") @@ -115,8 +113,7 @@ if 'cmake' in builder: cuda_chroot_name = 'buildbot_' + deb_name + '_x86_64' targets = ['player', 'blender', 'cuda'] - cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-7.5/bin/nvcc') - cmake_extra_options.append('-DCUDA_NVCC8_EXECUTABLE=/usr/local/cuda-8.0/bin/nvcc') + cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-8.0/bin/nvcc') cmake_options.append("-C" + os.path.join(blender_dir, cmake_config_file)) diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 06089b4014a..e4341c8aca1 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -215,11 +215,11 @@ if(WITH_CYCLES_CUDA_BINARIES) set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}") # warn for other versions - if(CUDA_VERSION MATCHES "75") + if(CUDA_VERSION MATCHES "80") else() message(WARNING "CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, " - "build may succeed but only CUDA 7.5 is officially supported") + "build may succeed but only CUDA 8.0 is officially supported") endif() # build for each arch @@ -251,11 +251,6 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_nvcc_command ${CUDA_NVCC_EXECUTABLE}) set(cuda_nvcc_version ${CUDA_VERSION}) - if(DEFINED CUDA_NVCC8_EXECUTABLE AND ((${arch} STREQUAL "sm_60") OR (${arch} STREQUAL "sm_61"))) - set(cuda_nvcc_command ${CUDA_NVCC8_EXECUTABLE}) - set(cuda_nvcc_version "80") - endif() - set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}") set(cuda_math_flags "--use_fast_math") diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index 7cecee793c1..36798982653 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -157,8 +157,9 @@ CCL_NAMESPACE_BEGIN #undef BVH_NAME_EVAL #undef BVH_FUNCTION_FULL_NAME +/* Note: ray is passed by value to work around a possible CUDA compiler bug. */ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, - const Ray *ray, + const Ray ray, const uint visibility, Intersection *isect, uint *lcg_state, @@ -169,32 +170,32 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, if(kernel_data.bvh.have_motion) { # ifdef __HAIR__ if(kernel_data.bvh.have_curves) - return bvh_intersect_hair_motion(kg, ray, isect, visibility, lcg_state, difl, extmax); + return bvh_intersect_hair_motion(kg, &ray, isect, visibility, lcg_state, difl, extmax); # endif /* __HAIR__ */ - return bvh_intersect_motion(kg, ray, isect, visibility); + return bvh_intersect_motion(kg, &ray, isect, visibility); } #endif /* __OBJECT_MOTION__ */ #ifdef __HAIR__ if(kernel_data.bvh.have_curves) - return bvh_intersect_hair(kg, ray, isect, visibility, lcg_state, difl, extmax); + return bvh_intersect_hair(kg, &ray, isect, visibility, lcg_state, difl, extmax); #endif /* __HAIR__ */ #ifdef __KERNEL_CPU__ # ifdef __INSTANCING__ if(kernel_data.bvh.have_instancing) - return bvh_intersect_instancing(kg, ray, isect, visibility); + return bvh_intersect_instancing(kg, &ray, isect, visibility); # endif /* __INSTANCING__ */ - return bvh_intersect(kg, ray, isect, visibility); + return bvh_intersect(kg, &ray, isect, visibility); #else /* __KERNEL_CPU__ */ # ifdef __INSTANCING__ - return bvh_intersect_instancing(kg, ray, isect, visibility); + return bvh_intersect_instancing(kg, &ray, isect, visibility); # else - return bvh_intersect(kg, ray, isect, visibility); + return bvh_intersect(kg, &ray, isect, visibility); # endif /* __INSTANCING__ */ #endif /* __KERNEL_CPU__ */ diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h index db2275b0ff8..726bef1794c 100644 --- a/intern/cycles/kernel/bvh/bvh_nodes.h +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -16,7 +16,7 @@ // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and // 3-vector which might be faster. -ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg, +ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg, int node_addr, int child) { @@ -30,7 +30,7 @@ ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg, } #if !defined(__KERNEL_SSE2__) -ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg, +ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, const float3 P, const float3 idir, const float t, @@ -77,7 +77,7 @@ ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg, #endif } -ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, +ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, const float3 P, const float3 idir, const float t, @@ -139,7 +139,7 @@ ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, #endif } -ccl_device_inline bool bvh_unaligned_node_intersect_child( +ccl_device_forceinline bool bvh_unaligned_node_intersect_child( KernelGlobals *kg, const float3 P, const float3 dir, @@ -166,7 +166,7 @@ ccl_device_inline bool bvh_unaligned_node_intersect_child( return tnear <= tfar; } -ccl_device_inline bool bvh_unaligned_node_intersect_child_robust( +ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust( KernelGlobals *kg, const float3 P, const float3 dir, @@ -202,7 +202,7 @@ ccl_device_inline bool bvh_unaligned_node_intersect_child_robust( } } -ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg, +ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, const float3 P, const float3 dir, const float3 idir, @@ -232,7 +232,7 @@ ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg, return mask; } -ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, +ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, const float3 P, const float3 dir, const float3 idir, @@ -264,7 +264,7 @@ ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, return mask; } -ccl_device_inline int bvh_node_intersect(KernelGlobals *kg, +ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, const float3 P, const float3 dir, const float3 idir, @@ -295,7 +295,7 @@ ccl_device_inline int bvh_node_intersect(KernelGlobals *kg, } } -ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg, +ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, const float3 P, const float3 dir, const float3 idir, @@ -333,7 +333,7 @@ ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg, } #else /* !defined(__KERNEL_SSE2__) */ -int ccl_device_inline bvh_aligned_node_intersect( +int ccl_device_forceinline bvh_aligned_node_intersect( KernelGlobals *kg, const float3& P, const float3& dir, @@ -377,7 +377,7 @@ int ccl_device_inline bvh_aligned_node_intersect( # endif } -int ccl_device_inline bvh_aligned_node_intersect_robust( +ccl_device_forceinline int bvh_aligned_node_intersect_robust( KernelGlobals *kg, const float3& P, const float3& dir, @@ -441,7 +441,7 @@ int ccl_device_inline bvh_aligned_node_intersect_robust( # endif } -int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg, +ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, const float3 P, const float3 dir, const ssef& isect_near, @@ -502,7 +502,7 @@ int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg, # endif } -int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg, +ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, const float3 P, const float3 dir, const ssef& isect_near, @@ -573,7 +573,7 @@ int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg, # endif } -ccl_device_inline int bvh_node_intersect(KernelGlobals *kg, +ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, const float3& P, const float3& dir, const ssef& isect_near, @@ -611,7 +611,7 @@ ccl_device_inline int bvh_node_intersect(KernelGlobals *kg, } } -ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg, +ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, const float3& P, const float3& dir, const ssef& isect_near, diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index b1a52968a26..a0e478e972b 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -40,21 +40,16 @@ * */ -#ifndef __KERNEL_GPU__ -ccl_device -#else -ccl_device_inline -#endif -bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect, - const uint visibility +ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - , uint *lcg_state, - float difl, - float extmax + , uint *lcg_state, + float difl, + float extmax #endif - ) + ) { /* todo: * - test if pushing distance on the stack helps (for non shadow rays) diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h index 27729046f8d..c3abe2e157d 100644 --- a/intern/cycles/kernel/bvh/bvh_types.h +++ b/intern/cycles/kernel/bvh/bvh_types.h @@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN /* Don't inline intersect functions on GPU, this is faster */ #ifdef __KERNEL_GPU__ -# define ccl_device_intersect ccl_device_noinline +# define ccl_device_intersect ccl_device_forceinline #else # define ccl_device_intersect ccl_device_inline #endif diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 2ab92badc93..7e4d5fe2e37 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -36,7 +36,7 @@ CCL_NAMESPACE_BEGIN -ccl_device_inline int bsdf_sample(KernelGlobals *kg, +ccl_device_forceinline int bsdf_sample(KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float randu, @@ -147,7 +147,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, #ifndef __KERNEL_CUDA__ ccl_device #else -ccl_device_inline +ccl_device_forceinline #endif float3 bsdf_eval(KernelGlobals *kg, ShaderData *sd, diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h index 9929246ae5c..1cd8246aa71 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h @@ -62,7 +62,7 @@ ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float rough return 2.0f / (roughness*roughness) - 2.0f; } -ccl_device_inline float3 bsdf_ashikhmin_shirley_eval_reflect( +ccl_device_forceinline float3 bsdf_ashikhmin_shirley_eval_reflect( const ShaderClosure *sc, const float3 I, const float3 omega_in, diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index 7c36f05b6cc..0a8d14a00c2 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -183,7 +183,7 @@ ccl_device_inline void microfacet_ggx_sample_slopes( *slope_y = S * z * safe_sqrtf(1.0f + (*slope_x)*(*slope_x)); } -ccl_device_inline float3 microfacet_sample_stretched( +ccl_device_forceinline float3 microfacet_sample_stretched( KernelGlobals *kg, const float3 omega_i, const float alpha_x, const float alpha_y, const float randu, const float randv, diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h index 0a6dd4dcbdf..cea59adfebe 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h @@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN /* === GGX Microfacet distribution functions === */ /* Isotropic GGX microfacet distribution */ -ccl_device_inline float D_ggx(float3 wm, float alpha) +ccl_device_forceinline float D_ggx(float3 wm, float alpha) { wm.z *= wm.z; alpha *= alpha; @@ -30,7 +30,7 @@ ccl_device_inline float D_ggx(float3 wm, float alpha) } /* Anisotropic GGX microfacet distribution */ -ccl_device_inline float D_ggx_aniso(const float3 wm, const float2 alpha) +ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha) { float slope_x = -wm.x/alpha.x; float slope_y = -wm.y/alpha.y; @@ -40,7 +40,7 @@ ccl_device_inline float D_ggx_aniso(const float3 wm, const float2 alpha) } /* Sample slope distribution (based on page 14 of the supplemental implementation). */ -ccl_device_inline float2 mf_sampleP22_11(const float cosI, const float2 randU) +ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU) { if(cosI > 0.9999f || cosI < 1e-6f) { const float r = sqrtf(randU.x / (1.0f - randU.x)); @@ -78,7 +78,7 @@ ccl_device_inline float2 mf_sampleP22_11(const float cosI, const float2 randU) } /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). */ -ccl_device_inline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU) +ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU) { const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z)); const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU); @@ -94,7 +94,7 @@ ccl_device_inline float3 mf_sample_vndf(const float3 wi, const float2 alpha, con /* === Phase functions: Glossy, Diffuse and Glass === */ /* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */ -ccl_device_inline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm) +ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm) { if(n && k) *weight *= fresnel_conductor(dot(wi, wm), *n, *k); @@ -102,7 +102,7 @@ ccl_device_inline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, floa return -wi + 2.0f * wm * dot(wi, wm); } -ccl_device_inline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k) +ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k) { if(w.z > 0.9999f) return make_float3(0.0f, 0.0f, 0.0f); @@ -132,7 +132,7 @@ ccl_device_inline float3 mf_eval_phase_glossy(const float3 w, const float lambda } /* Phase function for rough lambertian diffuse surfaces. */ -ccl_device_inline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv) +ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv) { float3 tm, bm; make_orthonormals(wm, &tm, &bm); @@ -141,14 +141,14 @@ ccl_device_inline float3 mf_sample_phase_diffuse(const float3 wm, const float ra return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm; } -ccl_device_inline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm) +ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm) { const float v = max(0.0f, dot(w, wm)) * M_1_PI_F; return make_float3(v, v, v); } /* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */ -ccl_device_inline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside) +ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside) { float cosI = dot(wi, wm); float f = fresnel_dielectric_cos(cosI, eta); @@ -162,7 +162,7 @@ ccl_device_inline float3 mf_sample_phase_glass(const float3 wi, const float eta, return normalize(wm*(cosI*inv_eta + cosT) - wi*inv_eta); } -ccl_device_inline float3 mf_eval_phase_glass(const float3 w, const float lambda, const float3 wo, const bool wo_outside, const float2 alpha, const float eta) +ccl_device_forceinline float3 mf_eval_phase_glass(const float3 w, const float lambda, const float3 wo, const bool wo_outside, const float2 alpha, const float eta) { if(w.z > 0.9999f) return make_float3(0.0f, 0.0f, 0.0f); @@ -195,7 +195,7 @@ ccl_device_inline float3 mf_eval_phase_glass(const float3 w, const float lambda, /* === Utility functions for the random walks === */ /* Smith Lambda function for GGX (based on page 12 of the supplemental implementation). */ -ccl_device_inline float mf_lambda(const float3 w, const float2 alpha) +ccl_device_forceinline float mf_lambda(const float3 w, const float2 alpha) { if(w.z > 0.9999f) return 0.0f; @@ -212,18 +212,18 @@ ccl_device_inline float mf_lambda(const float3 w, const float2 alpha) } /* Height distribution CDF (based on page 4 of the supplemental implementation). */ -ccl_device_inline float mf_invC1(const float h) +ccl_device_forceinline float mf_invC1(const float h) { return 2.0f * saturate(h) - 1.0f; } -ccl_device_inline float mf_C1(const float h) +ccl_device_forceinline float mf_C1(const float h) { return saturate(0.5f * (h + 1.0f)); } /* Masking function (based on page 16 of the supplemental implementation). */ -ccl_device_inline float mf_G1(const float3 w, const float C1, const float lambda) +ccl_device_forceinline float mf_G1(const float3 w, const float C1, const float lambda) { if(w.z > 0.9999f) return 1.0f; @@ -233,7 +233,7 @@ ccl_device_inline float mf_G1(const float3 w, const float C1, const float lambda } /* Sampling from the visible height distribution (based on page 17 of the supplemental implementation). */ -ccl_device_inline bool mf_sample_height(const float3 w, float *h, float *C1, float *G1, float *lambda, const float U) +ccl_device_forceinline bool mf_sample_height(const float3 w, float *h, float *C1, float *G1, float *lambda, const float U) { if(w.z > 0.9999f) return false; @@ -262,14 +262,14 @@ ccl_device_inline bool mf_sample_height(const float3 w, float *h, float *C1, flo /* Approximation for the albedo of the single-scattering GGX distribution, * the missing energy is then approximated as a diffuse reflection for the PDF. */ -ccl_device_inline float mf_ggx_albedo(float r) +ccl_device_forceinline float mf_ggx_albedo(float r) { float albedo = 0.806495f*expf(-1.98712f*r*r) + 0.199531f; albedo -= ((((((1.76741f*r - 8.43891f)*r + 15.784f)*r - 14.398f)*r + 6.45221f)*r - 1.19722f)*r + 0.027803f)*r + 0.00568739f; return saturate(albedo); } -ccl_device_inline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha) +ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha) { float D = D_ggx(normalize(wi+wo), alpha); float lambda = mf_lambda(wi, make_float2(alpha, alpha)); @@ -277,17 +277,17 @@ ccl_device_inline float mf_ggx_pdf(const float3 wi, const float3 wo, const float return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z; } -ccl_device_inline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha) +ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha) { return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z; } -ccl_device_inline float mf_diffuse_pdf(const float3 wo) +ccl_device_forceinline float mf_diffuse_pdf(const float3 wo) { return M_1_PI_F * wo.z; } -ccl_device_inline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta) +ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta) { float3 wh; float fresnel; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h index 6ebe2f6a751..8054fa8e849 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h @@ -25,7 +25,7 @@ * energy is used. In combination with MIS, that is enough to produce an unbiased result, although * the balance heuristic isn't necessarily optimal anymore. */ -ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)( +ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( float3 wi, float3 wo, const bool wo_outside, @@ -168,7 +168,7 @@ ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)( * escaped the surface in wo. The function returns the throughput between wi and wo. * Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal. */ -ccl_device float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state +ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state #ifdef MF_MULTI_GLASS , const float eta #elif defined(MF_MULTI_GLOSSY) diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index 35c95768b69..af0bbd861a9 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -141,7 +141,7 @@ ccl_device float bssrdf_cubic_pdf(const ShaderClosure *sc, float r) } /* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */ -ccl_device_inline float bssrdf_cubic_quintic_root_find(float xi) +ccl_device_forceinline float bssrdf_cubic_quintic_root_find(float xi) { /* newton-raphson iteration, usually succeeds in 2-4 iterations, except * outside 0.02 ... 0.98 where it can go up to 10, so overall performance @@ -255,7 +255,7 @@ ccl_device float bssrdf_burley_pdf(const ShaderClosure *sc, float r) * Returns scaled radius, meaning the result is to be scaled up by d. * Since there's no closed form solution we do Newton-Raphson method to find it. */ -ccl_device_inline float bssrdf_burley_root_find(float xi) +ccl_device_forceinline float bssrdf_burley_root_find(float xi) { const float tolerance = 1e-6f; const int max_iteration_count = 10; @@ -389,7 +389,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float bssrdf_burley_sample(sc, xi, r, h); } -ccl_device_inline float bssrdf_pdf(const ShaderClosure *sc, float r) +ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r) { if(sc->type == CLOSURE_BSSRDF_CUBIC_ID) return bssrdf_cubic_pdf(sc, r); diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index aa9cd295452..84aaaab7453 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -222,10 +222,10 @@ ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a) #ifdef __KERNEL_SSE2__ /* Pass P and dir by reference to aligned vector */ -ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, +ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) #else -ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, +ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax) #endif { @@ -621,7 +621,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect return hit; } -ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, +ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) { /* define few macros to minimize code duplication for SSE */ diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index dd5752f4c91..623c1dcaaa1 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -54,13 +54,7 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v } } -/* TODO(sergey): This is just a workaround for annoying 6.5 compiler bug. */ -#if !defined(__KERNEL_CUDA__) || __CUDA_ARCH__ < 500 -ccl_device_inline -#else -ccl_device_noinline -#endif -void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value) +ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value) { #ifdef __PASSES__ if(eval->use_light_pass) { diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 9a96cb9f438..e0c7b17c6a0 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -37,6 +37,7 @@ /* Qualifier wrappers for different names on different devices */ #define ccl_device __device__ __inline__ +# define ccl_device_forceinline __device__ __forceinline__ #if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500) # define ccl_device_inline __device__ __forceinline__ #else diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index 2ae89dde7c4..f076e3a7d37 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -33,6 +33,7 @@ /* in opencl all functions are device functions, so leave this empty */ #define ccl_device #define ccl_device_inline ccl_device +#define ccl_device_forceinline ccl_device #define ccl_device_noinline ccl_device ccl_noinline #define ccl_may_alias #define ccl_constant __constant diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 903be4f09a0..7558fb94478 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -69,7 +69,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, Intersection isect; uint visibility = path_state_ray_visibility(kg, state); bool hit = scene_intersect(kg, - ray, + *ray, visibility, &isect, NULL, @@ -655,9 +655,9 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, lcg_state = lcg_state_init(rng, &state, 0x51633e2d); } - bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax); + bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax); #else - bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f); + bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f); #endif #ifdef __KERNEL_DEBUG__ diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index e38c1a01f6b..cdb07db587a 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -282,9 +282,9 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in lcg_state = lcg_state_init(rng, &state, 0x51633e2d); } - bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax); + bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax); #else - bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f); + bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f); #endif #ifdef __KERNEL_DEBUG__ diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index b534d9950c5..4a76ffddbe7 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons return index; } -ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension) +ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { @@ -132,13 +132,7 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, #endif } -/* Temporary workaround for Pascal cards, otherwise AA does not work properly. */ -#if defined(__KERNEL_GPU__) && __CUDA_ARCH__ >= 600 -__device__ __forceinline__ -#else -ccl_device_inline -#endif -void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) +ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { @@ -199,7 +193,7 @@ ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG /* Linear Congruential Generator */ -ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension) +ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension) { /* implicit mod 2^32 */ rng = (1103515245*(rng) + 12345); diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index 95b57404a77..2981f6ac566 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -155,7 +155,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd, } else { Intersection isect; - blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f); + blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f); } #ifdef __VOLUME__ @@ -205,7 +205,7 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg, Intersection *isect = &isect_object; #endif - bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f); + bool blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f); #ifdef __TRANSPARENT_SHADOWS__ if(blocked && kernel_data.integrator.transparent_shadows) { @@ -221,7 +221,7 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg, if(bounce >= kernel_data.integrator.transparent_max_bounce) return true; - if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f)) + if(!scene_intersect(kg, *ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f)) { #ifdef __VOLUME__ /* attenuation for last line segment towards light */ diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index f61a9ec0fb1..0f3edcb7eaa 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -1153,7 +1153,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg, tracedata->sd.osl_globals = sd->osl_globals; /* raytrace */ - return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f); + return scene_intersect(sd->osl_globals, ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f); } diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index 257c6ad7491..6af65f88a02 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -42,6 +42,7 @@ #if defined(_WIN32) && !defined(FREE_WINDOWS) #define ccl_device_inline static __forceinline +#define ccl_device_forceinline static __forceinline #define ccl_align(...) __declspec(align(__VA_ARGS__)) #ifdef __KERNEL_64_BIT__ #define ccl_try_align(...) __declspec(align(__VA_ARGS__)) @@ -56,6 +57,7 @@ #else #define ccl_device_inline static inline __attribute__((always_inline)) +#define ccl_device_forceinline static inline __attribute__((always_inline)) #define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) #ifndef FREE_WINDOWS64 #define __forceinline inline __attribute__((always_inline)) diff --git a/intern/elbeem/intern/solver_main.cpp b/intern/elbeem/intern/solver_main.cpp index a338bb77b4c..68f7c04cd54 100644 --- a/intern/elbeem/intern/solver_main.cpp +++ b/intern/elbeem/intern/solver_main.cpp @@ -376,11 +376,11 @@ LbmFsgrSolver::mainLoop(const int lev) // main loop region const bool doReduce = true; const int gridLoopBound=1; - const int gDebugLevel = ::gDebugLevel; int calcNumInvIfCells = 0; LbmFloat calcInitialMass = 0; GRID_REGION_INIT(); #if PARALLEL==1 + const int gDebugLevel = ::gDebugLevel; #pragma omp parallel default(none) num_threads(mNumOMPThreads) \ reduction(+: \ calcCurrentMass,calcCurrentVolume, \ @@ -868,10 +868,8 @@ LbmFsgrSolver::mainLoop(const int lev) // physical drop model if(mPartUsePhysModel) { LbmFloat realWorldFac = (mLevel[lev].simCellSize / mLevel[lev].timestep); - LbmFloat rux = (ux * realWorldFac); - LbmFloat ruy = (uy * realWorldFac); - LbmFloat ruz = (uz * realWorldFac); - LbmFloat rl = norm(ntlVec3Gfx(rux,ruy,ruz)); + LbmVec ru(ux * realWorldFac, uy * realWorldFac, uz * realWorldFac); + LbmFloat rl = norm(ru); basethresh *= rl; // reduce probability in outer region? @@ -963,14 +961,15 @@ LbmFsgrSolver::mainLoop(const int lev) // average normal & velocity // -> mostly along velocity dir, many into surface // fluid velocity (not normalized!) - LbmVec flvelVel = LbmVec(ux,uy,uz); + LbmVec flvelVel(ux,uy,uz); LbmFloat flvelLen = norm(flvelVel); // surface normal - LbmVec normVel = LbmVec(surfaceNormal[0],surfaceNormal[1],surfaceNormal[2]); + LbmVec normVel(surfaceNormal[0],surfaceNormal[1],surfaceNormal[2]); normalize(normVel); LbmFloat normScale = (0.01+flvelLen); // jitter vector, 0.2 * flvel - LbmVec jittVel = LbmVec(jx,jy,jz)*(0.05+flvelLen)*0.1; + LbmVec jittVel(jx,jy,jz); + jittVel *= (0.05+flvelLen)*0.1; // weighten velocities const LbmFloat flvelWeight = 0.9; LbmVec newpartVel = normVel*normScale*(1.-flvelWeight) + flvelVel*(flvelWeight) + jittVel; @@ -1120,13 +1119,13 @@ LbmFsgrSolver::preinitGrids() const int lev = mMaxRefine; const bool doReduce = false; const int gridLoopBound=0; - const int gDebugLevel = ::gDebugLevel; // preinit both grids for(int s=0; s<2; s++) { GRID_REGION_INIT(); #if PARALLEL==1 + const int gDebugLevel = ::gDebugLevel; #pragma omp parallel default(none) num_threads(mNumOMPThreads) \ reduction(+: \ calcCurrentMass,calcCurrentVolume, \ @@ -1161,10 +1160,10 @@ LbmFsgrSolver::standingFluidPreinit() const int lev = mMaxRefine; const bool doReduce = false; const int gridLoopBound=1; - const int gDebugLevel = ::gDebugLevel; GRID_REGION_INIT(); #if PARALLEL==1 + const int gDebugLevel = ::gDebugLevel; #pragma omp parallel default(none) num_threads(mNumOMPThreads) \ reduction(+: \ calcCurrentMass,calcCurrentVolume, \ |