diff options
author | Sergey Sharybin <sergey.vfx@gmail.com> | 2016-08-01 16:40:46 +0300 |
---|---|---|
committer | Sergey Sharybin <sergey.vfx@gmail.com> | 2016-08-01 16:54:29 +0300 |
commit | 6353ecb996898b4ce2fe8065130ed1f5ea3b6989 (patch) | |
tree | b6d620152e4ff7920465d8396fe443dc9b3ffc56 /intern/cycles/kernel/closure | |
parent | 7065022f7aa23ba13d2999e1e40162a8f480af0e (diff) |
Cycles: Tweaks to support CUDA 8 toolkit
The changes mainly give the compiler explicit hints on inlining functions,
so that inlining matches how it worked with the previous toolkit.
This makes kernels compiled by CUDA 8 render on average at the same speed
as the previous kernels. Some scenes are somewhat faster, some of them are
somewhat slower, but the slowdown is within 1% so far.
On the positive side, it allows us to enable newer-generation cards on
the buildbots (so GTX 10x0 will be officially supported soon).
Diffstat (limited to 'intern/cycles/kernel/closure')
-rw-r--r-- | intern/cycles/kernel/closure/bsdf.h | 10 | ||||
-rw-r--r-- | intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h | 6 | ||||
-rw-r--r-- | intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h | 13 | ||||
-rw-r--r-- | intern/cycles/kernel/closure/bssrdf.h | 6 |
4 files changed, 27 insertions, 8 deletions
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index a251e3bdcf9..55bdf3ecbb4 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -36,7 +36,15 @@ CCL_NAMESPACE_BEGIN -ccl_device int bsdf_sample(KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float randu, float randv, float3 *eval, float3 *omega_in, differential3 *domega_in, float *pdf) +ccl_device_inline int bsdf_sample(KernelGlobals *kg, + ShaderData *sd, + const ShaderClosure *sc, + float randu, + float randv, + float3 *eval, + float3 *omega_in, + differential3 *domega_in, + float *pdf) { int label; diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h index 8ed76bea525..9929246ae5c 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h @@ -62,7 +62,11 @@ ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float rough return 2.0f / (roughness*roughness) - 2.0f; } -ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) +ccl_device_inline float3 bsdf_ashikhmin_shirley_eval_reflect( + const ShaderClosure *sc, + const float3 I, + const float3 omega_in, + float *pdf) { const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc; float3 N = bsdf->N; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h index afd4a8da62a..6ebe2f6a751 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h @@ -25,11 +25,18 @@ * energy is used. In combination with MIS, that is enough to produce an unbiased result, although * the balance heuristic isn't necessarily optimal anymore. 
*/ -ccl_device float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi, float3 wo, const bool wo_outside, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint* lcg_state +ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)( + float3 wi, + float3 wo, + const bool wo_outside, + const float3 color, + const float alpha_x, + const float alpha_y, + ccl_addr_space uint *lcg_state #ifdef MF_MULTI_GLASS - , const float eta + , const float eta #elif defined(MF_MULTI_GLOSSY) - , float3 *n, float3 *k + , float3 *n, float3 *k #endif ) { diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index a260ae9a31b..35c95768b69 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -141,7 +141,7 @@ ccl_device float bssrdf_cubic_pdf(const ShaderClosure *sc, float r) } /* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */ -ccl_device float bssrdf_cubic_quintic_root_find(float xi) +ccl_device_inline float bssrdf_cubic_quintic_root_find(float xi) { /* newton-raphson iteration, usually succeeds in 2-4 iterations, except * outside 0.02 ... 0.98 where it can go up to 10, so overall performance @@ -255,7 +255,7 @@ ccl_device float bssrdf_burley_pdf(const ShaderClosure *sc, float r) * Returns scaled radius, meaning the result is to be scaled up by d. * Since there's no closed form solution we do Newton-Raphson method to find it. */ -ccl_device float bssrdf_burley_root_find(float xi) +ccl_device_inline float bssrdf_burley_root_find(float xi) { const float tolerance = 1e-6f; const int max_iteration_count = 10; @@ -389,7 +389,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float bssrdf_burley_sample(sc, xi, r, h); } -ccl_device float bssrdf_pdf(const ShaderClosure *sc, float r) +ccl_device_inline float bssrdf_pdf(const ShaderClosure *sc, float r) { if(sc->type == CLOSURE_BSSRDF_CUBIC_ID) return bssrdf_cubic_pdf(sc, r); |