Fix Cycles CUDA performance on CUDA 8.0.

Mostly this is making inlining match CUDA 7.5 in a few performance critical places. The end result is that performance is now better than before, possibly due to less register spilling or other CUDA 8.0 compiler improvements. On benchmarks scenes, there are 3% to 35% render time reductions. Stack memory usage is reduced a little too. Reviewed By: sergey Differential Revision: https://developer.blender.org/D2269
author: Brecht Van Lommel <brechtvanlommel@gmail.com> 2016-10-02 15:48:39 +0300
committer: Brecht Van Lommel <brechtvanlommel@gmail.com> 2016-10-03 23:15:25 +0300
commit: a3abb020e37a072eb71fd30de9ab125d1c16623a (patch)
tree: b525be7f8a0792eedecb2b95802ede88dc3f330e /intern/cycles/kernel/closure
parent: 49ad4215baf16d850d0e367f003ab688e4a3d08e (diff)
6 files changed, 29 insertions, 29 deletions
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 2ab92badc93..7e4d5fe2e37 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -36,7 +36,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device_inline int bsdf_sample(KernelGlobals *kg,
+ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
                                   ShaderData *sd,
                                   const ShaderClosure *sc,
                                   float randu,
@@ -147,7 +147,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
 #ifndef __KERNEL_CUDA__
 ccl_device
 #else
-ccl_device_inline
+ccl_device_forceinline
 #endif
 float3 bsdf_eval(KernelGlobals *kg,
                  ShaderData *sd,
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index 9929246ae5c..1cd8246aa71 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -62,7 +62,7 @@ ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float rough
 	return 2.0f / (roughness*roughness) - 2.0f;
 }
 
-ccl_device_inline float3 bsdf_ashikhmin_shirley_eval_reflect(
+ccl_device_forceinline float3 bsdf_ashikhmin_shirley_eval_reflect(
         const ShaderClosure *sc,
         const float3 I,
         const float3 omega_in,
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 7c36f05b6cc..0a8d14a00c2 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -183,7 +183,7 @@ ccl_device_inline void microfacet_ggx_sample_slopes(
 	*slope_y = S * z * safe_sqrtf(1.0f + (*slope_x)*(*slope_x));
 }
 
-ccl_device_inline float3 microfacet_sample_stretched(
+ccl_device_forceinline float3 microfacet_sample_stretched(
 	KernelGlobals *kg, const float3 omega_i,
 	const float alpha_x, const float alpha_y,
 	const float randu, const float randv,
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 0a6dd4dcbdf..cea59adfebe 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 /* === GGX Microfacet distribution functions === */
 
 /* Isotropic GGX microfacet distribution */
-ccl_device_inline float D_ggx(float3 wm, float alpha)
+ccl_device_forceinline float D_ggx(float3 wm, float alpha)
 {
 	wm.z *= wm.z;
 	alpha *= alpha;
@@ -30,7 +30,7 @@ ccl_device_inline float D_ggx(float3 wm, float alpha)
 }
 
 /* Anisotropic GGX microfacet distribution */
-ccl_device_inline float D_ggx_aniso(const float3 wm, const float2 alpha)
+ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
 {
 	float slope_x = -wm.x/alpha.x;
 	float slope_y = -wm.y/alpha.y;
@@ -40,7 +40,7 @@ ccl_device_inline float D_ggx_aniso(const float3 wm, const float2 alpha)
 }
 
 /* Sample slope distribution (based on page 14 of the supplemental implementation). */
-ccl_device_inline float2 mf_sampleP22_11(const float cosI, const float2 randU)
+ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
 {
 	if(cosI > 0.9999f || cosI < 1e-6f) {
 		const float r = sqrtf(randU.x / (1.0f - randU.x));
@@ -78,7 +78,7 @@ ccl_device_inline float2 mf_sampleP22_11(const float cosI, const float2 randU)
 }
 
 /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). */
-ccl_device_inline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
+ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
 {
 	const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z));
 	const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU);
@@ -94,7 +94,7 @@ ccl_device_inline float3 mf_sample_vndf(const float3 wi, const float2 alpha, con
 /* === Phase functions: Glossy, Diffuse and Glass === */
 
 /* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */
-ccl_device_inline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
+ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
 {
 	if(n && k)
 		*weight *= fresnel_conductor(dot(wi, wm), *n, *k);
@@ -102,7 +102,7 @@ ccl_device_inline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, floa
 	return -wi + 2.0f * wm * dot(wi, wm);
 }
 
-ccl_device_inline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
+ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
 {
 	if(w.z > 0.9999f)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -132,7 +132,7 @@ ccl_device_inline float3 mf_eval_phase_glossy(const float3 w, const float lambda
 }
 
 /* Phase function for rough lambertian diffuse surfaces. */
-ccl_device_inline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
+ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
 {
 	float3 tm, bm;
 	make_orthonormals(wm, &tm, &bm);
@@ -141,14 +141,14 @@ ccl_device_inline float3 mf_sample_phase_diffuse(const float3 wm, const float ra
 	return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm;
 }
 
-ccl_device_inline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
+ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
 {
 	const float v = max(0.0f, dot(w, wm)) * M_1_PI_F;
 	return make_float3(v, v, v);
 }
 
 /* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */
-ccl_device_inline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
+ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
 {
 	float cosI = dot(wi, wm);
 	float f = fresnel_dielectric_cos(cosI, eta);
@@ -162,7 +162,7 @@ ccl_device_inline float3 mf_sample_phase_glass(const float3 wi, const float eta,
 	return normalize(wm*(cosI*inv_eta + cosT) - wi*inv_eta);
 }
 
-ccl_device_inline float3 mf_eval_phase_glass(const float3 w, const float lambda, const float3 wo, const bool wo_outside, const float2 alpha, const float eta)
+ccl_device_forceinline float3 mf_eval_phase_glass(const float3 w, const float lambda, const float3 wo, const bool wo_outside, const float2 alpha, const float eta)
 {
 	if(w.z > 0.9999f)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -195,7 +195,7 @@ ccl_device_inline float3 mf_eval_phase_glass(const float3 w, const float lambda,
 /* === Utility functions for the random walks === */
 
 /* Smith Lambda function for GGX (based on page 12 of the supplemental implementation). */
-ccl_device_inline float mf_lambda(const float3 w, const float2 alpha)
+ccl_device_forceinline float mf_lambda(const float3 w, const float2 alpha)
 {
 	if(w.z > 0.9999f)
 		return 0.0f;
@@ -212,18 +212,18 @@ ccl_device_inline float mf_lambda(const float3 w, const float2 alpha)
 }
 
 /* Height distribution CDF (based on page 4 of the supplemental implementation). */
-ccl_device_inline float mf_invC1(const float h)
+ccl_device_forceinline float mf_invC1(const float h)
 {
 	return 2.0f * saturate(h) - 1.0f;
 }
 
-ccl_device_inline float mf_C1(const float h)
+ccl_device_forceinline float mf_C1(const float h)
 {
 	return saturate(0.5f * (h + 1.0f));
 }
 
 /* Masking function (based on page 16 of the supplemental implementation). */
-ccl_device_inline float mf_G1(const float3 w, const float C1, const float lambda)
+ccl_device_forceinline float mf_G1(const float3 w, const float C1, const float lambda)
 {
 	if(w.z > 0.9999f)
 		return 1.0f;
@@ -233,7 +233,7 @@ ccl_device_inline float mf_G1(const float3 w, const float C1, const float lambda
 }
 
 /* Sampling from the visible height distribution (based on page 17 of the supplemental implementation). */
-ccl_device_inline bool mf_sample_height(const float3 w, float *h, float *C1, float *G1, float *lambda, const float U)
+ccl_device_forceinline bool mf_sample_height(const float3 w, float *h, float *C1, float *G1, float *lambda, const float U)
 {
 	if(w.z > 0.9999f)
 		return false;
@@ -262,14 +262,14 @@ ccl_device_inline bool mf_sample_height(const float3 w, float *h, float *C1, flo
 
 /* Approximation for the albedo of the single-scattering GGX distribution,
  * the missing energy is then approximated as a diffuse reflection for the PDF. */
-ccl_device_inline float mf_ggx_albedo(float r)
+ccl_device_forceinline float mf_ggx_albedo(float r)
 {
 	float albedo = 0.806495f*expf(-1.98712f*r*r) + 0.199531f;
 	albedo -= ((((((1.76741f*r - 8.43891f)*r + 15.784f)*r - 14.398f)*r + 6.45221f)*r - 1.19722f)*r + 0.027803f)*r + 0.00568739f;
 	return saturate(albedo);
 }
 
-ccl_device_inline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
+ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
 {
 	float D = D_ggx(normalize(wi+wo), alpha);
 	float lambda = mf_lambda(wi, make_float2(alpha, alpha));
@@ -277,17 +277,17 @@ ccl_device_inline float mf_ggx_pdf(const float3 wi, const float3 wo, const float
 	return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z;
 }
 
-ccl_device_inline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
+ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
 {
 	return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z;
 }
 
-ccl_device_inline float mf_diffuse_pdf(const float3 wo)
+ccl_device_forceinline float mf_diffuse_pdf(const float3 wo)
 {
 	return M_1_PI_F * wo.z;
 }
 
-ccl_device_inline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
+ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
 {
 	float3 wh;
 	float fresnel;
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index 6ebe2f6a751..8054fa8e849 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -25,7 +25,7 @@
  * energy is used. In combination with MIS, that is enough to produce an unbiased result, although
  * the balance heuristic isn't necessarily optimal anymore.
  */
-ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
         float3 wi,
         float3 wo,
         const bool wo_outside,
@@ -168,7 +168,7 @@ ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
  * escaped the surface in wo. The function returns the throughput between wi and wo.
  * Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal.
  */
-ccl_device float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
 #ifdef MF_MULTI_GLASS
 	, const float eta
 #elif defined(MF_MULTI_GLOSSY)
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index 35c95768b69..af0bbd861a9 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -141,7 +141,7 @@ ccl_device float bssrdf_cubic_pdf(const ShaderClosure *sc, float r)
 }
 
 /* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
-ccl_device_inline float bssrdf_cubic_quintic_root_find(float xi)
+ccl_device_forceinline float bssrdf_cubic_quintic_root_find(float xi)
 {
 	/* newton-raphson iteration, usually succeeds in 2-4 iterations, except
 	 * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
@@ -255,7 +255,7 @@ ccl_device float bssrdf_burley_pdf(const ShaderClosure *sc, float r)
  * Returns scaled radius, meaning the result is to be scaled up by d.
  * Since there's no closed form solution we do Newton-Raphson method to find it.
  */
-ccl_device_inline float bssrdf_burley_root_find(float xi)
+ccl_device_forceinline float bssrdf_burley_root_find(float xi)
 {
 	const float tolerance = 1e-6f;
 	const int max_iteration_count = 10;
@@ -389,7 +389,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float
 		bssrdf_burley_sample(sc, xi, r, h);
 }
 
-ccl_device_inline float bssrdf_pdf(const ShaderClosure *sc, float r)
+ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
 {
 	if(sc->type == CLOSURE_BSSRDF_CUBIC_ID)
 		return bssrdf_cubic_pdf(sc, r);
author	Brecht Van Lommel <brechtvanlommel@gmail.com>	2016-10-02 15:48:39 +0300
committer	Brecht Van Lommel <brechtvanlommel@gmail.com>	2016-10-03 23:15:25 +0300
commit	a3abb020e37a072eb71fd30de9ab125d1c16623a (patch)
tree	b525be7f8a0792eedecb2b95802ede88dc3f330e /intern/cycles/kernel/closure
parent	49ad4215baf16d850d0e367f003ab688e4a3d08e (diff)