Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/kernel/bvh/bvh.h17
-rw-r--r--intern/cycles/kernel/bvh/bvh_nodes.h30
-rw-r--r--intern/cycles/kernel/bvh/bvh_traversal.h21
-rw-r--r--intern/cycles/kernel/bvh/bvh_types.h2
-rw-r--r--intern/cycles/kernel/closure/bsdf.h4
-rw-r--r--intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h2
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet.h2
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet_multi.h40
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h4
-rw-r--r--intern/cycles/kernel/closure/bssrdf.h6
-rw-r--r--intern/cycles/kernel/geom/geom_curve.h6
-rw-r--r--intern/cycles/kernel/kernel_accumulate.h8
-rw-r--r--intern/cycles/kernel/kernel_compat_cuda.h1
-rw-r--r--intern/cycles/kernel/kernel_compat_opencl.h1
-rw-r--r--intern/cycles/kernel/kernel_path.h6
-rw-r--r--intern/cycles/kernel/kernel_path_branched.h4
-rw-r--r--intern/cycles/kernel/kernel_random.h12
-rw-r--r--intern/cycles/kernel/kernel_shadow.h6
-rw-r--r--intern/cycles/kernel/osl/osl_services.cpp2
-rw-r--r--intern/cycles/util/util_types.h2
20 files changed, 82 insertions, 94 deletions
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 7cecee793c1..36798982653 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -157,8 +157,9 @@ CCL_NAMESPACE_BEGIN
#undef BVH_NAME_EVAL
#undef BVH_FUNCTION_FULL_NAME
+/* Note: ray is passed by value to work around a possible CUDA compiler bug. */
ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
- const Ray *ray,
+ const Ray ray,
const uint visibility,
Intersection *isect,
uint *lcg_state,
@@ -169,32 +170,32 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
if(kernel_data.bvh.have_motion) {
# ifdef __HAIR__
if(kernel_data.bvh.have_curves)
- return bvh_intersect_hair_motion(kg, ray, isect, visibility, lcg_state, difl, extmax);
+ return bvh_intersect_hair_motion(kg, &ray, isect, visibility, lcg_state, difl, extmax);
# endif /* __HAIR__ */
- return bvh_intersect_motion(kg, ray, isect, visibility);
+ return bvh_intersect_motion(kg, &ray, isect, visibility);
}
#endif /* __OBJECT_MOTION__ */
#ifdef __HAIR__
if(kernel_data.bvh.have_curves)
- return bvh_intersect_hair(kg, ray, isect, visibility, lcg_state, difl, extmax);
+ return bvh_intersect_hair(kg, &ray, isect, visibility, lcg_state, difl, extmax);
#endif /* __HAIR__ */
#ifdef __KERNEL_CPU__
# ifdef __INSTANCING__
if(kernel_data.bvh.have_instancing)
- return bvh_intersect_instancing(kg, ray, isect, visibility);
+ return bvh_intersect_instancing(kg, &ray, isect, visibility);
# endif /* __INSTANCING__ */
- return bvh_intersect(kg, ray, isect, visibility);
+ return bvh_intersect(kg, &ray, isect, visibility);
#else /* __KERNEL_CPU__ */
# ifdef __INSTANCING__
- return bvh_intersect_instancing(kg, ray, isect, visibility);
+ return bvh_intersect_instancing(kg, &ray, isect, visibility);
# else
- return bvh_intersect(kg, ray, isect, visibility);
+ return bvh_intersect(kg, &ray, isect, visibility);
# endif /* __INSTANCING__ */
#endif /* __KERNEL_CPU__ */
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index db2275b0ff8..726bef1794c 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -16,7 +16,7 @@
// TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
// 3-vector which might be faster.
-ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
+ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
int node_addr,
int child)
{
@@ -30,7 +30,7 @@ ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
}
#if !defined(__KERNEL_SSE2__)
-ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
const float3 P,
const float3 idir,
const float t,
@@ -77,7 +77,7 @@ ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
#endif
}
-ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
const float3 P,
const float3 idir,
const float t,
@@ -139,7 +139,7 @@ ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
#endif
}
-ccl_device_inline bool bvh_unaligned_node_intersect_child(
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child(
KernelGlobals *kg,
const float3 P,
const float3 dir,
@@ -166,7 +166,7 @@ ccl_device_inline bool bvh_unaligned_node_intersect_child(
return tnear <= tfar;
}
-ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(
KernelGlobals *kg,
const float3 P,
const float3 dir,
@@ -202,7 +202,7 @@ ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
}
}
-ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
@@ -232,7 +232,7 @@ ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
return mask;
}
-ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
@@ -264,7 +264,7 @@ ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
return mask;
}
-ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
@@ -295,7 +295,7 @@ ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
}
}
-ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
@@ -333,7 +333,7 @@ ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
}
#else /* !defined(__KERNEL_SSE2__) */
-int ccl_device_inline bvh_aligned_node_intersect(
+int ccl_device_forceinline bvh_aligned_node_intersect(
KernelGlobals *kg,
const float3& P,
const float3& dir,
@@ -377,7 +377,7 @@ int ccl_device_inline bvh_aligned_node_intersect(
# endif
}
-int ccl_device_inline bvh_aligned_node_intersect_robust(
+ccl_device_forceinline int bvh_aligned_node_intersect_robust(
KernelGlobals *kg,
const float3& P,
const float3& dir,
@@ -441,7 +441,7 @@ int ccl_device_inline bvh_aligned_node_intersect_robust(
# endif
}
-int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
const float3 P,
const float3 dir,
const ssef& isect_near,
@@ -502,7 +502,7 @@ int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
# endif
}
-int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
const float3 P,
const float3 dir,
const ssef& isect_near,
@@ -573,7 +573,7 @@ int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
# endif
}
-ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
const float3& P,
const float3& dir,
const ssef& isect_near,
@@ -611,7 +611,7 @@ ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
}
}
-ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
const float3& P,
const float3& dir,
const ssef& isect_near,
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index b1a52968a26..a0e478e972b 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -40,21 +40,16 @@
*
*/
-#ifndef __KERNEL_GPU__
-ccl_device
-#else
-ccl_device_inline
-#endif
-bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
- const Ray *ray,
- Intersection *isect,
- const uint visibility
+ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect,
+ const uint visibility
#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
- , uint *lcg_state,
- float difl,
- float extmax
+ , uint *lcg_state,
+ float difl,
+ float extmax
#endif
- )
+ )
{
/* todo:
* - test if pushing distance on the stack helps (for non shadow rays)
diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h
index 27729046f8d..c3abe2e157d 100644
--- a/intern/cycles/kernel/bvh/bvh_types.h
+++ b/intern/cycles/kernel/bvh/bvh_types.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
/* Don't inline intersect functions on GPU, this is faster */
#ifdef __KERNEL_GPU__
-# define ccl_device_intersect ccl_device_noinline
+# define ccl_device_intersect ccl_device_forceinline
#else
# define ccl_device_intersect ccl_device_inline
#endif
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 2ab92badc93..7e4d5fe2e37 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -36,7 +36,7 @@
CCL_NAMESPACE_BEGIN
-ccl_device_inline int bsdf_sample(KernelGlobals *kg,
+ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
ShaderData *sd,
const ShaderClosure *sc,
float randu,
@@ -147,7 +147,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
#ifndef __KERNEL_CUDA__
ccl_device
#else
-ccl_device_inline
+ccl_device_forceinline
#endif
float3 bsdf_eval(KernelGlobals *kg,
ShaderData *sd,
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index 9929246ae5c..1cd8246aa71 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -62,7 +62,7 @@ ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float rough
return 2.0f / (roughness*roughness) - 2.0f;
}
-ccl_device_inline float3 bsdf_ashikhmin_shirley_eval_reflect(
+ccl_device_forceinline float3 bsdf_ashikhmin_shirley_eval_reflect(
const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 7c36f05b6cc..0a8d14a00c2 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -183,7 +183,7 @@ ccl_device_inline void microfacet_ggx_sample_slopes(
*slope_y = S * z * safe_sqrtf(1.0f + (*slope_x)*(*slope_x));
}
-ccl_device_inline float3 microfacet_sample_stretched(
+ccl_device_forceinline float3 microfacet_sample_stretched(
KernelGlobals *kg, const float3 omega_i,
const float alpha_x, const float alpha_y,
const float randu, const float randv,
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 0a6dd4dcbdf..cea59adfebe 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
/* === GGX Microfacet distribution functions === */
/* Isotropic GGX microfacet distribution */
-ccl_device_inline float D_ggx(float3 wm, float alpha)
+ccl_device_forceinline float D_ggx(float3 wm, float alpha)
{
wm.z *= wm.z;
alpha *= alpha;
@@ -30,7 +30,7 @@ ccl_device_inline float D_ggx(float3 wm, float alpha)
}
/* Anisotropic GGX microfacet distribution */
-ccl_device_inline float D_ggx_aniso(const float3 wm, const float2 alpha)
+ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
{
float slope_x = -wm.x/alpha.x;
float slope_y = -wm.y/alpha.y;
@@ -40,7 +40,7 @@ ccl_device_inline float D_ggx_aniso(const float3 wm, const float2 alpha)
}
/* Sample slope distribution (based on page 14 of the supplemental implementation). */
-ccl_device_inline float2 mf_sampleP22_11(const float cosI, const float2 randU)
+ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
{
if(cosI > 0.9999f || cosI < 1e-6f) {
const float r = sqrtf(randU.x / (1.0f - randU.x));
@@ -78,7 +78,7 @@ ccl_device_inline float2 mf_sampleP22_11(const float cosI, const float2 randU)
}
/* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). */
-ccl_device_inline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
+ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
{
const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z));
const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU);
@@ -94,7 +94,7 @@ ccl_device_inline float3 mf_sample_vndf(const float3 wi, const float2 alpha, con
/* === Phase functions: Glossy, Diffuse and Glass === */
/* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */
-ccl_device_inline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
+ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
{
if(n && k)
*weight *= fresnel_conductor(dot(wi, wm), *n, *k);
@@ -102,7 +102,7 @@ ccl_device_inline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, floa
return -wi + 2.0f * wm * dot(wi, wm);
}
-ccl_device_inline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
+ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
{
if(w.z > 0.9999f)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -132,7 +132,7 @@ ccl_device_inline float3 mf_eval_phase_glossy(const float3 w, const float lambda
}
/* Phase function for rough lambertian diffuse surfaces. */
-ccl_device_inline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
+ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
{
float3 tm, bm;
make_orthonormals(wm, &tm, &bm);
@@ -141,14 +141,14 @@ ccl_device_inline float3 mf_sample_phase_diffuse(const float3 wm, const float ra
return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm;
}
-ccl_device_inline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
+ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
{
const float v = max(0.0f, dot(w, wm)) * M_1_PI_F;
return make_float3(v, v, v);
}
/* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */
-ccl_device_inline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
+ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
{
float cosI = dot(wi, wm);
float f = fresnel_dielectric_cos(cosI, eta);
@@ -162,7 +162,7 @@ ccl_device_inline float3 mf_sample_phase_glass(const float3 wi, const float eta,
return normalize(wm*(cosI*inv_eta + cosT) - wi*inv_eta);
}
-ccl_device_inline float3 mf_eval_phase_glass(const float3 w, const float lambda, const float3 wo, const bool wo_outside, const float2 alpha, const float eta)
+ccl_device_forceinline float3 mf_eval_phase_glass(const float3 w, const float lambda, const float3 wo, const bool wo_outside, const float2 alpha, const float eta)
{
if(w.z > 0.9999f)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -195,7 +195,7 @@ ccl_device_inline float3 mf_eval_phase_glass(const float3 w, const float lambda,
/* === Utility functions for the random walks === */
/* Smith Lambda function for GGX (based on page 12 of the supplemental implementation). */
-ccl_device_inline float mf_lambda(const float3 w, const float2 alpha)
+ccl_device_forceinline float mf_lambda(const float3 w, const float2 alpha)
{
if(w.z > 0.9999f)
return 0.0f;
@@ -212,18 +212,18 @@ ccl_device_inline float mf_lambda(const float3 w, const float2 alpha)
}
/* Height distribution CDF (based on page 4 of the supplemental implementation). */
-ccl_device_inline float mf_invC1(const float h)
+ccl_device_forceinline float mf_invC1(const float h)
{
return 2.0f * saturate(h) - 1.0f;
}
-ccl_device_inline float mf_C1(const float h)
+ccl_device_forceinline float mf_C1(const float h)
{
return saturate(0.5f * (h + 1.0f));
}
/* Masking function (based on page 16 of the supplemental implementation). */
-ccl_device_inline float mf_G1(const float3 w, const float C1, const float lambda)
+ccl_device_forceinline float mf_G1(const float3 w, const float C1, const float lambda)
{
if(w.z > 0.9999f)
return 1.0f;
@@ -233,7 +233,7 @@ ccl_device_inline float mf_G1(const float3 w, const float C1, const float lambda
}
/* Sampling from the visible height distribution (based on page 17 of the supplemental implementation). */
-ccl_device_inline bool mf_sample_height(const float3 w, float *h, float *C1, float *G1, float *lambda, const float U)
+ccl_device_forceinline bool mf_sample_height(const float3 w, float *h, float *C1, float *G1, float *lambda, const float U)
{
if(w.z > 0.9999f)
return false;
@@ -262,14 +262,14 @@ ccl_device_inline bool mf_sample_height(const float3 w, float *h, float *C1, flo
/* Approximation for the albedo of the single-scattering GGX distribution,
* the missing energy is then approximated as a diffuse reflection for the PDF. */
-ccl_device_inline float mf_ggx_albedo(float r)
+ccl_device_forceinline float mf_ggx_albedo(float r)
{
float albedo = 0.806495f*expf(-1.98712f*r*r) + 0.199531f;
albedo -= ((((((1.76741f*r - 8.43891f)*r + 15.784f)*r - 14.398f)*r + 6.45221f)*r - 1.19722f)*r + 0.027803f)*r + 0.00568739f;
return saturate(albedo);
}
-ccl_device_inline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
+ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
{
float D = D_ggx(normalize(wi+wo), alpha);
float lambda = mf_lambda(wi, make_float2(alpha, alpha));
@@ -277,17 +277,17 @@ ccl_device_inline float mf_ggx_pdf(const float3 wi, const float3 wo, const float
return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z;
}
-ccl_device_inline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
+ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
{
return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z;
}
-ccl_device_inline float mf_diffuse_pdf(const float3 wo)
+ccl_device_forceinline float mf_diffuse_pdf(const float3 wo)
{
return M_1_PI_F * wo.z;
}
-ccl_device_inline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
+ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
{
float3 wh;
float fresnel;
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index 6ebe2f6a751..8054fa8e849 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -25,7 +25,7 @@
* energy is used. In combination with MIS, that is enough to produce an unbiased result, although
* the balance heuristic isn't necessarily optimal anymore.
*/
-ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
float3 wi,
float3 wo,
const bool wo_outside,
@@ -168,7 +168,7 @@ ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
* escaped the surface in wo. The function returns the throughput between wi and wo.
* Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal.
*/
-ccl_device float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
#ifdef MF_MULTI_GLASS
, const float eta
#elif defined(MF_MULTI_GLOSSY)
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index 35c95768b69..af0bbd861a9 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -141,7 +141,7 @@ ccl_device float bssrdf_cubic_pdf(const ShaderClosure *sc, float r)
}
/* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
-ccl_device_inline float bssrdf_cubic_quintic_root_find(float xi)
+ccl_device_forceinline float bssrdf_cubic_quintic_root_find(float xi)
{
/* newton-raphson iteration, usually succeeds in 2-4 iterations, except
* outside 0.02 ... 0.98 where it can go up to 10, so overall performance
@@ -255,7 +255,7 @@ ccl_device float bssrdf_burley_pdf(const ShaderClosure *sc, float r)
* Returns scaled radius, meaning the result is to be scaled up by d.
* Since there's no closed form solution we do Newton-Raphson method to find it.
*/
-ccl_device_inline float bssrdf_burley_root_find(float xi)
+ccl_device_forceinline float bssrdf_burley_root_find(float xi)
{
const float tolerance = 1e-6f;
const int max_iteration_count = 10;
@@ -389,7 +389,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float
bssrdf_burley_sample(sc, xi, r, h);
}
-ccl_device_inline float bssrdf_pdf(const ShaderClosure *sc, float r)
+ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
{
if(sc->type == CLOSURE_BSSRDF_CUBIC_ID)
return bssrdf_cubic_pdf(sc, r);
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index aa9cd295452..84aaaab7453 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -222,10 +222,10 @@ ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
#ifdef __KERNEL_SSE2__
/* Pass P and dir by reference to aligned vector */
-ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
#else
-ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
#endif
{
@@ -621,7 +621,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
return hit;
}
-ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
{
/* define few macros to minimize code duplication for SSE */
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index dd5752f4c91..623c1dcaaa1 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -54,13 +54,7 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
}
}
-/* TODO(sergey): This is just a workaround for annoying 6.5 compiler bug. */
-#if !defined(__KERNEL_CUDA__) || __CUDA_ARCH__ < 500
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
+ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
{
#ifdef __PASSES__
if(eval->use_light_pass) {
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 9a96cb9f438..e0c7b17c6a0 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -37,6 +37,7 @@
/* Qualifier wrappers for different names on different devices */
#define ccl_device __device__ __inline__
+# define ccl_device_forceinline __device__ __forceinline__
#if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500)
# define ccl_device_inline __device__ __forceinline__
#else
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 2ae89dde7c4..f076e3a7d37 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -33,6 +33,7 @@
/* in opencl all functions are device functions, so leave this empty */
#define ccl_device
#define ccl_device_inline ccl_device
+#define ccl_device_forceinline ccl_device
#define ccl_device_noinline ccl_device ccl_noinline
#define ccl_may_alias
#define ccl_constant __constant
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 903be4f09a0..7558fb94478 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -69,7 +69,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
Intersection isect;
uint visibility = path_state_ray_visibility(kg, state);
bool hit = scene_intersect(kg,
- ray,
+ *ray,
visibility,
&isect,
NULL,
@@ -655,9 +655,9 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
}
- bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
+ bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
#else
- bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+ bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
#endif
#ifdef __KERNEL_DEBUG__
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index e38c1a01f6b..cdb07db587a 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -282,9 +282,9 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
}
- bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
+ bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
#else
- bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+ bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
#endif
#ifdef __KERNEL_DEBUG__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index b534d9950c5..4a76ffddbe7 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons
return index;
}
-ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
{
#ifdef __CMJ__
if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -132,13 +132,7 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng,
#endif
}
-/* Temporary workaround for Pascal cards, otherwise AA does not work properly. */
-#if defined(__KERNEL_GPU__) && __CUDA_ARCH__ >= 600
-__device__ __forceinline__
-#else
-ccl_device_inline
-#endif
-void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
{
#ifdef __CMJ__
if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -199,7 +193,7 @@ ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG
/* Linear Congruential Generator */
-ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
{
/* implicit mod 2^32 */
rng = (1103515245*(rng) + 12345);
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 95b57404a77..2981f6ac566 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -155,7 +155,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
}
else {
Intersection isect;
- blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
+ blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
}
#ifdef __VOLUME__
@@ -205,7 +205,7 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
Intersection *isect = &isect_object;
#endif
- bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
+ bool blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
#ifdef __TRANSPARENT_SHADOWS__
if(blocked && kernel_data.integrator.transparent_shadows) {
@@ -221,7 +221,7 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
if(bounce >= kernel_data.integrator.transparent_max_bounce)
return true;
- if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
+ if(!scene_intersect(kg, *ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
{
#ifdef __VOLUME__
/* attenuation for last line segment towards light */
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index f61a9ec0fb1..0f3edcb7eaa 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -1153,7 +1153,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
tracedata->sd.osl_globals = sd->osl_globals;
/* raytrace */
- return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
+ return scene_intersect(sd->osl_globals, ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
}
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 257c6ad7491..6af65f88a02 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -42,6 +42,7 @@
#if defined(_WIN32) && !defined(FREE_WINDOWS)
#define ccl_device_inline static __forceinline
+#define ccl_device_forceinline static __forceinline
#define ccl_align(...) __declspec(align(__VA_ARGS__))
#ifdef __KERNEL_64_BIT__
#define ccl_try_align(...) __declspec(align(__VA_ARGS__))
@@ -56,6 +57,7 @@
#else
#define ccl_device_inline static inline __attribute__((always_inline))
+#define ccl_device_forceinline static inline __attribute__((always_inline))
#define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
#ifndef FREE_WINDOWS64
#define __forceinline inline __attribute__((always_inline))