From 6353ecb996898b4ce2fe8065130ed1f5ea3b6989 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Mon, 1 Aug 2016 15:40:46 +0200 Subject: Cycles: Tweaks to support CUDA 8 toolkit All the changes are mainly giving explicit tips on inlining functions, so they match how inlining worked with previous toolkit. This make kernel compiled by CUDA 8 render in average with same speed as previous kernels. Some scenes are somewhat faster, some of them are somewhat slower. But slowdown is within 1% so far. On a positive side it allows us to enable newer generation cards on buildbots (so GTX 10x0 will be officially supported soon). --- intern/cycles/device/device_cuda.cpp | 6 +-- intern/cycles/kernel/bvh/bvh_shadow_all.h | 15 +++++--- intern/cycles/kernel/bvh/bvh_subsurface.h | 17 ++++++--- intern/cycles/kernel/bvh/bvh_traversal.h | 21 +++++++---- intern/cycles/kernel/bvh/bvh_volume.h | 13 +++++-- intern/cycles/kernel/bvh/bvh_volume_all.h | 15 +++++--- intern/cycles/kernel/closure/bsdf.h | 10 ++++- .../cycles/kernel/closure/bsdf_ashikhmin_shirley.h | 6 ++- .../kernel/closure/bsdf_microfacet_multi_impl.h | 13 +++++-- intern/cycles/kernel/closure/bssrdf.h | 6 +-- intern/cycles/kernel/geom/geom_primitive.h | 17 +++++++-- intern/cycles/kernel/geom/geom_volume.h | 4 +- intern/cycles/kernel/kernel_bake.h | 8 +++- intern/cycles/kernel/kernel_camera.h | 13 +++++-- intern/cycles/kernel/kernel_compat_cuda.h | 2 +- intern/cycles/kernel/kernel_light.h | 32 +++++++++------- intern/cycles/kernel/kernel_path.h | 2 +- intern/cycles/kernel/kernel_path_surface.h | 9 ++++- intern/cycles/kernel/kernel_path_volume.h | 10 ++++- intern/cycles/kernel/kernel_projection.h | 15 +++++--- intern/cycles/kernel/kernel_shader.h | 43 ++++++++++++++-------- intern/cycles/kernel/kernel_subsurface.h | 8 +++- intern/cycles/kernel/kernel_volume.h | 12 +++++- intern/cycles/kernel/svm/svm_attribute.h | 5 ++- intern/cycles/kernel/svm/svm_geometry.h | 18 +++++++-- intern/cycles/kernel/svm/svm_ramp.h | 12 +++--- intern/cycles/kernel/svm/svm_ramp_util.h | 10 ++--- intern/cycles/kernel/svm/svm_tex_coord.h | 24 ++++++------ intern/cycles/util/util_math.h | 8 ++-- 29 files changed, 249 insertions(+), 125 deletions(-) (limited to 'intern') diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 2d404918a38..80f2644fa8c 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -321,11 +321,11 @@ public: return ""; } if(cuda_version < 60) { - printf("Unsupported CUDA version %d.%d detected, you need CUDA 7.5.\n", cuda_version/10, cuda_version%10); + printf("Unsupported CUDA version %d.%d detected, you need CUDA 7.5 or newer.\n", cuda_version/10, cuda_version%10); return ""; } - else if(cuda_version != 75) - printf("CUDA version %d.%d detected, build may succeed but only CUDA 7.5 is officially supported.\n", cuda_version/10, cuda_version%10); + else if(cuda_version != 75 && cuda_version != 80) + printf("CUDA version %d.%d detected, build may succeed but only CUDA 7.5 and 8.0 are officially supported.\n", cuda_version/10, cuda_version%10); /* Compile. */ string kernel = path_join(kernel_path, path_join("kernels", path_join("cuda", "kernel.cu"))); diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index 1d6fa303d3e..e9eeff31ecc 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -37,11 +37,16 @@ * */ -ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect_array, - const uint max_hits, - uint *num_hits) +#ifndef __KERNEL_GPU__ +ccl_device +#else +ccl_device_inline +#endif +bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const uint max_hits, + uint *num_hits) { /* todo: * - likely and unlikely for if() statements diff --git a/intern/cycles/kernel/bvh/bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h index 18978efcfa3..d9623c94b2e 100644 --- a/intern/cycles/kernel/bvh/bvh_subsurface.h +++ b/intern/cycles/kernel/bvh/bvh_subsurface.h @@ -35,12 +35,17 @@ * */ -ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - SubsurfaceIntersection *ss_isect, - int subsurface_object, - uint *lcg_state, - int max_hits) +#ifndef __KERNEL_GPU__ +ccl_device +#else +ccl_device_inline +#endif +void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + SubsurfaceIntersection *ss_isect, + int subsurface_object, + uint *lcg_state, + int max_hits) { /* todo: * - test if pushing distance on the stack helps (for non shadow rays) diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index 68a11b65ad7..b1a52968a26 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -40,16 +40,21 @@ * */ -ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect, - const uint visibility +#ifndef __KERNEL_GPU__ +ccl_device +#else +ccl_device_inline +#endif +bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - , uint *lcg_state, - float difl, - float extmax + , uint *lcg_state, + float difl, + float extmax #endif - ) + ) { /* todo: * - test if pushing distance on the stack helps (for non shadow rays) diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index 03499e94347..107373c17dc 100644 --- a/intern/cycles/kernel/bvh/bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -36,10 +36,15 @@ * */ -ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect, - const uint visibility) +#ifndef __KERNEL_GPU__ +ccl_device +#else +ccl_device_inline +#endif +bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility) { /* todo: * - test if pushing distance on the stack helps (for non shadow rays) diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index 7eddc2891d0..1f6515c9862 100644 --- a/intern/cycles/kernel/bvh/bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -36,11 +36,16 @@ * */ -ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect_array, - const uint max_hits, - const uint visibility) +#ifndef __KERNEL_GPU__ +ccl_device +#else +ccl_device_inline +#endif +uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const uint max_hits, + const uint visibility) { /* todo: * - test if pushing distance on the stack helps (for non shadow rays) diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index a251e3bdcf9..55bdf3ecbb4 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -36,7 +36,15 @@ CCL_NAMESPACE_BEGIN -ccl_device int bsdf_sample(KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float randu, float randv, float3 *eval, float3 *omega_in, differential3 *domega_in, float *pdf) +ccl_device_inline int bsdf_sample(KernelGlobals *kg, + ShaderData *sd, + const ShaderClosure *sc, + float randu, + float randv, + float3 *eval, + float3 *omega_in, + differential3 *domega_in, + float *pdf) { int label; diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h index 8ed76bea525..9929246ae5c 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h @@ -62,7 +62,11 @@ ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float rough return 2.0f / (roughness*roughness) - 2.0f; } -ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) +ccl_device_inline float3 bsdf_ashikhmin_shirley_eval_reflect( + const ShaderClosure *sc, + const float3 I, + const float3 omega_in, + float *pdf) { const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc; float3 N = bsdf->N; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h index afd4a8da62a..6ebe2f6a751 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h @@ -25,11 +25,18 @@ * energy is used. In combination with MIS, that is enough to produce an unbiased result, although * the balance heuristic isn't necessarily optimal anymore. */ -ccl_device float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi, float3 wo, const bool wo_outside, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint* lcg_state +ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)( + float3 wi, + float3 wo, + const bool wo_outside, + const float3 color, + const float alpha_x, + const float alpha_y, + ccl_addr_space uint *lcg_state #ifdef MF_MULTI_GLASS - , const float eta + , const float eta #elif defined(MF_MULTI_GLOSSY) - , float3 *n, float3 *k + , float3 *n, float3 *k #endif ) { diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index a260ae9a31b..35c95768b69 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -141,7 +141,7 @@ ccl_device float bssrdf_cubic_pdf(const ShaderClosure *sc, float r) } /* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */ -ccl_device float bssrdf_cubic_quintic_root_find(float xi) +ccl_device_inline float bssrdf_cubic_quintic_root_find(float xi) { /* newton-raphson iteration, usually succeeds in 2-4 iterations, except * outside 0.02 ... 0.98 where it can go up to 10, so overall performance @@ -255,7 +255,7 @@ ccl_device float bssrdf_burley_pdf(const ShaderClosure *sc, float r) * Returns scaled radius, meaning the result is to be scaled up by d. * Since there's no closed form solution we do Newton-Raphson method to find it. */ -ccl_device float bssrdf_burley_root_find(float xi) +ccl_device_inline float bssrdf_burley_root_find(float xi) { const float tolerance = 1e-6f; const int max_iteration_count = 10; @@ -389,7 +389,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float bssrdf_burley_sample(sc, xi, r, h); } -ccl_device float bssrdf_pdf(const ShaderClosure *sc, float r) +ccl_device_inline float bssrdf_pdf(const ShaderClosure *sc, float r) { if(sc->type == CLOSURE_BSSRDF_CUBIC_ID) return bssrdf_cubic_pdf(sc, r); diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h index 44734d1b70d..b16f0c9a99b 100644 --- a/intern/cycles/kernel/geom/geom_primitive.h +++ b/intern/cycles/kernel/geom/geom_primitive.h @@ -23,7 +23,11 @@ CCL_NAMESPACE_BEGIN /* Generic primitive attribute reading functions */ -ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy) +ccl_device_inline float primitive_attribute_float(KernelGlobals *kg, + const ShaderData *sd, + AttributeElement elem, + int offset, + float *dx, float *dy) { if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { if(subd_triangle_patch(kg, sd) == ~0) @@ -48,7 +52,12 @@ ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData * } } -ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy) +ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg, + const ShaderData *sd, + AttributeElement elem, + int offset, + float3 *dx, + float3 *dy) { if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { if(subd_triangle_patch(kg, sd) == ~0) @@ -75,7 +84,7 @@ ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData /* Default UV coordinate */ -ccl_device float3 primitive_uv(KernelGlobals *kg, ShaderData *sd) +ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd) { AttributeElement elem_uv; int offset_uv = find_attribute(kg, sd, ATTR_STD_UV, &elem_uv); @@ -144,7 +153,7 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) /* Motion vector for motion pass */ -ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) +ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) { /* center position */ float3 center; diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index 2044aafc877..7c8182bc430 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -44,7 +44,9 @@ ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z) } #endif /* __KERNEL_GPU__ */ -ccl_device float3 volume_normalized_position(KernelGlobals *kg, const ShaderData *sd, float3 P) +ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg, + const ShaderData *sd, + float3 P) { /* todo: optimize this so it's just a single matrix multiplication when * possible (not motion blur), or perhaps even just translation + scale */ diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index 9ee0b09529e..bfbf73df54f 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -18,8 +18,12 @@ CCL_NAMESPACE_BEGIN #ifdef __BAKING__ -ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng, - int pass_filter, int sample) +ccl_device_inline void compute_light_pass(KernelGlobals *kg, + ShaderData *sd, + PathRadiance *L, + RNG rng, + int pass_filter, + int sample) { /* initialize master radiance accumulator */ kernel_assert(kernel_data.film.use_light_pass); diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h index f6c103d59dd..88514de514c 100644 --- a/intern/cycles/kernel/kernel_camera.h +++ b/intern/cycles/kernel/kernel_camera.h @@ -211,7 +211,10 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl /* Panorama Camera */ -ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray) +ccl_device_inline void camera_sample_panorama(KernelGlobals *kg, + float raster_x, float raster_y, + float lens_u, float lens_v, + ccl_addr_space Ray *ray) { Transform rastertocamera = kernel_data.cam.rastertocamera; float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f)); @@ -303,8 +306,12 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float /* Common */ -ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v, - float lens_u, float lens_v, float time, ccl_addr_space Ray *ray) +ccl_device_inline void camera_sample(KernelGlobals *kg, + int x, int y, + float filter_u, float filter_v, + float lens_u, float lens_v, + float time, + ccl_addr_space Ray *ray) { /* pixel filter */ int filter_table_offset = kernel_data.film.filter_table_offset; diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 08f6f457805..fb5812ebcb8 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -36,7 +36,7 @@ /* Qualifier wrappers for different names on different devices */ #define ccl_device __device__ __inline__ -#define ccl_device_inline __device__ __inline__ +#define ccl_device_inline __device__ __forceinline__ #define ccl_device_noinline __device__ __noinline__ #define ccl_global #define ccl_constant diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index 93c4bd3f7d5..1e829eaa1fa 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -44,11 +44,11 @@ typedef struct LightSample { * * Note: light_p is modified when sample_coord is true. */ -ccl_device float area_light_sample(float3 P, - float3 *light_p, - float3 axisu, float3 axisv, - float randu, float randv, - bool sample_coord) +ccl_device_inline float area_light_sample(float3 P, + float3 *light_p, + float3 axisu, float3 axisv, + float randu, float randv, + bool sample_coord) { /* In our name system we're using P for the center, * which is o in the paper. @@ -268,11 +268,11 @@ ccl_device_inline bool background_portal_data_fetch_and_check_side(KernelGlobals return false; } -ccl_device float background_portal_pdf(KernelGlobals *kg, - float3 P, - float3 direction, - int ignore_portal, - bool *is_possible) +ccl_device_inline float background_portal_pdf(KernelGlobals *kg, + float3 P, + float3 direction, + int ignore_portal, + bool *is_possible) { float portal_pdf = 0.0f; @@ -367,7 +367,10 @@ ccl_device float3 background_portal_sample(KernelGlobals *kg, return make_float3(0.0f, 0.0f, 0.0f); } -ccl_device float3 background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf) +ccl_device_inline float3 background_light_sample(KernelGlobals *kg, + float3 P, + float randu, float randv, + float *pdf) { /* Probability of sampling portals instead of the map. */ float portal_sampling_pdf = kernel_data.integrator.portal_pdf; @@ -507,8 +510,11 @@ ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 return t*t/cos_pi; } -ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp, - float randu, float randv, float3 P, LightSample *ls) +ccl_device_inline void lamp_light_sample(KernelGlobals *kg, + int lamp, + float randu, float randv, + float3 P, + LightSample *ls) { float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0); float4 data1 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 1); diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index d5b31037723..1f08f3459e6 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -436,7 +436,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, #ifdef __SUBSURFACE__ -ccl_device bool kernel_path_subsurface_scatter( +ccl_device_inline bool kernel_path_subsurface_scatter( KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index 74b1ae0ca32..250b8e92a45 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -222,8 +222,13 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_ #endif /* path tracing: bounce off or through surface to with new direction stored in ray */ -ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, ccl_addr_space RNG *rng, - ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, ccl_addr_space Ray *ray) +ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, + ccl_addr_space RNG *rng, + ShaderData *sd, + ccl_addr_space float3 *throughput, + ccl_addr_space PathState *state, + PathRadiance *L, + ccl_addr_space Ray *ray) { /* no BSDF? we can stop here */ if(ccl_fetch(sd, flag) & SD_BSDF) { diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index e45522a4641..5fd4f2fad4c 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -18,8 +18,14 @@ CCL_NAMESPACE_BEGIN #ifdef __VOLUME_SCATTER__ -ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng, - ShaderData *sd, ShaderData *emission_sd, float3 throughput, PathState *state, PathRadiance *L) +ccl_device_inline void kernel_path_volume_connect_light( + KernelGlobals *kg, + RNG *rng, + ShaderData *sd, + ShaderData *emission_sd, + float3 throughput, + PathState *state, + PathRadiance *L) { #ifdef __EMISSION__ if(!kernel_data.integrator.use_direct_light) diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h index 8be6742699a..3437d83ed7d 100644 --- a/intern/cycles/kernel/kernel_projection.h +++ b/intern/cycles/kernel/kernel_projection.h @@ -130,7 +130,10 @@ ccl_device float2 direction_to_fisheye_equisolid(float3 dir, float lens, float w return make_float2(u, v); } -ccl_device float3 fisheye_equisolid_to_direction(float u, float v, float lens, float fov, float width, float height) +ccl_device_inline float3 fisheye_equisolid_to_direction(float u, float v, + float lens, + float fov, + float width, float height) { u = (u - 0.5f) * width; v = (v - 0.5f) * height; @@ -189,7 +192,7 @@ ccl_device float2 direction_to_mirrorball(float3 dir) return make_float2(u, v); } -ccl_device float3 panorama_to_direction(KernelGlobals *kg, float u, float v) +ccl_device_inline float3 panorama_to_direction(KernelGlobals *kg, float u, float v) { switch(kernel_data.cam.panorama_type) { case PANORAMA_EQUIRECTANGULAR: @@ -205,7 +208,7 @@ ccl_device float3 panorama_to_direction(KernelGlobals *kg, float u, float v) } } -ccl_device float2 direction_to_panorama(KernelGlobals *kg, float3 dir) +ccl_device_inline float2 direction_to_panorama(KernelGlobals *kg, float3 dir) { switch(kernel_data.cam.panorama_type) { case PANORAMA_EQUIRECTANGULAR: @@ -221,9 +224,9 @@ ccl_device float2 direction_to_panorama(KernelGlobals *kg, float3 dir) } } -ccl_device float3 spherical_stereo_position(KernelGlobals *kg, - float3 dir, - float3 pos) +ccl_device_inline float3 spherical_stereo_position(KernelGlobals *kg, + float3 dir, + float3 pos) { float interocular_offset = kernel_data.cam.interocular_offset; diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index b7641c37d93..bb3fe933b2c 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -149,8 +149,11 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, /* ShaderData setup from BSSRDF scatter */ #ifdef __SUBSURFACE__ -ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderData *sd, - const Intersection *isect, const Ray *ray) +ccl_device void shader_setup_from_subsurface( + KernelGlobals *kg, + ShaderData *sd, + const Intersection *isect, + const Ray *ray) { bool backfacing = sd->flag & SD_BACKFACING; @@ -226,14 +229,14 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat /* ShaderData setup from position sampled on mesh */ -ccl_device void shader_setup_from_sample(KernelGlobals *kg, - ShaderData *sd, - const float3 P, - const float3 Ng, - const float3 I, - int shader, int object, int prim, - float u, float v, float t, - float time) +ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, + ShaderData *sd, + const float3 P, + const float3 Ng, + const float3 I, + int shader, int object, int prim, + float u, float v, float t, + float time) { /* vectors */ ccl_fetch(sd, P) = P; @@ -445,7 +448,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s /* Merging */ #if defined(__BRANCHED_PATH__) || defined(__VOLUME__) -ccl_device void shader_merge_closures(ShaderData *sd) +ccl_device_inline void shader_merge_closures(ShaderData *sd) { /* merge identical closures, better when we sample a single closure at a time */ for(int i = 0; i < sd->num_closure; i++) { @@ -554,9 +557,13 @@ ccl_device void shader_bsdf_eval(KernelGlobals *kg, } } -ccl_device int shader_bsdf_sample(KernelGlobals *kg, ShaderData *sd, - float randu, float randv, BsdfEval *bsdf_eval, - float3 *omega_in, differential3 *domega_in, float *pdf) +ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, + ShaderData *sd, + float randu, float randv, + BsdfEval *bsdf_eval, + float3 *omega_in, + differential3 *domega_in, + float *pdf) { int sampled = 0; @@ -991,8 +998,12 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData * /* Volume Evaluation */ -ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd, - PathState *state, VolumeStack *stack, int path_flag, ShaderContext ctx) +ccl_device_inline void shader_eval_volume(KernelGlobals *kg, + ShaderData *sd, + PathState *state, + VolumeStack *stack, + int path_flag, + ShaderContext ctx) { /* reset closures once at the start, we will be accumulating the closures * for all volumes in the stack into a single array of closures */ diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index 28fa826fde7..61073155e70 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -85,7 +85,11 @@ ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, Sha return NULL; } -ccl_device float3 subsurface_scatter_eval(ShaderData *sd, ShaderClosure *sc, float disk_r, float r, bool all) +ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd, + ShaderClosure *sc, + float disk_r, + float r, + bool all) { #ifdef BSSRDF_MULTI_EVAL /* this is the veach one-sample model with balance heuristic, some pdf @@ -214,7 +218,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, /* Subsurface scattering step, from a point on the surface to other * nearby points on the same object. */ -ccl_device int subsurface_scatter_multi_intersect( +ccl_device_inline int subsurface_scatter_multi_intersect( KernelGlobals *kg, SubsurfaceIntersection* ss_isect, ShaderData *sd, diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index 01c87e6d89d..9dafed9afd1 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -36,7 +36,11 @@ typedef struct VolumeShaderCoefficients { } VolumeShaderCoefficients; /* evaluate shader to get extinction coefficient at P */ -ccl_device bool volume_shader_extinction_sample(KernelGlobals *kg, ShaderData *sd, PathState *state, float3 P, float3 *extinction) +ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg, + ShaderData *sd, + PathState *state, + float3 P, + float3 *extinction) { sd->P = P; shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW); @@ -58,7 +62,11 @@ ccl_device bool volume_shader_extinction_sample(KernelGlobals *kg, ShaderData *s } /* evaluate shader to get absorption, scattering and emission at P */ -ccl_device bool volume_shader_sample(KernelGlobals *kg, ShaderData *sd, PathState *state, float3 P, VolumeShaderCoefficients *coeff) +ccl_device_inline bool volume_shader_sample(KernelGlobals *kg, + ShaderData *sd, + PathState *state, + float3 P, + VolumeShaderCoefficients *coeff) { sd->P = P; shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, SHADER_CONTEXT_VOLUME); diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h index 6c557684099..ff92920c610 100644 --- a/intern/cycles/kernel/svm/svm_attribute.h +++ b/intern/cycles/kernel/svm/svm_attribute.h @@ -123,7 +123,10 @@ ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float * } } -ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { NodeAttributeType type, mesh_type; AttributeElement elem; diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index bb06254c3a9..7d512f7ff4d 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -18,7 +18,11 @@ CCL_NAMESPACE_BEGIN /* Geometry Node */ -ccl_device void svm_node_geometry(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device_inline void svm_node_geometry(KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint type, + uint out_offset) { float3 data; @@ -94,7 +98,11 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s /* Particle Info */ -ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device void svm_node_particle_info(KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint type, + uint out_offset) { switch(type) { case NODE_INFO_PAR_INDEX: { @@ -146,7 +154,11 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float /* Hair Info */ -ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device void svm_node_hair_info(KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint type, + uint out_offset) { float data; float3 data3; diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h index f959d90f309..368740f64c7 100644 --- a/intern/cycles/kernel/svm/svm_ramp.h +++ b/intern/cycles/kernel/svm/svm_ramp.h @@ -21,12 +21,12 @@ CCL_NAMESPACE_BEGIN /* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */ -ccl_device float4 rgb_ramp_lookup(KernelGlobals *kg, - int offset, - float f, - bool interpolate, - bool extrapolate, - int table_size) +ccl_device_inline float4 rgb_ramp_lookup(KernelGlobals *kg, + int offset, + float f, + bool interpolate, + bool extrapolate, + int table_size) { if((f < 0.0f || f > 1.0f) && extrapolate) { float4 t0, dy; diff --git a/intern/cycles/kernel/svm/svm_ramp_util.h b/intern/cycles/kernel/svm/svm_ramp_util.h index 495d98cf250..9f2ce1276f9 100644 --- a/intern/cycles/kernel/svm/svm_ramp_util.h +++ b/intern/cycles/kernel/svm/svm_ramp_util.h @@ -21,11 +21,11 @@ CCL_NAMESPACE_BEGIN /* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */ -ccl_device float3 rgb_ramp_lookup(const float3 *ramp, - float f, - bool interpolate, - bool extrapolate, - int table_size) +ccl_device_inline float3 rgb_ramp_lookup(const float3 *ramp, + float f, + bool interpolate, + bool extrapolate, + int table_size) { if ((f < 0.0f || f > 1.0f) && extrapolate) { float3 t0, dy; diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h index 276b6f26f5e..b39d6a3e009 100644 --- a/intern/cycles/kernel/svm/svm_tex_coord.h +++ b/intern/cycles/kernel/svm/svm_tex_coord.h @@ -99,12 +99,12 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, stack_store_float3(stack, out_offset, data); } -ccl_device_inline void svm_node_tex_coord_bump_dx(KernelGlobals *kg, - ShaderData *sd, - int path_flag, - float *stack, - uint4 node, - int *offset) +ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, + ShaderData *sd, + int path_flag, + float *stack, + uint4 node, + int *offset) { #ifdef __RAY_DIFFERENTIALS__ float3 data; @@ -184,12 +184,12 @@ ccl_device_inline void svm_node_tex_coord_bump_dx(KernelGlobals *kg, #endif } -ccl_device_inline void svm_node_tex_coord_bump_dy(KernelGlobals *kg, - ShaderData *sd, - int path_flag, - float *stack, - uint4 node, - int *offset) +ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, + ShaderData *sd, + int path_flag, + float *stack, + uint4 node, + int *offset) { #ifdef __RAY_DIFFERENTIALS__ float3 data; diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 016f4a6a794..13aba0646d2 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -1477,10 +1477,10 @@ ccl_device bool ray_triangle_intersect( return true; } -ccl_device bool ray_triangle_intersect_uv( - float3 ray_P, float3 ray_D, float ray_t, - float3 v0, float3 v1, float3 v2, - float *isect_u, float *isect_v, float *isect_t) +ccl_device_inline bool ray_triangle_intersect_uv( + float3 ray_P, float3 ray_D, float ray_t, + float3 v0, float3 v1, float3 v2, + float *isect_u, float *isect_v, float *isect_t) { /* Calculate intersection */ float3 e1 = v1 - v0; -- cgit v1.2.3