diff options
author | Sergey Sharybin <sergey.vfx@gmail.com> | 2016-08-01 16:40:46 +0300 |
---|---|---|
committer | Sergey Sharybin <sergey.vfx@gmail.com> | 2016-08-01 16:54:29 +0300 |
commit | 6353ecb996898b4ce2fe8065130ed1f5ea3b6989 (patch) | |
tree | b6d620152e4ff7920465d8396fe443dc9b3ffc56 /intern | |
parent | 7065022f7aa23ba13d2999e1e40162a8f480af0e (diff) |
Cycles: Tweaks to support CUDA 8 toolkit
All the changes are mainly giving explicit tips on inlining functions,
so they match how inlining worked with previous toolkit.
This make kernel compiled by CUDA 8 render in average with same speed
as previous kernels. Some scenes are somewhat faster, some of them are
somewhat slower. But slowdown is within 1% so far.
On a positive side it allows us to enable newer generation cards on
buildbots (so GTX 10x0 will be officially supported soon).
Diffstat (limited to 'intern')
29 files changed, 249 insertions, 125 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 2d404918a38..80f2644fa8c 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -321,11 +321,11 @@ public: return ""; } if(cuda_version < 60) { - printf("Unsupported CUDA version %d.%d detected, you need CUDA 7.5.\n", cuda_version/10, cuda_version%10); + printf("Unsupported CUDA version %d.%d detected, you need CUDA 7.5 or newer.\n", cuda_version/10, cuda_version%10); return ""; } - else if(cuda_version != 75) - printf("CUDA version %d.%d detected, build may succeed but only CUDA 7.5 is officially supported.\n", cuda_version/10, cuda_version%10); + else if(cuda_version != 75 && cuda_version != 80) + printf("CUDA version %d.%d detected, build may succeed but only CUDA 7.5 and 8.0 are officially supported.\n", cuda_version/10, cuda_version%10); /* Compile. */ string kernel = path_join(kernel_path, path_join("kernels", path_join("cuda", "kernel.cu"))); diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index 1d6fa303d3e..e9eeff31ecc 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -37,11 +37,16 @@ * */ -ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect_array, - const uint max_hits, - uint *num_hits) +#ifndef __KERNEL_GPU__ +ccl_device +#else +ccl_device_inline +#endif +bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const uint max_hits, + uint *num_hits) { /* todo: * - likely and unlikely for if() statements diff --git a/intern/cycles/kernel/bvh/bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h index 18978efcfa3..d9623c94b2e 100644 --- a/intern/cycles/kernel/bvh/bvh_subsurface.h +++ b/intern/cycles/kernel/bvh/bvh_subsurface.h @@ -35,12 +35,17 @@ * */ -ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - SubsurfaceIntersection *ss_isect, - int subsurface_object, - uint *lcg_state, - int max_hits) +#ifndef __KERNEL_GPU__ +ccl_device +#else +ccl_device_inline +#endif +void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + SubsurfaceIntersection *ss_isect, + int subsurface_object, + uint *lcg_state, + int max_hits) { /* todo: * - test if pushing distance on the stack helps (for non shadow rays) diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index 68a11b65ad7..b1a52968a26 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -40,16 +40,21 @@ * */ -ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect, - const uint visibility +#ifndef __KERNEL_GPU__ +ccl_device +#else +ccl_device_inline +#endif +bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - , uint *lcg_state, - float difl, - float extmax + , uint *lcg_state, + float difl, + float extmax #endif - ) + ) { /* todo: * - test if pushing distance on the stack helps (for non shadow rays) diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index 03499e94347..107373c17dc 100644 --- a/intern/cycles/kernel/bvh/bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -36,10 +36,15 @@ * */ -ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect, - const uint visibility) +#ifndef __KERNEL_GPU__ +ccl_device +#else +ccl_device_inline +#endif +bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility) { /* todo: * - test if pushing distance on the stack helps (for non shadow rays) diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index 7eddc2891d0..1f6515c9862 100644 --- a/intern/cycles/kernel/bvh/bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -36,11 +36,16 @@ * */ -ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect_array, - const uint max_hits, - const uint visibility) +#ifndef __KERNEL_GPU__ +ccl_device +#else +ccl_device_inline +#endif +uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect_array, + const uint max_hits, + const uint visibility) { /* todo: * - test if pushing distance on the stack helps (for non shadow rays) diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index a251e3bdcf9..55bdf3ecbb4 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -36,7 +36,15 @@ CCL_NAMESPACE_BEGIN -ccl_device int bsdf_sample(KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float randu, float randv, float3 *eval, float3 *omega_in, differential3 *domega_in, float *pdf) +ccl_device_inline int bsdf_sample(KernelGlobals *kg, + ShaderData *sd, + const ShaderClosure *sc, + float randu, + float randv, + float3 *eval, + float3 *omega_in, + differential3 *domega_in, + float *pdf) { int label; diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h index 8ed76bea525..9929246ae5c 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h @@ -62,7 +62,11 @@ ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float rough return 2.0f / (roughness*roughness) - 2.0f; } -ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) +ccl_device_inline float3 bsdf_ashikhmin_shirley_eval_reflect( + const ShaderClosure *sc, + const float3 I, + const float3 omega_in, + float *pdf) { const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc; float3 N = bsdf->N; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h index afd4a8da62a..6ebe2f6a751 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h @@ -25,11 +25,18 @@ * energy is used. In combination with MIS, that is enough to produce an unbiased result, although * the balance heuristic isn't necessarily optimal anymore. */ -ccl_device float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi, float3 wo, const bool wo_outside, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint* lcg_state +ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)( + float3 wi, + float3 wo, + const bool wo_outside, + const float3 color, + const float alpha_x, + const float alpha_y, + ccl_addr_space uint *lcg_state #ifdef MF_MULTI_GLASS - , const float eta + , const float eta #elif defined(MF_MULTI_GLOSSY) - , float3 *n, float3 *k + , float3 *n, float3 *k #endif ) { diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index a260ae9a31b..35c95768b69 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -141,7 +141,7 @@ ccl_device float bssrdf_cubic_pdf(const ShaderClosure *sc, float r) } /* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */ -ccl_device float bssrdf_cubic_quintic_root_find(float xi) +ccl_device_inline float bssrdf_cubic_quintic_root_find(float xi) { /* newton-raphson iteration, usually succeeds in 2-4 iterations, except * outside 0.02 ... 0.98 where it can go up to 10, so overall performance @@ -255,7 +255,7 @@ ccl_device float bssrdf_burley_pdf(const ShaderClosure *sc, float r) * Returns scaled radius, meaning the result is to be scaled up by d. * Since there's no closed form solution we do Newton-Raphson method to find it. */ -ccl_device float bssrdf_burley_root_find(float xi) +ccl_device_inline float bssrdf_burley_root_find(float xi) { const float tolerance = 1e-6f; const int max_iteration_count = 10; @@ -389,7 +389,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float bssrdf_burley_sample(sc, xi, r, h); } -ccl_device float bssrdf_pdf(const ShaderClosure *sc, float r) +ccl_device_inline float bssrdf_pdf(const ShaderClosure *sc, float r) { if(sc->type == CLOSURE_BSSRDF_CUBIC_ID) return bssrdf_cubic_pdf(sc, r); diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h index 44734d1b70d..b16f0c9a99b 100644 --- a/intern/cycles/kernel/geom/geom_primitive.h +++ b/intern/cycles/kernel/geom/geom_primitive.h @@ -23,7 +23,11 @@ CCL_NAMESPACE_BEGIN /* Generic primitive attribute reading functions */ -ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy) +ccl_device_inline float primitive_attribute_float(KernelGlobals *kg, + const ShaderData *sd, + AttributeElement elem, + int offset, + float *dx, float *dy) { if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { if(subd_triangle_patch(kg, sd) == ~0) @@ -48,7 +52,12 @@ ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData * } } -ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy) +ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg, + const ShaderData *sd, + AttributeElement elem, + int offset, + float3 *dx, + float3 *dy) { if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { if(subd_triangle_patch(kg, sd) == ~0) @@ -75,7 +84,7 @@ ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData /* Default UV coordinate */ -ccl_device float3 primitive_uv(KernelGlobals *kg, ShaderData *sd) +ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd) { AttributeElement elem_uv; int offset_uv = find_attribute(kg, sd, ATTR_STD_UV, &elem_uv); @@ -144,7 +153,7 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) /* Motion vector for motion pass */ -ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) +ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) { /* center position */ float3 center; diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index 2044aafc877..7c8182bc430 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -44,7 +44,9 @@ ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z) } #endif /* __KERNEL_GPU__ */ -ccl_device float3 volume_normalized_position(KernelGlobals *kg, const ShaderData *sd, float3 P) +ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg, + const ShaderData *sd, + float3 P) { /* todo: optimize this so it's just a single matrix multiplication when * possible (not motion blur), or perhaps even just translation + scale */ diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index 9ee0b09529e..bfbf73df54f 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -18,8 +18,12 @@ CCL_NAMESPACE_BEGIN #ifdef __BAKING__ -ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng, - int pass_filter, int sample) +ccl_device_inline void compute_light_pass(KernelGlobals *kg, + ShaderData *sd, + PathRadiance *L, + RNG rng, + int pass_filter, + int sample) { /* initialize master radiance accumulator */ kernel_assert(kernel_data.film.use_light_pass); diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h index f6c103d59dd..88514de514c 100644 --- a/intern/cycles/kernel/kernel_camera.h +++ b/intern/cycles/kernel/kernel_camera.h @@ -211,7 +211,10 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl /* Panorama Camera */ -ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray) +ccl_device_inline void camera_sample_panorama(KernelGlobals *kg, + float raster_x, float raster_y, + float lens_u, float lens_v, + ccl_addr_space Ray *ray) { Transform rastertocamera = kernel_data.cam.rastertocamera; float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f)); @@ -303,8 +306,12 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float /* Common */ -ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v, - float lens_u, float lens_v, float time, ccl_addr_space Ray *ray) +ccl_device_inline void camera_sample(KernelGlobals *kg, + int x, int y, + float filter_u, float filter_v, + float lens_u, float lens_v, + float time, + ccl_addr_space Ray *ray) { /* pixel filter */ int filter_table_offset = kernel_data.film.filter_table_offset; diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 08f6f457805..fb5812ebcb8 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -36,7 +36,7 @@ /* Qualifier wrappers for different names on different devices */ #define ccl_device __device__ __inline__ -#define ccl_device_inline __device__ __inline__ +#define ccl_device_inline __device__ __forceinline__ #define ccl_device_noinline __device__ __noinline__ #define ccl_global #define ccl_constant diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index 93c4bd3f7d5..1e829eaa1fa 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -44,11 +44,11 @@ typedef struct LightSample { * * Note: light_p is modified when sample_coord is true. */ -ccl_device float area_light_sample(float3 P, - float3 *light_p, - float3 axisu, float3 axisv, - float randu, float randv, - bool sample_coord) +ccl_device_inline float area_light_sample(float3 P, + float3 *light_p, + float3 axisu, float3 axisv, + float randu, float randv, + bool sample_coord) { /* In our name system we're using P for the center, * which is o in the paper. @@ -268,11 +268,11 @@ ccl_device_inline bool background_portal_data_fetch_and_check_side(KernelGlobals return false; } -ccl_device float background_portal_pdf(KernelGlobals *kg, - float3 P, - float3 direction, - int ignore_portal, - bool *is_possible) +ccl_device_inline float background_portal_pdf(KernelGlobals *kg, + float3 P, + float3 direction, + int ignore_portal, + bool *is_possible) { float portal_pdf = 0.0f; @@ -367,7 +367,10 @@ ccl_device float3 background_portal_sample(KernelGlobals *kg, return make_float3(0.0f, 0.0f, 0.0f); } -ccl_device float3 background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf) +ccl_device_inline float3 background_light_sample(KernelGlobals *kg, + float3 P, + float randu, float randv, + float *pdf) { /* Probability of sampling portals instead of the map. */ float portal_sampling_pdf = kernel_data.integrator.portal_pdf; @@ -507,8 +510,11 @@ ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 return t*t/cos_pi; } -ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp, - float randu, float randv, float3 P, LightSample *ls) +ccl_device_inline void lamp_light_sample(KernelGlobals *kg, + int lamp, + float randu, float randv, + float3 P, + LightSample *ls) { float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0); float4 data1 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 1); diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index d5b31037723..1f08f3459e6 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -436,7 +436,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, #ifdef __SUBSURFACE__ -ccl_device bool kernel_path_subsurface_scatter( +ccl_device_inline bool kernel_path_subsurface_scatter( KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index 74b1ae0ca32..250b8e92a45 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -222,8 +222,13 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_ #endif /* path tracing: bounce off or through surface to with new direction stored in ray */ -ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, ccl_addr_space RNG *rng, - ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, ccl_addr_space Ray *ray) +ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, + ccl_addr_space RNG *rng, + ShaderData *sd, + ccl_addr_space float3 *throughput, + ccl_addr_space PathState *state, + PathRadiance *L, + ccl_addr_space Ray *ray) { /* no BSDF? we can stop here */ if(ccl_fetch(sd, flag) & SD_BSDF) { diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index e45522a4641..5fd4f2fad4c 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -18,8 +18,14 @@ CCL_NAMESPACE_BEGIN #ifdef __VOLUME_SCATTER__ -ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng, - ShaderData *sd, ShaderData *emission_sd, float3 throughput, PathState *state, PathRadiance *L) +ccl_device_inline void kernel_path_volume_connect_light( + KernelGlobals *kg, + RNG *rng, + ShaderData *sd, + ShaderData *emission_sd, + float3 throughput, + PathState *state, + PathRadiance *L) { #ifdef __EMISSION__ if(!kernel_data.integrator.use_direct_light) diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h index 8be6742699a..3437d83ed7d 100644 --- a/intern/cycles/kernel/kernel_projection.h +++ b/intern/cycles/kernel/kernel_projection.h @@ -130,7 +130,10 @@ ccl_device float2 direction_to_fisheye_equisolid(float3 dir, float lens, float w return make_float2(u, v); } -ccl_device float3 fisheye_equisolid_to_direction(float u, float v, float lens, float fov, float width, float height) +ccl_device_inline float3 fisheye_equisolid_to_direction(float u, float v, + float lens, + float fov, + float width, float height) { u = (u - 0.5f) * width; v = (v - 0.5f) * height; @@ -189,7 +192,7 @@ ccl_device float2 direction_to_mirrorball(float3 dir) return make_float2(u, v); } -ccl_device float3 panorama_to_direction(KernelGlobals *kg, float u, float v) +ccl_device_inline float3 panorama_to_direction(KernelGlobals *kg, float u, float v) { switch(kernel_data.cam.panorama_type) { case PANORAMA_EQUIRECTANGULAR: @@ -205,7 +208,7 @@ ccl_device float3 panorama_to_direction(KernelGlobals *kg, float u, float v) } } -ccl_device float2 direction_to_panorama(KernelGlobals *kg, float3 dir) +ccl_device_inline float2 direction_to_panorama(KernelGlobals *kg, float3 dir) { switch(kernel_data.cam.panorama_type) { case PANORAMA_EQUIRECTANGULAR: @@ -221,9 +224,9 @@ ccl_device float2 direction_to_panorama(KernelGlobals *kg, float3 dir) } } -ccl_device float3 spherical_stereo_position(KernelGlobals *kg, - float3 dir, - float3 pos) +ccl_device_inline float3 spherical_stereo_position(KernelGlobals *kg, + float3 dir, + float3 pos) { float interocular_offset = kernel_data.cam.interocular_offset; diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index b7641c37d93..bb3fe933b2c 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -149,8 +149,11 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, /* ShaderData setup from BSSRDF scatter */ #ifdef __SUBSURFACE__ -ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderData *sd, - const Intersection *isect, const Ray *ray) +ccl_device void shader_setup_from_subsurface( + KernelGlobals *kg, + ShaderData *sd, + const Intersection *isect, + const Ray *ray) { bool backfacing = sd->flag & SD_BACKFACING; @@ -226,14 +229,14 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat /* ShaderData setup from position sampled on mesh */ -ccl_device void shader_setup_from_sample(KernelGlobals *kg, - ShaderData *sd, - const float3 P, - const float3 Ng, - const float3 I, - int shader, int object, int prim, - float u, float v, float t, - float time) +ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, + ShaderData *sd, + const float3 P, + const float3 Ng, + const float3 I, + int shader, int object, int prim, + float u, float v, float t, + float time) { /* vectors */ ccl_fetch(sd, P) = P; @@ -445,7 +448,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s /* Merging */ #if defined(__BRANCHED_PATH__) || defined(__VOLUME__) -ccl_device void shader_merge_closures(ShaderData *sd) +ccl_device_inline void shader_merge_closures(ShaderData *sd) { /* merge identical closures, better when we sample a single closure at a time */ for(int i = 0; i < sd->num_closure; i++) { @@ -554,9 +557,13 @@ ccl_device void shader_bsdf_eval(KernelGlobals *kg, } } -ccl_device int shader_bsdf_sample(KernelGlobals *kg, ShaderData *sd, - float randu, float randv, BsdfEval *bsdf_eval, - float3 *omega_in, differential3 *domega_in, float *pdf) +ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, + ShaderData *sd, + float randu, float randv, + BsdfEval *bsdf_eval, + float3 *omega_in, + differential3 *domega_in, + float *pdf) { int sampled = 0; @@ -991,8 +998,12 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData * /* Volume Evaluation */ -ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd, - PathState *state, VolumeStack *stack, int path_flag, ShaderContext ctx) +ccl_device_inline void shader_eval_volume(KernelGlobals *kg, + ShaderData *sd, + PathState *state, + VolumeStack *stack, + int path_flag, + ShaderContext ctx) { /* reset closures once at the start, we will be accumulating the closures * for all volumes in the stack into a single array of closures */ diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index 28fa826fde7..61073155e70 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -85,7 +85,11 @@ ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, Sha return NULL; } -ccl_device float3 subsurface_scatter_eval(ShaderData *sd, ShaderClosure *sc, float disk_r, float r, bool all) +ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd, + ShaderClosure *sc, + float disk_r, + float r, + bool all) { #ifdef BSSRDF_MULTI_EVAL /* this is the veach one-sample model with balance heuristic, some pdf @@ -214,7 +218,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, /* Subsurface scattering step, from a point on the surface to other * nearby points on the same object. */ -ccl_device int subsurface_scatter_multi_intersect( +ccl_device_inline int subsurface_scatter_multi_intersect( KernelGlobals *kg, SubsurfaceIntersection* ss_isect, ShaderData *sd, diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index 01c87e6d89d..9dafed9afd1 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -36,7 +36,11 @@ typedef struct VolumeShaderCoefficients { } VolumeShaderCoefficients; /* evaluate shader to get extinction coefficient at P */ -ccl_device bool volume_shader_extinction_sample(KernelGlobals *kg, ShaderData *sd, PathState *state, float3 P, float3 *extinction) +ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg, + ShaderData *sd, + PathState *state, + float3 P, + float3 *extinction) { sd->P = P; shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW); @@ -58,7 +62,11 @@ ccl_device bool volume_shader_extinction_sample(KernelGlobals *kg, ShaderData *s } /* evaluate shader to get absorption, scattering and emission at P */ -ccl_device bool volume_shader_sample(KernelGlobals *kg, ShaderData *sd, PathState *state, float3 P, VolumeShaderCoefficients *coeff) +ccl_device_inline bool volume_shader_sample(KernelGlobals *kg, + ShaderData *sd, + PathState *state, + float3 P, + VolumeShaderCoefficients *coeff) { sd->P = P; shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, SHADER_CONTEXT_VOLUME); diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h index 6c557684099..ff92920c610 100644 --- a/intern/cycles/kernel/svm/svm_attribute.h +++ b/intern/cycles/kernel/svm/svm_attribute.h @@ -123,7 +123,10 @@ ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float * } } -ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { NodeAttributeType type, mesh_type; AttributeElement elem; diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index bb06254c3a9..7d512f7ff4d 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -18,7 +18,11 @@ CCL_NAMESPACE_BEGIN /* Geometry Node */ -ccl_device void svm_node_geometry(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device_inline void svm_node_geometry(KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint type, + uint out_offset) { float3 data; @@ -94,7 +98,11 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s /* Particle Info */ -ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device void svm_node_particle_info(KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint type, + uint out_offset) { switch(type) { case NODE_INFO_PAR_INDEX: { @@ -146,7 +154,11 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float /* Hair Info */ -ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device void svm_node_hair_info(KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint type, + uint out_offset) { float data; float3 data3; diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h index f959d90f309..368740f64c7 100644 --- a/intern/cycles/kernel/svm/svm_ramp.h +++ b/intern/cycles/kernel/svm/svm_ramp.h @@ -21,12 +21,12 @@ CCL_NAMESPACE_BEGIN /* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */ -ccl_device float4 rgb_ramp_lookup(KernelGlobals *kg, - int offset, - float f, - bool interpolate, - bool extrapolate, - int table_size) +ccl_device_inline float4 rgb_ramp_lookup(KernelGlobals *kg, + int offset, + float f, + bool interpolate, + bool extrapolate, + int table_size) { if((f < 0.0f || f > 1.0f) && extrapolate) { float4 t0, dy; diff --git a/intern/cycles/kernel/svm/svm_ramp_util.h b/intern/cycles/kernel/svm/svm_ramp_util.h index 495d98cf250..9f2ce1276f9 100644 --- a/intern/cycles/kernel/svm/svm_ramp_util.h +++ b/intern/cycles/kernel/svm/svm_ramp_util.h @@ -21,11 +21,11 @@ CCL_NAMESPACE_BEGIN /* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */ -ccl_device float3 rgb_ramp_lookup(const float3 *ramp, - float f, - bool interpolate, - bool extrapolate, - int table_size) +ccl_device_inline float3 rgb_ramp_lookup(const float3 *ramp, + float f, + bool interpolate, + bool extrapolate, + int table_size) { if ((f < 0.0f || f > 1.0f) && extrapolate) { float3 t0, dy; diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h index 276b6f26f5e..b39d6a3e009 100644 --- a/intern/cycles/kernel/svm/svm_tex_coord.h +++ b/intern/cycles/kernel/svm/svm_tex_coord.h @@ -99,12 +99,12 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, stack_store_float3(stack, out_offset, data); } -ccl_device_inline void svm_node_tex_coord_bump_dx(KernelGlobals *kg, - ShaderData *sd, - int path_flag, - float *stack, - uint4 node, - int *offset) +ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, + ShaderData *sd, + int path_flag, + float *stack, + uint4 node, + int *offset) { #ifdef __RAY_DIFFERENTIALS__ float3 data; @@ -184,12 +184,12 @@ ccl_device_inline void svm_node_tex_coord_bump_dx(KernelGlobals *kg, #endif } -ccl_device_inline void svm_node_tex_coord_bump_dy(KernelGlobals *kg, - ShaderData *sd, - int path_flag, - float *stack, - uint4 node, - int *offset) +ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, + ShaderData *sd, + int path_flag, + float *stack, + uint4 node, + int *offset) { #ifdef __RAY_DIFFERENTIALS__ float3 data; diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 016f4a6a794..13aba0646d2 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -1477,10 +1477,10 @@ ccl_device bool ray_triangle_intersect( return true; } -ccl_device bool ray_triangle_intersect_uv( - float3 ray_P, float3 ray_D, float ray_t, - float3 v0, float3 v1, float3 v2, - float *isect_u, float *isect_v, float *isect_t) +ccl_device_inline bool ray_triangle_intersect_uv( + float3 ray_P, float3 ray_D, float ray_t, + float3 v0, float3 v1, float3 v2, + float *isect_u, float *isect_v, float *isect_t) { /* Calculate intersection */ float3 e1 = v1 - v0; |