diff options
author | Patrick Mours <pmours@nvidia.com> | 2019-08-22 18:36:54 +0300 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2019-08-26 11:26:53 +0300 |
commit | f491c23f1e104998752380b930307e7abc4597b3 (patch) | |
tree | 7dcc876468baf526d4933fe0bfab6eed2f23cf4c | |
parent | 2b999c6a68f85523c46f39bb6a877baba2343d9b (diff) |
Cycles: inline more functions on the GPU
This makes little difference for CUDA and OpenCL, but will be helpful
for Optix.
-rw-r--r-- | intern/cycles/kernel/kernel_compat_cuda.h | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_compat_opencl.h | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_emission.h | 54 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_light.h | 20 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_path_branched.h | 16 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_path_surface.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_path_volume.h | 18 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_volume.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/svm/svm_attribute.h | 16 | ||||
-rw-r--r-- | intern/cycles/kernel/svm/svm_brick.h | 20 | ||||
-rw-r--r-- | intern/cycles/kernel/svm/svm_color_util.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/svm/svm_magic.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/svm/svm_musgrave.h | 19 | ||||
-rw-r--r-- | intern/cycles/kernel/svm/svm_noise.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/svm/svm_wave.h | 12 | ||||
-rw-r--r-- | intern/cycles/util/util_defines.h | 1 |
16 files changed, 80 insertions, 108 deletions
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 469b81d120b..5075c434b10 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -58,6 +58,7 @@ __device__ half __float2half(const float f) # define ccl_device_forceinline __device__ __forceinline__ #endif #define ccl_device_noinline __device__ __noinline__ +#define ccl_device_noinline_cpu ccl_device #define ccl_global #define ccl_static_constant __constant__ #define ccl_constant const diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index e040ea88d7c..1fe52c51ab0 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -35,6 +35,7 @@ #define ccl_device_inline ccl_device #define ccl_device_forceinline ccl_device #define ccl_device_noinline ccl_device ccl_noinline +#define ccl_device_noinline_cpu ccl_device #define ccl_may_alias #define ccl_static_constant static __constant #define ccl_constant __constant diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index be0a2bd2d6b..16d52b0c733 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -17,14 +17,14 @@ CCL_NAMESPACE_BEGIN /* Direction Emission */ -ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, - ShaderData *emission_sd, - LightSample *ls, - ccl_addr_space PathState *state, - float3 I, - differential3 dI, - float t, - float time) +ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg, + ShaderData *emission_sd, + LightSample *ls, + ccl_addr_space PathState *state, + float3 I, + differential3 dI, + float t, + float time) { /* setup shading at emitter */ float3 eval = make_float3(0.0f, 0.0f, 0.0f); @@ -98,15 +98,15 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, return eval; } -ccl_device_noinline bool direct_emission(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - LightSample *ls, - ccl_addr_space PathState *state, - Ray *ray, - BsdfEval *eval, - bool *is_lamp, - float rand_terminate) +ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg, + ShaderData *sd, + ShaderData *emission_sd, + LightSample *ls, + ccl_addr_space PathState *state, + Ray *ray, + BsdfEval *eval, + bool *is_lamp, + float rand_terminate) { if (ls->pdf == 0.0f) return false; @@ -208,7 +208,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, /* Indirect Primitive Emission */ -ccl_device_noinline float3 indirect_primitive_emission( +ccl_device_noinline_cpu float3 indirect_primitive_emission( KernelGlobals *kg, ShaderData *sd, float t, int path_flag, float bsdf_pdf) { /* evaluate emissive closure */ @@ -234,11 +234,11 @@ ccl_device_noinline float3 indirect_primitive_emission( /* Indirect Lamp Emission */ -ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, - ShaderData *emission_sd, - ccl_addr_space PathState *state, - Ray *ray, - float3 *emission) +ccl_device_noinline_cpu bool indirect_lamp_emission(KernelGlobals *kg, + ShaderData *emission_sd, + ccl_addr_space PathState *state, + Ray *ray, + float3 *emission) { bool hit_lamp = false; @@ -293,10 +293,10 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, /* Indirect Background */ -ccl_device_noinline float3 indirect_background(KernelGlobals *kg, - ShaderData *emission_sd, - ccl_addr_space PathState *state, - ccl_addr_space Ray *ray) +ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg, + ShaderData *emission_sd, + ccl_addr_space PathState *state, + ccl_addr_space Ray *ray) { #ifdef __BACKGROUND__ int shader = kernel_data.background.surface_shader; diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index 9128bfa9d95..758e91159b6 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -182,17 +182,7 @@ ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 #ifdef __BACKGROUND_MIS__ -/* TODO(sergey): In theory it should be all fine to use noinline for all - * devices, but we're so close to the release so better not screw things - * up for CPU at least. - */ -# ifdef __KERNEL_GPU__ -ccl_device_noinline -# else -ccl_device -# endif - float3 - background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf) +ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf) { /* for the following, the CDF values are actually a pair of floats, with the * function value as X and the actual CDF as Y. The last entry's function @@ -274,13 +264,7 @@ ccl_device /* TODO(sergey): Same as above, after the release we should consider using * 'noinline' for all devices. */ -# ifdef __KERNEL_GPU__ -ccl_device_noinline -# else -ccl_device -# endif - float - background_map_pdf(KernelGlobals *kg, float3 direction) +ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction) { float2 uv = direction_to_equirectangular(direction); int res_x = kernel_data.integrator.pdf_background_res_x; diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index 737a7c4aa84..ea6b23e7eb4 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -198,14 +198,14 @@ ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg, # endif /* __VOLUME__ */ /* bounce off surface and integrate indirect light */ -ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, - ShaderData *sd, - ShaderData *indirect_sd, - ShaderData *emission_sd, - float3 throughput, - float num_samples_adjust, - PathState *state, - PathRadiance *L) +ccl_device_noinline_cpu void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, + ShaderData *sd, + ShaderData *indirect_sd, + ShaderData *emission_sd, + float3 throughput, + float num_samples_adjust, + PathState *state, + PathRadiance *L) { float sum_sample_weight = 0.0f; # ifdef __DENOISING_FEATURES__ diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index a1ab4951565..07444a98d82 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -20,7 +20,7 @@ CCL_NAMESPACE_BEGIN defined(__BAKING__) /* branched path tracing: connect path directly to position on one or more lights and add it to L */ -ccl_device_noinline void kernel_branched_path_surface_connect_light( +ccl_device_noinline_cpu void kernel_branched_path_surface_connect_light( KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index fea4dfc159d..82975c2ad26 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -57,18 +57,12 @@ ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg, # endif /* __EMISSION__ */ } -# ifdef __KERNEL_GPU__ -ccl_device_noinline -# else -ccl_device -# endif - bool - kernel_path_volume_bounce(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space float3 *throughput, - ccl_addr_space PathState *state, - PathRadianceState *L_state, - ccl_addr_space Ray *ray) +ccl_device_noinline_cpu bool kernel_path_volume_bounce(KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space float3 *throughput, + ccl_addr_space PathState *state, + PathRadianceState *L_state, + ccl_addr_space Ray *ray) { /* sample phase function */ float phase_pdf; diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index b0b67efc7b2..2705526abe4 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -672,7 +672,7 @@ kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg, * ray, with the assumption that there are no surfaces blocking light * between the endpoints. distance sampling is used to decide if we will * scatter or not. */ -ccl_device_noinline VolumeIntegrateResult +ccl_device_noinline_cpu VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg, ccl_addr_space PathState *state, ShaderData *sd, diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h index 341eaad28ee..eaee0f9e4ee 100644 --- a/intern/cycles/kernel/svm/svm_attribute.h +++ b/intern/cycles/kernel/svm/svm_attribute.h @@ -80,13 +80,7 @@ ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, u } } -#ifndef __KERNEL_CUDA__ -ccl_device -#else -ccl_device_noinline -#endif - void - svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) { NodeAttributeType type = NODE_ATTR_FLOAT; uint out_offset = 0; @@ -125,13 +119,7 @@ ccl_device_noinline } } -#ifndef __KERNEL_CUDA__ -ccl_device -#else -ccl_device_noinline -#endif - void - svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) { NodeAttributeType type = NODE_ATTR_FLOAT; uint out_offset = 0; diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h index f1d74b7df96..6984afa30a5 100644 --- a/intern/cycles/kernel/svm/svm_brick.h +++ b/intern/cycles/kernel/svm/svm_brick.h @@ -27,16 +27,16 @@ ccl_device_inline float brick_noise(uint n) /* fast integer noise */ return 0.5f * ((float)nn / 1073741824.0f); } -ccl_device_noinline float2 svm_brick(float3 p, - float mortar_size, - float mortar_smooth, - float bias, - float brick_width, - float row_height, - float offset_amount, - int offset_frequency, - float squash_amount, - int squash_frequency) +ccl_device_noinline_cpu float2 svm_brick(float3 p, + float mortar_size, + float mortar_smooth, + float bias, + float brick_width, + float row_height, + float offset_amount, + int offset_frequency, + float squash_amount, + int squash_frequency) { int bricknum, rownum; float offset = 0.0f; diff --git a/intern/cycles/kernel/svm/svm_color_util.h b/intern/cycles/kernel/svm/svm_color_util.h index 12b59d2616b..3a6a5ba782f 100644 --- a/intern/cycles/kernel/svm/svm_color_util.h +++ b/intern/cycles/kernel/svm/svm_color_util.h @@ -264,7 +264,7 @@ ccl_device float3 svm_mix_clamp(float3 col) return outcol; } -ccl_device_noinline float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2) +ccl_device_noinline_cpu float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2) { float t = saturate(fac); diff --git a/intern/cycles/kernel/svm/svm_magic.h b/intern/cycles/kernel/svm/svm_magic.h index 6ba1a5817ad..9c160e6d8cc 100644 --- a/intern/cycles/kernel/svm/svm_magic.h +++ b/intern/cycles/kernel/svm/svm_magic.h @@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN /* Magic */ -ccl_device_noinline float3 svm_magic(float3 p, int n, float distortion) +ccl_device_noinline_cpu float3 svm_magic(float3 p, int n, float distortion) { float x = sinf((p.x + p.y + p.z) * 5.0f); float y = cosf((-p.x + p.y - p.z) * 5.0f); diff --git a/intern/cycles/kernel/svm/svm_musgrave.h b/intern/cycles/kernel/svm/svm_musgrave.h index 298155b795f..9291c7e7295 100644 --- a/intern/cycles/kernel/svm/svm_musgrave.h +++ b/intern/cycles/kernel/svm/svm_musgrave.h @@ -25,7 +25,10 @@ CCL_NAMESPACE_BEGIN * from "Texturing and Modelling: A procedural approach" */ -ccl_device_noinline float noise_musgrave_fBm(float3 p, float H, float lacunarity, float octaves) +ccl_device_noinline_cpu float noise_musgrave_fBm(float3 p, + float H, + float lacunarity, + float octaves) { float rmd; float value = 0.0f; @@ -53,10 +56,10 @@ ccl_device_noinline float noise_musgrave_fBm(float3 p, float H, float lacunarity * octaves: number of frequencies in the fBm */ -ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, - float H, - float lacunarity, - float octaves) +ccl_device_noinline_cpu float noise_musgrave_multi_fractal(float3 p, + float H, + float lacunarity, + float octaves) { float rmd; float value = 1.0f; @@ -85,7 +88,7 @@ ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, * offset: raises the terrain from `sea level' */ -ccl_device_noinline float noise_musgrave_hetero_terrain( +ccl_device_noinline_cpu float noise_musgrave_hetero_terrain( float3 p, float H, float lacunarity, float octaves, float offset) { float value, increment, rmd; @@ -121,7 +124,7 @@ ccl_device_noinline float noise_musgrave_hetero_terrain( * offset: raises the terrain from `sea level' */ -ccl_device_noinline float noise_musgrave_hybrid_multi_fractal( +ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal( float3 p, float H, float lacunarity, float octaves, float offset, float gain) { float result, signal, weight, rmd; @@ -159,7 +162,7 @@ ccl_device_noinline float noise_musgrave_hybrid_multi_fractal( * offset: raises the terrain from `sea level' */ -ccl_device_noinline float noise_musgrave_ridged_multi_fractal( +ccl_device_noinline_cpu float noise_musgrave_ridged_multi_fractal( float3 p, float H, float lacunarity, float octaves, float offset, float gain) { float result, signal, weight; diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h index 0bf3dfda4df..dd375af27e5 100644 --- a/intern/cycles/kernel/svm/svm_noise.h +++ b/intern/cycles/kernel/svm/svm_noise.h @@ -182,7 +182,7 @@ ccl_device_inline ssef scale3_sse(const ssef &result) #endif #ifndef __KERNEL_SSE2__ -ccl_device_noinline float perlin(float x, float y, float z) +ccl_device_noinline_cpu float perlin(float x, float y, float z) { int X; float fx = floorfrac(x, &X); diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h index 03b7f330970..baaa89ab0cb 100644 --- a/intern/cycles/kernel/svm/svm_wave.h +++ b/intern/cycles/kernel/svm/svm_wave.h @@ -18,12 +18,12 @@ CCL_NAMESPACE_BEGIN /* Wave */ -ccl_device_noinline float svm_wave(NodeWaveType type, - NodeWaveProfile profile, - float3 p, - float detail, - float distortion, - float dscale) +ccl_device_noinline_cpu float svm_wave(NodeWaveType type, + NodeWaveProfile profile, + float3 p, + float detail, + float distortion, + float dscale) { float n; diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h index 7f3bead0a18..760985447a8 100644 --- a/intern/cycles/util/util_defines.h +++ b/intern/cycles/util/util_defines.h @@ -30,6 +30,7 @@ # ifndef __KERNEL_GPU__ # define ccl_device static inline # define ccl_device_noinline static +# define ccl_device_noinline_cpu ccl_device_noinline # define ccl_global # define ccl_static_constant static const # define ccl_constant const |