Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorSergey Sharybin <sergey.vfx@gmail.com>2016-09-21 18:46:25 +0300
committerSergey Sharybin <sergey.vfx@gmail.com>2017-02-08 16:00:48 +0300
commitdde40989f34634f43fb561416728c438dfb62f0b (patch)
tree024268c5b5dbc9a858850bb7a361570df3b042fa /intern
parent7447950bc3a3b11f0f1e0fd55df2031dbd3c0be2 (diff)
Cycles: Store shadow intersections in the kernel globals
Seems CUDA failed to de-duplicate the array across multiple inlined versions of the shadow_blocked(). Helped it a bit with that now. Gives about 100MB memory improvement on a scenes after previous commit and brings up memory "regression" to only 100MB comparing to the master branch now.
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/kernel/kernel_globals.h5
-rw-r--r--intern/cycles/kernel/kernel_shadow.h5
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel.cu27
3 files changed, 27 insertions, 10 deletions
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 8e66a3a0340..2b52a2d2f48 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -76,7 +76,10 @@ typedef struct KernelGlobals {
#ifdef __KERNEL_CUDA__
__constant__ KernelData __data;
-typedef struct KernelGlobals {} KernelGlobals;
+typedef struct KernelGlobals {
+ /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
+ Intersection hits_stack[64];
+} KernelGlobals;
# ifdef __KERNEL_CUDA_TEX_STORAGE__
# define KERNEL_TEX(type, ttype, name) ttype name;
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 05a6c7d1827..e69eac6ab83 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -109,8 +109,12 @@ ccl_device_inline bool shadow_blocked_all(KernelGlobals *kg,
/* Intersect to find an opaque surface, or record all transparent
* surface hits.
*/
+#ifdef __KERNEL_CUDA__
+ Intersection *hits = kg->hits_stack;
+#else
Intersection hits_stack[SHADOW_STACK_MAX_HITS];
Intersection *hits = hits_stack;
+#endif
const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
#ifndef __KERNEL_GPU__
@@ -247,6 +251,7 @@ ccl_device_noinline bool shadow_blocked_stepped(KernelGlobals *kg,
for(;;) {
if(bounce >= kernel_data.integrator.transparent_max_bounce) {
return true;
+ }
if(!scene_intersect(kg,
*ray,
PATH_RAY_SHADOW_TRANSPARENT,
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index eb2b6ea5414..090ab2c50c2 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -130,8 +130,10 @@ kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int s
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < sx + sw && y < sy + sh)
- kernel_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
+ if(x < sx + sw && y < sy + sh) {
+ KernelGlobals kg;
+ kernel_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
+ }
}
#ifdef __BRANCHED_PATH__
@@ -142,8 +144,10 @@ kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < sx + sw && y < sy + sh)
- kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
+ if(x < sx + sw && y < sy + sh) {
+ KernelGlobals kg;
+ kernel_branched_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
+ }
}
#endif
@@ -154,8 +158,9 @@ kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < sx + sw && y < sy + sh)
+ if(x < sx + sw && y < sy + sh) {
kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+ }
}
extern "C" __global__ void
@@ -165,8 +170,9 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < sx + sw && y < sy + sh)
+ if(x < sx + sw && y < sy + sh) {
kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+ }
}
extern "C" __global__ void
@@ -183,7 +189,8 @@ kernel_cuda_shader(uint4 *input,
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
if(x < sx + sw) {
- kernel_shader_evaluate(NULL,
+ KernelGlobals kg;
+ kernel_shader_evaluate(&kg,
input,
output,
output_luma,
@@ -200,8 +207,10 @@ kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- if(x < sx + sw)
- kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, filter, x, offset, sample);
+ if(x < sx + sw) {
+ KernelGlobals kg;
+ kernel_bake_evaluate(&kg, input, output, (ShaderEvalType)type, filter, x, offset, sample);
+ }
}
#endif