diff options
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 45 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_globals.h | 11 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_shadow.h | 18 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_volume.h | 30 |
4 files changed, 83 insertions, 21 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 676b1279a80..275ee028eb4 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -213,12 +213,7 @@ public: return; } - KernelGlobals kg = kernel_globals; - -#ifdef WITH_OSL - OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); -#endif - + KernelGlobals kg = thread_kernel_globals_init(); RenderTile tile; void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int); @@ -289,9 +284,7 @@ public: } } -#ifdef WITH_OSL - OSLShader::thread_free(&kg); -#endif + thread_kernel_globals_free(&kg); } void thread_film_convert(DeviceTask& task) @@ -481,6 +474,40 @@ public: { task_pool.cancel(); } + +protected: + inline KernelGlobals thread_kernel_globals_init() + { + KernelGlobals kg = kernel_globals; + kg.transparent_shadow_intersections = NULL; + const int decoupled_count = sizeof(kg.decoupled_volume_steps) / + sizeof(*kg.decoupled_volume_steps); + for(int i = 0; i < decoupled_count; ++i) { + kg.decoupled_volume_steps[i] = NULL; + } + kg.decoupled_volume_steps_index = 0; +#ifdef WITH_OSL + OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); +#endif + return kg; + } + + inline void thread_kernel_globals_free(KernelGlobals *kg) + { + if(kg->transparent_shadow_intersections != NULL) { + free(kg->transparent_shadow_intersections); + } + const int decoupled_count = sizeof(kg->decoupled_volume_steps) / + sizeof(*kg->decoupled_volume_steps); + for(int i = 0; i < decoupled_count; ++i) { + if(kg->decoupled_volume_steps[i] != NULL) { + free(kg->decoupled_volume_steps[i]); + } + } +#ifdef WITH_OSL + OSLShader::thread_free(kg); +#endif + } }; Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background) diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index c44ea1b051f..7e6cdf93fb9 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -31,6 +31,9 @@ struct OSLThreadData; struct OSLShadingSystem; # endif +struct Intersection; +struct VolumeStep; + typedef struct KernelGlobals { texture_image_uchar4 texture_byte4_images[TEX_NUM_BYTE4_IMAGES_CPU]; texture_image_float4 texture_float4_images[TEX_NUM_FLOAT4_IMAGES_CPU]; @@ -51,6 +54,14 @@ typedef struct KernelGlobals { OSLThreadData *osl_tdata; # endif + /* **** Run-time data **** */ + + /* Heap-allocated storage for transparent shadows intersections. */ + Intersection *transparent_shadow_intersections; + + /* Storage for decoupled volume steps. */ + VolumeStep *decoupled_volume_steps[2]; + int decoupled_volume_steps_index; } KernelGlobals; #endif /* __KERNEL_CPU__ */ diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index 3b1111e5069..504ac2e40bc 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -59,14 +59,20 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * /* intersect to find an opaque surface, or record all transparent surface hits */ Intersection hits_stack[STACK_MAX_HITS]; Intersection *hits = hits_stack; - uint max_hits = kernel_data.integrator.transparent_max_bounce - state->transparent_bounce - 1; + const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce; + uint max_hits = transparent_max_bounce - state->transparent_bounce - 1; /* prefer to use stack but use dynamic allocation if too deep max hits * we need max_hits + 1 storage space due to the logic in * scene_intersect_shadow_all which will first store and then check if * the limit is exceeded */ - if(max_hits + 1 > STACK_MAX_HITS) - hits = (Intersection*)malloc(sizeof(Intersection)*(max_hits + 1)); + if(max_hits + 1 > STACK_MAX_HITS) { + if(kg->transparent_shadow_intersections == NULL) { + kg->transparent_shadow_intersections = + (Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1)); + } + hits = kg->transparent_shadow_intersections; + } uint num_hits; blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits); @@ -147,14 +153,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * *shadow = throughput; - if(hits != hits_stack) - free(hits); return is_zero(throughput); } - - /* free dynamic storage */ - if(hits != hits_stack) - free(hits); } else { Intersection isect; diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index c499773b980..224c275b03d 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -627,12 +627,30 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta step_size = kernel_data.integrator.volume_step_size; /* compute exact steps in advance for malloc */ max_steps = max((int)ceilf(ray->t/step_size), 1); + /* NOTE: For the branched path tracing it's possible to have direct + * and indirect light integration both having volume segments allocated. + * We detect this using index in the pre-allocated memory. Currently we + * only support two segments allocated at a time, if more needed some + * modifications to the KernelGlobals will be needed. + * + * This gives us restrictions that decoupled record should only happen + * in the stack manner, meaning if there's subsequent call of decoupled + * record it'll need to free memory before it's caller frees memory. + */ + const int index = kg->decoupled_volume_steps_index; + assert(index < sizeof(kg->decoupled_volume_steps) / + sizeof(*kg->decoupled_volume_steps)); if(max_steps > global_max_steps) { max_steps = global_max_steps; step_size = ray->t / (float)max_steps; } - segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps); + if(kg->decoupled_volume_steps[index] == NULL) { + kg->decoupled_volume_steps[index] = + (VolumeStep*)malloc(sizeof(VolumeStep)*global_max_steps); + } + segment->steps = kg->decoupled_volume_steps[index]; random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size; + ++kg->decoupled_volume_steps_index; } else { max_steps = 1; @@ -745,8 +763,14 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment) { - if(segment->steps != &segment->stack_step) - free(segment->steps); + if(segment->steps != &segment->stack_step) { + /* NOTE: We only allow free last allocated segment. + * No random order of alloc/free is supported. + */ + assert(kg->decoupled_volume_steps_index > 0); + assert(segment->steps == kg->decoupled_volume_steps[kg->decoupled_volume_steps_index - 1]); + --kg->decoupled_volume_steps_index; + } } /* scattering for homogeneous and heterogeneous volumes, using decoupled ray |