diff options
author | Sergey Sharybin <sergey.vfx@gmail.com> | 2016-05-17 13:30:46 +0300 |
---|---|---|
committer | Sergey Sharybin <sergey.vfx@gmail.com> | 2016-05-18 11:14:24 +0300 |
commit | 7b356a856540a1affa5dc85360183418e6337a5a (patch) | |
tree | 9acee7019c696f694c97d504e1a2fe678a7f0cd1 /intern/cycles/device/device_cpu.cpp | |
parent | 2433a537fa12dad6cc8a1c323b1b73e5cad6cd4d (diff) |
Cycles: Reduce amount of malloc() calls from the kernel
This commit makes malloc() happen only once per volume and
once per transparent shadow query (per thread), improving the scalability of
the code across multiple CPU cores.
This is hard to measure on the low-end i7 currently available here, but quick
tests suggest volume sampling got about a 3-5% speedup.
The idea is to store allocated memory in kernel globals, which are per
thread on CPU already.
Reviewers: dingto, juicyfruit, lukasstockner97, maiself, brecht
Reviewed By: brecht
Subscribers: Blendify, nutel
Differential Revision: https://developer.blender.org/D1996
Diffstat (limited to 'intern/cycles/device/device_cpu.cpp')
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 45 |
1 files changed, 36 insertions, 9 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 676b1279a80..275ee028eb4 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -213,12 +213,7 @@ public: return; } - KernelGlobals kg = kernel_globals; - -#ifdef WITH_OSL - OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); -#endif - + KernelGlobals kg = thread_kernel_globals_init(); RenderTile tile; void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int); @@ -289,9 +284,7 @@ public: } } -#ifdef WITH_OSL - OSLShader::thread_free(&kg); -#endif + thread_kernel_globals_free(&kg); } void thread_film_convert(DeviceTask& task) @@ -481,6 +474,40 @@ public: { task_pool.cancel(); } + +protected: + inline KernelGlobals thread_kernel_globals_init() + { + KernelGlobals kg = kernel_globals; + kg.transparent_shadow_intersections = NULL; + const int decoupled_count = sizeof(kg.decoupled_volume_steps) / + sizeof(*kg.decoupled_volume_steps); + for(int i = 0; i < decoupled_count; ++i) { + kg.decoupled_volume_steps[i] = NULL; + } + kg.decoupled_volume_steps_index = 0; +#ifdef WITH_OSL + OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); +#endif + return kg; + } + + inline void thread_kernel_globals_free(KernelGlobals *kg) + { + if(kg->transparent_shadow_intersections != NULL) { + free(kg->transparent_shadow_intersections); + } + const int decoupled_count = sizeof(kg->decoupled_volume_steps) / + sizeof(*kg->decoupled_volume_steps); + for(int i = 0; i < decoupled_count; ++i) { + if(kg->decoupled_volume_steps[i] != NULL) { + free(kg->decoupled_volume_steps[i]); + } + } +#ifdef WITH_OSL + OSLShader::thread_free(kg); +#endif + } }; Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background) |