diff options
26 files changed, 837 insertions, 36 deletions
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 75c7dcee05e..30ae0bc813d 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -516,7 +516,7 @@ void BlenderSession::render() if(!b_engine.is_preview() && background && print_render_stats) { RenderStats stats; - session->scene->collect_statistics(&stats); + session->collect_statistics(&stats); printf("Render statistics:\n%s\n", stats.full_report().c_str()); } diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 832847c179f..49b046d0a88 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -874,7 +874,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine, } /* tiles */ - if(params.device.type != DEVICE_CPU && !background) { + const bool is_cpu = (params.device.type == DEVICE_CPU); + if(!is_cpu && !background) { /* currently GPU could be much slower than CPU when using tiles, * still need to be investigated, but meanwhile make it possible * to work in viewport smoothly @@ -960,6 +961,9 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine, params.progressive_update_timeout = 0.1; } + params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && + background && BlenderSession::print_render_stats; + return params; } diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 7e20bb449c3..54ffd4bc4df 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -362,6 +362,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int th info.has_half_images = true; info.has_volume_decoupled = true; info.has_osl = true; + info.has_profiling = true; foreach(const DeviceInfo &device, subdevices) { /* Ensure CPU device does not slow down GPU. 
*/ @@ -396,6 +397,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int th info.has_half_images &= device.has_half_images; info.has_volume_decoupled &= device.has_volume_decoupled; info.has_osl &= device.has_osl; + info.has_profiling &= device.has_profiling; } return info; diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index f3fb338e638..071f61a7566 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -60,6 +60,7 @@ public: bool has_volume_decoupled; /* Decoupled volume shading. */ bool has_osl; /* Support Open Shading Language. */ bool use_split_kernel; /* Use split or mega kernel. */ + bool has_profiling; /* Supports runtime collection of profiling info. */ int cpu_threads; vector<DeviceInfo> multi_devices; @@ -75,6 +76,7 @@ public: has_volume_decoupled = false; has_osl = false; use_split_kernel = false; + has_profiling = false; } bool operator==(const DeviceInfo &info) { diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 36fe9bfc92b..f0a6fd6e3f4 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -477,6 +477,8 @@ public: bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, DenoisingTask *task) { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS); + int4 rect = task->rect; int r = task->nlm_state.r; int f = task->nlm_state.f; @@ -529,6 +531,8 @@ public: bool denoising_construct_transform(DenoisingTask *task) { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM); + for(int y = 0; y < task->filter_area.w; y++) { for(int x = 0; x < task->filter_area.z; x++) { filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, @@ -551,6 +555,8 @@ public: device_ptr output_ptr, DenoisingTask *task) { + ProfilingHelper profiling(task->profiler, 
PROFILING_DENOISING_RECONSTRUCT); + mem_zero(task->storage.XtWX); mem_zero(task->storage.XtWY); @@ -609,8 +615,10 @@ public: bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, device_ptr mean_ptr, device_ptr variance_ptr, - int r, int4 rect, DenoisingTask * /*task*/) + int r, int4 rect, DenoisingTask *task) { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES); + for(int y = rect.y; y < rect.w; y++) { for(int x = rect.x; x < rect.z; x++) { filter_combine_halves_kernel()(x, y, @@ -629,6 +637,8 @@ public: device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, device_ptr buffer_variance_ptr, DenoisingTask *task) { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW); + for(int y = task->rect.y; y < task->rect.w; y++) { for(int x = task->rect.x; x < task->rect.z; x++) { filter_divide_shadow_kernel()(task->render_buffer.samples, @@ -653,6 +663,8 @@ public: device_ptr variance_ptr, DenoisingTask *task) { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE); + for(int y = task->rect.y; y < task->rect.w; y++) { for(int x = task->rect.x; x < task->rect.z; x++) { filter_get_feature_kernel()(task->render_buffer.samples, @@ -676,6 +688,8 @@ public: device_ptr output_ptr, DenoisingTask *task) { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS); + for(int y = task->rect.y; y < task->rect.w; y++) { for(int x = task->rect.x; x < task->rect.z; x++) { filter_detect_outliers_kernel()(x, y, @@ -735,6 +749,8 @@ public: void denoise(DenoisingTask& denoising, RenderTile &tile) { + ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); + tile.sample = tile.start_sample + tile.num_samples; denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); @@ -765,6 +781,8 @@ public: KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init()); + 
stats.profiler.add_state(&kg->profiler); + CPUSplitKernel *split_kernel = NULL; if(use_split_kernel) { split_kernel = new CPUSplitKernel(this); @@ -778,6 +796,7 @@ public: RenderTile tile; DenoisingTask denoising(this, task); + denoising.profiler = &kg->profiler; while(task.acquire_tile(this, tile)) { if(tile.task == RenderTile::PATH_TRACE) { @@ -802,6 +821,8 @@ public: } } + stats.profiler.remove_state(&kg->profiler); + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); kg->~KernelGlobals(); kgbuffer.free(); @@ -1061,6 +1082,7 @@ void device_cpu_info(vector<DeviceInfo>& devices) info.has_volume_decoupled = true; info.has_osl = true; info.has_half_images = true; + info.has_profiling = true; devices.insert(devices.begin(), info); } diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 78c65a3d22d..433cbd3c265 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -22,6 +22,7 @@ CCL_NAMESPACE_BEGIN DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE), + profiler(NULL), storage(device), buffer(device), device(device) diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index 8e0666d0e59..beae60c220f 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -23,6 +23,8 @@ #include "kernel/filter/filter_defines.h" +#include "util/util_profiling.h" + CCL_NAMESPACE_BEGIN class DenoisingTask { @@ -51,6 +53,8 @@ public: TileInfo *tile_info; device_vector<int> tile_info_mem; + ProfilingState *profiler; + int4 rect; int4 filter_area; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 92cb66bdec9..d4145225b77 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -110,6 +110,7 @@ set(SRC_HEADERS 
kernel_path_surface.h kernel_path_subsurface.h kernel_path_volume.h + kernel_profiling.h kernel_projection.h kernel_queues.h kernel_random.h diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index 6708a3efac1..284b1e9208c 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -186,6 +186,8 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, float difl, float extmax) { + PROFILING_INIT(kg, PROFILING_INTERSECT); + if(!scene_intersect_valid(&ray)) { return false; } @@ -248,6 +250,8 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg, uint *lcg_state, int max_hits) { + PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL); + if(!scene_intersect_valid(&ray)) { return false; } @@ -327,6 +331,8 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, uint max_hits, uint *num_hits) { + PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL); + if(!scene_intersect_valid(ray)) { return false; } @@ -407,6 +413,8 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg, Intersection *isect, const uint visibility) { + PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME); + if(!scene_intersect_valid(ray)) { return false; } @@ -438,6 +446,8 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg, const uint max_hits, const uint visibility) { + PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_ALL); + if(!scene_intersect_valid(ray)) { return false; } diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index 37402f42863..59f1e252d21 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -19,6 +19,8 @@ #ifndef __KERNEL_GLOBALS_H__ #define __KERNEL_GLOBALS_H__ +#include "kernel/kernel_profiling.h" + #ifdef __KERNEL_CPU__ # include "util/util_vector.h" # include "util/util_map.h" @@ -82,6 +84,8 @@ typedef struct KernelGlobals { int2 global_size; int2 global_id; + + ProfilingState profiler; } 
KernelGlobals; #endif /* __KERNEL_CPU__ */ diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 1f5929e4938..08e9db05c39 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -376,6 +376,9 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg, int sample, PathRadiance *L) { + PROFILING_INIT(kg, PROFILING_WRITE_RESULT); + PROFILING_OBJECT(PRIM_NONE); + float alpha; float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha); diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index cb1f410b09f..a1fc6028293 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -57,6 +57,8 @@ ccl_device_forceinline bool kernel_path_scene_intersect( Intersection *isect, PathRadiance *L) { + PROFILING_INIT(kg, PROFILING_SCENE_INTERSECT); + uint visibility = path_state_ray_visibility(kg, state); if(path_state_ao_bounce(kg, state)) { @@ -105,6 +107,8 @@ ccl_device_forceinline void kernel_path_lamp_emission( ShaderData *emission_sd, PathRadiance *L) { + PROFILING_INIT(kg, PROFILING_INDIRECT_EMISSION); + #ifdef __LAMP_MIS__ if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { /* ray starting from previous non-transparent bounce */ @@ -172,6 +176,8 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume( ShaderData *emission_sd, PathRadiance *L) { + PROFILING_INIT(kg, PROFILING_VOLUME); + /* Sanitize volume stack. 
*/ if(!hit) { kernel_volume_clean_stack(kg, state->volume_stack); @@ -278,6 +284,8 @@ ccl_device_forceinline bool kernel_path_shader_apply( PathRadiance *L, ccl_global float *buffer) { + PROFILING_INIT(kg, PROFILING_SHADER_APPLY); + #ifdef __SHADOW_TRICKS__ if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { if(state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) { @@ -355,6 +363,8 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, float3 throughput, float3 ao_alpha) { + PROFILING_INIT(kg, PROFILING_AO); + /* todo: solve correlation */ float bsdf_u, bsdf_v; @@ -568,6 +578,8 @@ ccl_device_forceinline void kernel_path_integrate( ccl_global float *buffer, ShaderData *emission_sd) { + PROFILING_INIT(kg, PROFILING_PATH_INTEGRATE); + /* Shader data memory used for both volumes and surfaces, saves stack space. */ ShaderData sd; @@ -719,6 +731,8 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride) { + PROFILING_INIT(kg, PROFILING_RAY_SETUP); + /* buffer offset */ int index = offset + x + y*stride; int pass_stride = kernel_data.film.pass_stride; diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h index ff4a8a9d580..962776f21c1 100644 --- a/intern/cycles/kernel/kernel_path_subsurface.h +++ b/intern/cycles/kernel/kernel_path_subsurface.h @@ -32,6 +32,8 @@ bool kernel_path_subsurface_scatter( ccl_addr_space float3 *throughput, ccl_addr_space SubsurfaceIndirectRays *ss_indirect) { + PROFILING_INIT(kg, PROFILING_SUBSURFACE); + float bssrdf_u, bssrdf_v; path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index c9202ccb16d..0d18a1e8c77 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -217,6 +217,8 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, 
ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L) { + PROFILING_INIT(kg, PROFILING_CONNECT_LIGHT); + #ifdef __EMISSION__ if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) return; @@ -274,6 +276,8 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, PathRadianceState *L_state, ccl_addr_space Ray *ray) { + PROFILING_INIT(kg, PROFILING_SURFACE_BOUNCE); + /* no BSDF? we can stop here */ if(sd->flag & SD_BSDF) { /* sample BSDF */ diff --git a/intern/cycles/kernel/kernel_profiling.h b/intern/cycles/kernel/kernel_profiling.h new file mode 100644 index 00000000000..a46d6376473 --- /dev/null +++ b/intern/cycles/kernel/kernel_profiling.h @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2018 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __KERNEL_PROFILING_H__ +#define __KERNEL_PROFILING_H__ + +#ifdef __KERNEL_CPU__ +# include "util/util_profiling.h" +#endif + +CCL_NAMESPACE_BEGIN + +#ifdef __KERNEL_CPU__ +# define PROFILING_INIT(kg, event) ProfilingHelper profiling_helper(&kg->profiler, event) +# define PROFILING_EVENT(event) profiling_helper.set_event(event) +# define PROFILING_SHADER(shader) if((shader) != SHADER_NONE) { profiling_helper.set_shader((shader) & SHADER_MASK); } +# define PROFILING_OBJECT(object) if((object) != PRIM_NONE) { profiling_helper.set_object(object); } +#else +# define PROFILING_INIT(kg, event) +# define PROFILING_EVENT(event) +# define PROFILING_SHADER(shader) +# define PROFILING_OBJECT(object) +#endif /* __KERNEL_CPU__ */ + +CCL_NAMESPACE_END + +#endif /* __KERNEL_PROFILING_H__ */ diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index af883aa715b..4b2e675bb21 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -54,6 +54,8 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, const Intersection *isect, const Ray *ray) { + PROFILING_INIT(kg, PROFILING_SHADER_SETUP); + #ifdef __INSTANCING__ sd->object = (isect->object == PRIM_NONE)? 
kernel_tex_fetch(__prim_object, isect->prim): isect->object; #endif @@ -147,6 +149,9 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, differential_incoming(&sd->dI, ray->dD); differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); #endif + + PROFILING_SHADER(sd->shader); + PROFILING_OBJECT(sd->object); } /* ShaderData setup from BSSRDF scatter */ @@ -163,6 +168,8 @@ void shader_setup_from_subsurface( const Intersection *isect, const Ray *ray) { + PROFILING_INIT(kg, PROFILING_SHADER_SETUP); + const bool backfacing = sd->flag & SD_BACKFACING; /* object, matrices, time, ray_length stay the same */ @@ -233,6 +240,8 @@ void shader_setup_from_subsurface( differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); /* don't modify dP and dI */ # endif + + PROFILING_SHADER(sd->shader); } #endif @@ -249,6 +258,8 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, bool object_space, int lamp) { + PROFILING_INIT(kg, PROFILING_SHADER_SETUP); + /* vectors */ sd->P = P; sd->N = Ng; @@ -353,6 +364,9 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, sd->du = differential_zero(); sd->dv = differential_zero(); #endif + + PROFILING_SHADER(sd->shader); + PROFILING_OBJECT(sd->object); } /* ShaderData setup for displacement */ @@ -380,6 +394,8 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd, ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray) { + PROFILING_INIT(kg, PROFILING_SHADER_SETUP); + /* vectors */ sd->P = ray->D; sd->N = -ray->D; @@ -414,6 +430,9 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat sd->du = differential_zero(); sd->dv = differential_zero(); #endif + + PROFILING_SHADER(sd->shader); + PROFILING_OBJECT(sd->object); } /* ShaderData setup from point inside volume */ @@ -421,6 +440,8 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat 
#ifdef __VOLUME__ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray) { + PROFILING_INIT(kg, PROFILING_SHADER_SETUP); + /* vectors */ sd->P = ray->P; sd->N = -ray->D; @@ -461,6 +482,9 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s /* for NDC coordinates */ sd->ray_P = ray->P; sd->ray_dP = ray->dP; + + PROFILING_SHADER(sd->shader); + PROFILING_OBJECT(sd->object); } #endif /* __VOLUME__ */ @@ -591,6 +615,8 @@ void shader_bsdf_eval(KernelGlobals *kg, float light_pdf, bool use_mis) { + PROFILING_INIT(kg, PROFILING_CLOSURE_EVAL); + bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass); #ifdef __BRANCHED_PATH__ @@ -720,6 +746,8 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, differential3 *domega_in, float *pdf) { + PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE); + const ShaderClosure *sc = shader_bsdf_pick(sd, &randu); if(sc == NULL) { *pdf = 0.0f; @@ -751,6 +779,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float randu, float randv, BsdfEval *bsdf_eval, float3 *omega_in, differential3 *domega_in, float *pdf) { + PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE); + int label; float3 eval; @@ -984,6 +1014,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, int path_flag) { + PROFILING_INIT(kg, PROFILING_SHADER_EVAL); + /* If path is being terminated, we are tracing a shadow ray or evaluating * emission, then we don't need to store closures. The emission and shadow * shader data also do not have a closure array to save GPU memory. 
*/ @@ -1084,6 +1116,8 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con ccl_device void shader_volume_phase_eval(KernelGlobals *kg, const ShaderData *sd, const float3 omega_in, BsdfEval *eval, float *pdf) { + PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_EVAL); + bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass); _shader_volume_phase_multi_eval(sd, omega_in, pdf, -1, eval, 0.0f, 0.0f); @@ -1093,6 +1127,8 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, const ShaderData *s float randu, float randv, BsdfEval *phase_eval, float3 *omega_in, differential3 *domega_in, float *pdf) { + PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE); + int sampled = 0; if(sd->num_closure > 1) { @@ -1151,6 +1187,8 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData * const ShaderClosure *sc, float randu, float randv, BsdfEval *phase_eval, float3 *omega_in, differential3 *domega_in, float *pdf) { + PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE); + int label; float3 eval; diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp index dc7a1043208..b34d16c438b 100644 --- a/intern/cycles/render/object.cpp +++ b/intern/cycles/render/object.cpp @@ -335,6 +335,11 @@ uint Object::visibility_for_tracing() const { return trace_visibility; } +int Object::get_device_index() const +{ + return index; +} + /* Object Manager */ ObjectManager::ObjectManager() @@ -348,10 +353,9 @@ ObjectManager::~ObjectManager() } void ObjectManager::device_update_object_transform(UpdateObjectTransformState *state, - Object *ob, - int object_index) + Object *ob) { - KernelObject& kobject = state->objects[object_index]; + KernelObject& kobject = state->objects[ob->index]; Transform *object_motion_pass = state->object_motion_pass; Mesh *mesh = ob->mesh; @@ -457,13 +461,13 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s tfm_post = 
tfm_post * itfm; } - int motion_pass_offset = object_index*OBJECT_MOTION_PASS_SIZE; + int motion_pass_offset = ob->index*OBJECT_MOTION_PASS_SIZE; object_motion_pass[motion_pass_offset + 0] = tfm_pre; object_motion_pass[motion_pass_offset + 1] = tfm_post; } else if(state->need_motion == Scene::MOTION_BLUR) { if(ob->use_motion()) { - kobject.motion_offset = state->motion_offset[object_index]; + kobject.motion_offset = state->motion_offset[ob->index]; /* Decompose transforms for interpolation. */ DecomposedTransform *decomp = state->object_motion + kobject.motion_offset; @@ -494,7 +498,7 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s if(ob->use_holdout) { flag |= SD_OBJECT_HOLDOUT_MASK; } - state->object_flag[object_index] = flag; + state->object_flag[ob->index] = flag; /* Have curves. */ if(mesh->num_curves()) { @@ -538,7 +542,7 @@ void ObjectManager::device_update_object_transform_task( for(int i = 0; i < num_objects; ++i) { const int object_index = start_index + i; Object *ob = state->scene->objects[object_index]; - device_update_object_transform(state, ob, object_index); + device_update_object_transform(state, ob); } } } @@ -593,10 +597,8 @@ void ObjectManager::device_update_transforms(DeviceScene *dscene, * need some tweaks to make mid-complex scenes optimal. */ if(scene->objects.size() < 64) { - int object_index = 0; foreach(Object *ob, scene->objects) { - device_update_object_transform(&state, ob, object_index); - object_index++; + device_update_object_transform(&state, ob); if(progress.get_cancel()) { return; } @@ -642,6 +644,12 @@ void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *sc if(scene->objects.size() == 0) return; + /* Assign object IDs. 
*/ + int index = 0; + foreach(Object *object, scene->objects) { + object->index = index++; + } + /* set object transform matrices, before applying static transforms */ progress.set_status("Updating Objects", "Copying Transformations to device"); device_update_transforms(dscene, scene, progress); @@ -686,26 +694,25 @@ void ObjectManager::device_update_flags(Device *, } } - int object_index = 0; foreach(Object *object, scene->objects) { if(object->mesh->has_volume) { - object_flag[object_index] |= SD_OBJECT_HAS_VOLUME; - object_flag[object_index] &= ~SD_OBJECT_HAS_VOLUME_ATTRIBUTES; + object_flag[object->index] |= SD_OBJECT_HAS_VOLUME; + object_flag[object->index] &= ~SD_OBJECT_HAS_VOLUME_ATTRIBUTES; foreach(Attribute& attr, object->mesh->attributes.attributes) { if(attr.element == ATTR_ELEMENT_VOXEL) { - object_flag[object_index] |= SD_OBJECT_HAS_VOLUME_ATTRIBUTES; + object_flag[object->index] |= SD_OBJECT_HAS_VOLUME_ATTRIBUTES; } } } else { - object_flag[object_index] &= ~(SD_OBJECT_HAS_VOLUME|SD_OBJECT_HAS_VOLUME_ATTRIBUTES); + object_flag[object->index] &= ~(SD_OBJECT_HAS_VOLUME|SD_OBJECT_HAS_VOLUME_ATTRIBUTES); } if(object->is_shadow_catcher) { - object_flag[object_index] |= SD_OBJECT_SHADOW_CATCHER; + object_flag[object->index] |= SD_OBJECT_SHADOW_CATCHER; } else { - object_flag[object_index] &= ~SD_OBJECT_SHADOW_CATCHER; + object_flag[object->index] &= ~SD_OBJECT_SHADOW_CATCHER; } if(bounds_valid) { @@ -714,7 +721,7 @@ void ObjectManager::device_update_flags(Device *, continue; } if(object->bounds.intersects(volume_object->bounds)) { - object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME; + object_flag[object->index] |= SD_OBJECT_INTERSECTS_VOLUME; break; } } @@ -723,9 +730,8 @@ void ObjectManager::device_update_flags(Device *, /* Not really valid, but can't make more reliable in the case * of bounds not being up to date. 
*/ - object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME; + object_flag[object->index] |= SD_OBJECT_INTERSECTS_VOLUME; } - ++object_index; } /* Copy object flag. */ @@ -741,7 +747,6 @@ void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Sc KernelObject *kobjects = dscene->objects.data(); bool update = false; - int object_index = 0; foreach(Object *object, scene->objects) { Mesh* mesh = object->mesh; @@ -750,18 +755,16 @@ void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Sc uint patch_map_offset = 2*(mesh->patch_table_offset + mesh->patch_table->total_size() - mesh->patch_table->num_nodes * PATCH_NODE_SIZE) - mesh->patch_offset; - if(kobjects[object_index].patch_map_offset != patch_map_offset) { - kobjects[object_index].patch_map_offset = patch_map_offset; + if(kobjects[object->index].patch_map_offset != patch_map_offset) { + kobjects[object->index].patch_map_offset = patch_map_offset; update = true; } } - if(kobjects[object_index].attribute_map_offset != mesh->attr_map_offset) { - kobjects[object_index].attribute_map_offset = mesh->attr_map_offset; + if(kobjects[object->index].attribute_map_offset != mesh->attr_map_offset) { + kobjects[object->index].attribute_map_offset = mesh->attr_map_offset; update = true; } - - object_index++; } if(update) { diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h index 87e6e6652ad..134f0bc3577 100644 --- a/intern/cycles/render/object.h +++ b/intern/cycles/render/object.h @@ -38,6 +38,7 @@ class Progress; class Scene; struct Transform; struct UpdateObjectTransformState; +class ObjectManager; /* Object */ @@ -88,6 +89,16 @@ public: * determined flags which denotes trace-time visibility. */ uint visibility_for_tracing() const; + + /* Returns the index that is used in the kernel for this object. */ + int get_device_index() const; + +protected: + /* Specifies the position of the object in scene->objects and + * in the device vectors. 
Gets set in device_update. */ + int index; + + friend class ObjectManager; }; /* Object Manager */ @@ -123,8 +134,7 @@ public: protected: void device_update_object_transform(UpdateObjectTransformState *state, - Object *ob, - const int object_index); + Object *ob); void device_update_object_transform_task(UpdateObjectTransformState *state); bool device_update_object_transform_pop_work( UpdateObjectTransformState *state, diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index d6ecafa19b7..a8374b662d0 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -250,7 +250,9 @@ void Session::run_gpu() if(!no_tiles) { /* update scene */ scoped_timer update_timer; - update_scene(); + if(update_scene()) { + stats.profiler.reset(scene->shaders.size(), scene->objects.size()); + } progress.add_skip_time(update_timer, params.background); if(!device->error_message().empty()) @@ -585,7 +587,9 @@ void Session::run_cpu() /* update scene */ scoped_timer update_timer; - update_scene(); + if(update_scene()) { + stats.profiler.reset(scene->shaders.size(), scene->objects.size()); + } progress.add_skip_time(update_timer, params.background); if(!device->error_message().empty()) @@ -729,6 +733,10 @@ void Session::run() /* load kernels */ load_kernels(); + if(params.use_profiling && (params.device.type == DEVICE_CPU)) { + stats.profiler.start(); + } + /* session thread loop */ progress.set_status("Waiting for render to start"); @@ -743,6 +751,8 @@ void Session::run() run_cpu(); } + stats.profiler.stop(); + /* progress update */ if(progress.get_cancel()) progress.set_status("Cancel", progress.get_cancel_message()); @@ -825,7 +835,7 @@ void Session::wait() session_thread = NULL; } -void Session::update_scene() +bool Session::update_scene() { thread_scoped_lock scene_lock(scene->mutex); @@ -876,7 +886,10 @@ void Session::update_scene() progress.set_status("Updating Scene"); MEM_GUARDED_CALL(&progress, scene->device_update, device, 
progress); + + return true; } + return false; } void Session::update_status_time(bool show_pause, bool show_done) @@ -1052,6 +1065,14 @@ void Session::device_free() */ } +void Session::collect_statistics(RenderStats *render_stats) +{ + scene->collect_statistics(render_stats); + if(params.use_profiling && (params.device.type == DEVICE_CPU)) { + render_stats->collect_profiling(scene, &stats); + } +} + int Session::get_max_closure_count() { int max_closures = 0; diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index 56a69919a7a..db07c758427 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -20,6 +20,7 @@ #include "render/buffers.h" #include "device/device.h" #include "render/shader.h" +#include "render/stats.h" #include "render/tile.h" #include "util/util_progress.h" @@ -55,6 +56,8 @@ public: int pixel_size; int threads; + bool use_profiling; + bool display_buffer_linear; bool use_denoising; @@ -89,6 +92,8 @@ public: pixel_size = 1; threads = 0; + use_profiling = false; + use_denoising = false; denoising_passes = false; denoising_radius = 8; @@ -118,6 +123,7 @@ public: && start_resolution == params.start_resolution && pixel_size == params.pixel_size && threads == params.threads + && use_profiling == params.use_profiling && display_buffer_linear == params.display_buffer_linear && cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout @@ -159,7 +165,7 @@ public: void set_samples(int samples); void set_pause(bool pause); - void update_scene(); + bool update_scene(); void load_kernels(bool lock_scene=true); void device_free(); @@ -168,6 +174,8 @@ public: * (for example, when rendering with unlimited samples). 
*/ float get_progress(); + void collect_statistics(RenderStats *stats); + protected: struct DelayedReset { thread_mutex mutex; diff --git a/intern/cycles/render/stats.cpp b/intern/cycles/render/stats.cpp index 101d33fcf65..71d9353ad3d 100644 --- a/intern/cycles/render/stats.cpp +++ b/intern/cycles/render/stats.cpp @@ -15,6 +15,7 @@ */ #include "render/stats.h" +#include "render/object.h" #include "util/util_algorithm.h" #include "util/util_foreach.h" #include "util/util_string.h" @@ -33,6 +34,16 @@ bool namedSizeEntryComparator(const NamedSizeEntry& a, const NamedSizeEntry& b) return a.size > b.size; } +bool namedTimeSampleEntryComparator(const NamedNestedSampleStats& a, const NamedNestedSampleStats& b) +{ + return a.sum_samples > b.sum_samples; +} + +bool namedSampleCountPairComparator(const NamedSampleCountPair& a, const NamedSampleCountPair& b) +{ + return a.samples > b.samples; +} + } // namespace NamedSizeEntry::NamedSizeEntry() @@ -77,6 +88,113 @@ string NamedSizeStats::full_report(int indent_level) return result; } +/* Named time sample statistics. 
*/ + +NamedNestedSampleStats::NamedNestedSampleStats() + : name(""), self_samples(0), sum_samples(0) +{} + +NamedNestedSampleStats::NamedNestedSampleStats(const string& name, uint64_t samples) + : name(name), self_samples(samples), sum_samples(samples) +{} + +NamedNestedSampleStats& NamedNestedSampleStats::add_entry(const string& name_, uint64_t samples_) +{ + entries.push_back(NamedNestedSampleStats(name_, samples_)); + return entries[entries.size()-1]; +} + +void NamedNestedSampleStats::update_sum() +{ + sum_samples = self_samples; + foreach(NamedNestedSampleStats& entry, entries) { + entry.update_sum(); + sum_samples += entry.sum_samples; + } +} + +string NamedNestedSampleStats::full_report(int indent_level, uint64_t total_samples) +{ + update_sum(); + + if(total_samples == 0) { + total_samples = sum_samples; + } + + const string indent(indent_level * kIndentNumSpaces, ' '); + + const double sum_percent = 100*((double) sum_samples) / total_samples; + const double sum_seconds = sum_samples * 0.001; + const double self_percent = 100*((double) self_samples) / total_samples; + const double self_seconds = self_samples * 0.001; + string info = string_printf("%-32s: Total %3.2f%% (%.2fs), Self %3.2f%% (%.2fs)\n", + name.c_str(), + sum_percent, + sum_seconds, + self_percent, + self_seconds); + string result = indent + info; + + sort(entries.begin(), entries.end(), namedTimeSampleEntryComparator); + foreach(NamedNestedSampleStats& entry, entries) { + result += entry.full_report(indent_level + 1, total_samples); + } + return result; +} + +/* Named sample count pairs. 
*/ + +NamedSampleCountPair::NamedSampleCountPair(const ustring& name, uint64_t samples, uint64_t hits) + : name(name), samples(samples), hits(hits) +{} + +NamedSampleCountStats::NamedSampleCountStats() +{} + +void NamedSampleCountStats::add(const ustring& name, uint64_t samples, uint64_t hits) +{ + entry_map::iterator entry = entries.find(name); + if(entry != entries.end()) { + entry->second.samples += samples; + entry->second.hits += hits; + return; + } + entries.emplace(name, NamedSampleCountPair(name, samples, hits)); +} + +string NamedSampleCountStats::full_report(int indent_level) +{ + const string indent(indent_level * kIndentNumSpaces, ' '); + + vector<NamedSampleCountPair> sorted_entries; + sorted_entries.reserve(entries.size()); + + uint64_t total_hits = 0, total_samples = 0; + foreach(entry_map::const_reference entry, entries) { + const NamedSampleCountPair &pair = entry.second; + + total_hits += pair.hits; + total_samples += pair.samples; + + sorted_entries.push_back(pair); + } + const double avg_samples_per_hit = ((double) total_samples) / total_hits; + + sort(sorted_entries.begin(), sorted_entries.end(), namedSampleCountPairComparator); + + string result = ""; + foreach(const NamedSampleCountPair& entry, sorted_entries) { + const double seconds = entry.samples * 0.001; + const double relative = ((double) entry.samples) / (entry.hits * avg_samples_per_hit); + + result += indent + string_printf("%-32s: %.2fs (Relative cost: %.2f)\n", + entry.name.c_str(), + seconds, + relative); + } + return result; +} + /* Mesh statistics. */ MeshStats::MeshStats() { @@ -106,6 +224,74 @@ string ImageStats::full_report(int indent_level) /* Overall statistics. 
*/ RenderStats::RenderStats() { + has_profiling = false; +} + +void RenderStats::collect_profiling(Scene *scene, Stats *stats) +{ + has_profiling = true; + + Profiler &prof = stats->profiler; + + kernel = NamedNestedSampleStats("Total render time", prof.get_event(PROFILING_UNKNOWN)); + + kernel.add_entry("Ray setup", prof.get_event(PROFILING_RAY_SETUP)); + kernel.add_entry("Result writing", prof.get_event(PROFILING_WRITE_RESULT)); + + NamedNestedSampleStats &integrator = kernel.add_entry("Path integration", prof.get_event(PROFILING_PATH_INTEGRATE)); + integrator.add_entry("Scene intersection", prof.get_event(PROFILING_SCENE_INTERSECT)); + integrator.add_entry("Indirect emission", prof.get_event(PROFILING_INDIRECT_EMISSION)); + integrator.add_entry("Volumes", prof.get_event(PROFILING_VOLUME)); + + NamedNestedSampleStats &shading = integrator.add_entry("Shading", 0); + shading.add_entry("Shader Setup", prof.get_event(PROFILING_SHADER_SETUP)); + shading.add_entry("Shader Eval", prof.get_event(PROFILING_SHADER_EVAL)); + shading.add_entry("Shader Apply", prof.get_event(PROFILING_SHADER_APPLY)); + shading.add_entry("Ambient Occlusion", prof.get_event(PROFILING_AO)); + shading.add_entry("Subsurface", prof.get_event(PROFILING_SUBSURFACE)); + + integrator.add_entry("Connect Light", prof.get_event(PROFILING_CONNECT_LIGHT)); + integrator.add_entry("Surface Bounce", prof.get_event(PROFILING_SURFACE_BOUNCE)); + + NamedNestedSampleStats &intersection = kernel.add_entry("Intersection", 0); + intersection.add_entry("Full Intersection", prof.get_event(PROFILING_INTERSECT)); + intersection.add_entry("Local Intersection", prof.get_event(PROFILING_INTERSECT_LOCAL)); + intersection.add_entry("Shadow All Intersection", prof.get_event(PROFILING_INTERSECT_SHADOW_ALL)); + intersection.add_entry("Volume Intersection", prof.get_event(PROFILING_INTERSECT_VOLUME)); + intersection.add_entry("Volume All Intersection", prof.get_event(PROFILING_INTERSECT_VOLUME_ALL)); + + NamedNestedSampleStats 
&closure = kernel.add_entry("Closures", 0); + closure.add_entry("Surface Closure Evaluation", prof.get_event(PROFILING_CLOSURE_EVAL)); + closure.add_entry("Surface Closure Sampling", prof.get_event(PROFILING_CLOSURE_SAMPLE)); + closure.add_entry("Volume Closure Evaluation", prof.get_event(PROFILING_CLOSURE_VOLUME_EVAL)); + closure.add_entry("Volume Closure Sampling", prof.get_event(PROFILING_CLOSURE_VOLUME_SAMPLE)); + + NamedNestedSampleStats &denoising = kernel.add_entry("Denoising", prof.get_event(PROFILING_DENOISING)); + denoising.add_entry("Construct Transform", prof.get_event(PROFILING_DENOISING_CONSTRUCT_TRANSFORM)); + denoising.add_entry("Reconstruct", prof.get_event(PROFILING_DENOISING_RECONSTRUCT)); + + NamedNestedSampleStats &prefilter = denoising.add_entry("Prefiltering", 0); + prefilter.add_entry("Divide Shadow", prof.get_event(PROFILING_DENOISING_DIVIDE_SHADOW)); + prefilter.add_entry("Non-Local means", prof.get_event(PROFILING_DENOISING_NON_LOCAL_MEANS)); + prefilter.add_entry("Get Feature", prof.get_event(PROFILING_DENOISING_GET_FEATURE)); + prefilter.add_entry("Detect Outliers", prof.get_event(PROFILING_DENOISING_DETECT_OUTLIERS)); + prefilter.add_entry("Combine Halves", prof.get_event(PROFILING_DENOISING_COMBINE_HALVES)); + + shaders.entries.clear(); + foreach(Shader *shader, scene->shaders) { + uint64_t samples, hits; + if(prof.get_shader(shader->id, samples, hits)) { + shaders.add(shader->name, samples, hits); + } + } + + objects.entries.clear(); + foreach(Object *object, scene->objects) { + uint64_t samples, hits; + if(prof.get_object(object->get_device_index(), samples, hits)) { + objects.add(object->name, samples, hits); + } + } } string RenderStats::full_report() @@ -113,6 +299,14 @@ string RenderStats::full_report() string result = ""; result += "Mesh statistics:\n" + mesh.full_report(1); result += "Image statistics:\n" + image.full_report(1); + if(has_profiling) { + result += "Kernel statistics:\n" + kernel.full_report(1); + result += 
"Shader statistics:\n" + shaders.full_report(1); + result += "Object statistics:\n" + objects.full_report(1); + } + else { + result += "Profiling information not available (only works with CPU rendering)"; + } return result; } diff --git a/intern/cycles/render/stats.h b/intern/cycles/render/stats.h index 2ff0ec3e0e9..ce0529bb8bd 100644 --- a/intern/cycles/render/stats.h +++ b/intern/cycles/render/stats.h @@ -17,6 +17,9 @@ #ifndef __RENDER_STATS_H__ #define __RENDER_STATS_H__ +#include "render/scene.h" + +#include "util/util_stats.h" #include "util/util_string.h" #include "util/util_vector.h" @@ -61,6 +64,51 @@ public: vector<NamedSizeEntry> entries; }; +class NamedNestedSampleStats { +public: + NamedNestedSampleStats(); + NamedNestedSampleStats(const string& name, uint64_t samples); + + NamedNestedSampleStats& add_entry(const string& name, uint64_t samples); + + /* Updates sum_samples recursively. */ + void update_sum(); + + string full_report(int indent_level = 0, uint64_t total_samples = 0); + + string name; + + /* self_samples contains only the samples that this specific event got, + * while sum_samples also includes the samples of all sub-entries. */ + uint64_t self_samples, sum_samples; + + vector<NamedNestedSampleStats> entries; +}; + +/* Named entry containing both a time-sample count for objects of a type and a + * total count of processed items. + * This allows to estimate the time spent per item. */ +class NamedSampleCountPair { +public: + NamedSampleCountPair(const ustring& name, uint64_t samples, uint64_t hits); + + ustring name; + uint64_t samples; + uint64_t hits; +}; + +/* Contains statistics about pairs of samples and counts as described above. 
*/ +class NamedSampleCountStats { +public: + NamedSampleCountStats(); + + string full_report(int indent_level = 0); + void add(const ustring& name, uint64_t samples, uint64_t hits); + + typedef unordered_map<ustring, NamedSampleCountPair, ustringHash> entry_map; + entry_map entries; +}; + /* Statistics about mesh in the render database. */ class MeshStats { public: @@ -95,8 +143,16 @@ public: /* Return full report as string. */ string full_report(); + /* Collect kernel sampling information from Stats. */ + void collect_profiling(Scene *scene, Stats *stats); + + bool has_profiling; + MeshStats mesh; ImageStats image; + NamedNestedSampleStats kernel; + NamedSampleCountStats shaders; + NamedSampleCountStats objects; }; CCL_NAMESPACE_END diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 77d47984ee7..92dfc9fa85d 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -17,6 +17,7 @@ set(SRC util_md5.cpp util_murmurhash.cpp util_path.cpp + util_profiling.cpp util_string.cpp util_simd.cpp util_system.cpp @@ -71,6 +72,7 @@ set(SRC_HEADERS util_optimization.h util_param.h util_path.h + util_profiling.h util_progress.h util_projection.h util_queue.h diff --git a/intern/cycles/util/util_profiling.cpp b/intern/cycles/util/util_profiling.cpp new file mode 100644 index 00000000000..30aaef69310 --- /dev/null +++ b/intern/cycles/util/util_profiling.cpp @@ -0,0 +1,178 @@ +/* + * Copyright 2011-2018 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "util/util_algorithm.h" +#include "util/util_profiling.h" +#include "util/util_set.h" + +CCL_NAMESPACE_BEGIN + +Profiler::Profiler() + : do_stop_worker(true), worker(NULL) +{ +} + +Profiler::~Profiler() +{ + assert(worker == NULL); +} + +void Profiler::run() +{ + uint64_t updates = 0; + auto start_time = std::chrono::system_clock::now(); + while(!do_stop_worker) { + thread_scoped_lock lock(mutex); + foreach(ProfilingState *state, states) { + uint32_t cur_event = state->event; + int32_t cur_shader = state->shader; + int32_t cur_object = state->object; + + /* The state reads/writes should be atomic, but just to be sure + * check the values for validity anyways. */ + if(cur_event < PROFILING_NUM_EVENTS) { + event_samples[cur_event]++; + } + + if(cur_shader >= 0 && cur_shader < shader_samples.size()) { + /* Only consider the active shader during events whose runtime significantly depends on it. */ + if(((cur_event >= PROFILING_SHADER_EVAL ) && (cur_event <= PROFILING_SUBSURFACE)) || + ((cur_event >= PROFILING_CLOSURE_EVAL) && (cur_event <= PROFILING_CLOSURE_VOLUME_SAMPLE))) { + shader_samples[cur_shader]++; + } + } + + if(cur_object >= 0 && cur_object < object_samples.size()) { + object_samples[cur_object]++; + } + } + lock.unlock(); + + /* Relative waits always overshoot a bit, so just waiting 1ms every + * time would cause the sampling to drift over time. + * By keeping track of the absolute time, the wait times correct themselves - + * if one wait overshoots a lot, the next one will be shorter to compensate. */ + updates++; + std::this_thread::sleep_until(start_time + updates*std::chrono::milliseconds(1)); + } +} + +void Profiler::reset(int num_shaders, int num_objects) +{ + bool running = (worker != NULL); + if(running) { + stop(); + } + + /* Resize and clear the accumulation vectors. 
 */ + shader_hits.assign(num_shaders, 0); + object_hits.assign(num_objects, 0); + + event_samples.assign(PROFILING_NUM_EVENTS, 0); + shader_samples.assign(num_shaders, 0); + object_samples.assign(num_objects, 0); + + if(running) { + start(); + } +} + +void Profiler::start() +{ + assert(worker == NULL); + do_stop_worker = false; + worker = new thread(function_bind(&Profiler::run, this)); +} + +void Profiler::stop() +{ + if(worker != NULL) { + do_stop_worker = true; + + worker->join(); + delete worker; + worker = NULL; + } +} + +void Profiler::add_state(ProfilingState *state) +{ + thread_scoped_lock lock(mutex); + + /* Add the ProfilingState to the list of sampled states. */ + assert(std::find(states.begin(), states.end(), state) == states.end()); + states.push_back(state); + + /* Resize thread-local hit counters. */ + state->shader_hits.assign(shader_hits.size(), 0); + state->object_hits.assign(object_hits.size(), 0); + + /* Initialize the state. */ + state->event = PROFILING_UNKNOWN; + state->shader = -1; + state->object = -1; + state->active = true; +} + +void Profiler::remove_state(ProfilingState *state) +{ + thread_scoped_lock lock(mutex); + + /* Remove the ProfilingState from the list of sampled states. */ + states.erase(std::remove(states.begin(), states.end(), state), states.end()); + state->active = false; + + /* Merge thread-local hit counters. 
*/ + assert(shader_hits.size() == state->shader_hits.size()); + for(int i = 0; i < shader_hits.size(); i++) { + shader_hits[i] += state->shader_hits[i]; + } + + assert(object_hits.size() == state->object_hits.size()); + for(int i = 0; i < object_hits.size(); i++) { + object_hits[i] += state->object_hits[i]; + } +} + +uint64_t Profiler::get_event(ProfilingEvent event) +{ + assert(worker == NULL); + return event_samples[event]; +} + +bool Profiler::get_shader(int shader, uint64_t &samples, uint64_t &hits) +{ + assert(worker == NULL); + if(shader_samples[shader] == 0) { + return false; + } + samples = shader_samples[shader]; + hits = shader_hits[shader]; + return true; +} + +bool Profiler::get_object(int object, uint64_t &samples, uint64_t &hits) +{ + assert(worker == NULL); + if(object_samples[object] == 0) { + return false; + } + samples = object_samples[object]; + hits = object_hits[object]; + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_profiling.h b/intern/cycles/util/util_profiling.h new file mode 100644 index 00000000000..4460402b12d --- /dev/null +++ b/intern/cycles/util/util_profiling.h @@ -0,0 +1,175 @@ +/* + * Copyright 2011-2018 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __UTIL_PROFILING_H__ +#define __UTIL_PROFILING_H__ + +#include <atomic> + +#include "util/util_foreach.h" +#include "util/util_map.h" +#include "util/util_thread.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +enum ccl_try_align(16) ProfilingEvent : uint32_t { + PROFILING_UNKNOWN, + PROFILING_RAY_SETUP, + PROFILING_PATH_INTEGRATE, + PROFILING_SCENE_INTERSECT, + PROFILING_INDIRECT_EMISSION, + PROFILING_VOLUME, + PROFILING_SHADER_SETUP, + PROFILING_SHADER_EVAL, + PROFILING_SHADER_APPLY, + PROFILING_AO, + PROFILING_SUBSURFACE, + PROFILING_CONNECT_LIGHT, + PROFILING_SURFACE_BOUNCE, + PROFILING_WRITE_RESULT, + + PROFILING_INTERSECT, + PROFILING_INTERSECT_LOCAL, + PROFILING_INTERSECT_SHADOW_ALL, + PROFILING_INTERSECT_VOLUME, + PROFILING_INTERSECT_VOLUME_ALL, + + PROFILING_CLOSURE_EVAL, + PROFILING_CLOSURE_SAMPLE, + PROFILING_CLOSURE_VOLUME_EVAL, + PROFILING_CLOSURE_VOLUME_SAMPLE, + + PROFILING_DENOISING, + PROFILING_DENOISING_CONSTRUCT_TRANSFORM, + PROFILING_DENOISING_RECONSTRUCT, + PROFILING_DENOISING_DIVIDE_SHADOW, + PROFILING_DENOISING_NON_LOCAL_MEANS, + PROFILING_DENOISING_COMBINE_HALVES, + PROFILING_DENOISING_GET_FEATURE, + PROFILING_DENOISING_DETECT_OUTLIERS, + + PROFILING_NUM_EVENTS, +}; + +/* Contains the current execution state of a worker thread. + * These values are constantly updated by the worker. + * Periodically the profiler thread will wake up, read them + * and update its internal counters based on it. + * + * Atomics aren't needed here since we're only doing direct + * writes and reads to (4-byte-aligned) uint32_t, which is + * guaranteed to be atomic on x86 since the 486. + * Memory ordering is not guaranteed but does not matter. + * + * And even on other architectures, the extremely rare corner + * case of reading an intermediate state could at worst result + * in a single incorrect sample. 
*/ +struct ProfilingState { + volatile uint32_t event = PROFILING_UNKNOWN; + volatile int32_t shader = -1; + volatile int32_t object = -1; + volatile bool active = false; + + vector<uint64_t> shader_hits; + vector<uint64_t> object_hits; +}; + +class Profiler { +public: + Profiler(); + ~Profiler(); + + void reset(int num_shaders, int num_objects); + + void start(); + void stop(); + + void add_state(ProfilingState *state); + void remove_state(ProfilingState *state); + + uint64_t get_event(ProfilingEvent event); + bool get_shader(int shader, uint64_t &samples, uint64_t &hits); + bool get_object(int object, uint64_t &samples, uint64_t &hits); + +protected: + void run(); + + /* Tracks how often the worker was in each ProfilingEvent while sampling, + * so multiplying the values by the sample frequency (currently 1ms) + * gives the approximate time spent in each state. */ + vector<uint64_t> event_samples; + vector<uint64_t> shader_samples; + vector<uint64_t> object_samples; + + /* Tracks the total amounts every object/shader was hit. + * Used to evaluate relative cost, written by the render thread. + * Indexed by the shader and object IDs that the kernel also uses + * to index __object_flag and __shaders. 
*/ + vector<uint64_t> shader_hits; + vector<uint64_t> object_hits; + + volatile bool do_stop_worker; + thread *worker; + + thread_mutex mutex; + vector<ProfilingState*> states; +}; + +class ProfilingHelper { +public: + ProfilingHelper(ProfilingState *state, ProfilingEvent event) + : state(state) + { + previous_event = state->event; + state->event = event; + } + + inline void set_event(ProfilingEvent event) + { + state->event = event; + } + + inline void set_shader(int shader) + { + state->shader = shader; + if(state->active) { + assert(shader < state->shader_hits.size()); + state->shader_hits[shader]++; + } + } + + inline void set_object(int object) + { + state->object = object; + if(state->active) { + assert(object < state->object_hits.size()); + state->object_hits[object]++; + } + } + + ~ProfilingHelper() + { + state->event = previous_event; + } +private: + ProfilingState *state; + uint32_t previous_event; +}; + +CCL_NAMESPACE_END + +#endif /* __UTIL_PROFILING_H__ */ diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h index 0ba58422a67..1128bc3ced1 100644 --- a/intern/cycles/util/util_stats.h +++ b/intern/cycles/util/util_stats.h @@ -18,6 +18,7 @@ #define __UTIL_STATS_H__ #include "util/util_atomic.h" +#include "util/util_profiling.h" CCL_NAMESPACE_BEGIN @@ -40,6 +41,8 @@ public: size_t mem_used; size_t mem_peak; + + Profiler profiler; }; CCL_NAMESPACE_END |