Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorLukas Stockner <lukas.stockner@freenet.de>2018-11-29 04:06:30 +0300
committerLukas Stockner <lukas.stockner@freenet.de>2018-11-29 04:45:24 +0300
commit7fa6f72084b1364cddfbef4f06bbb244210d6967 (patch)
treeae4f682248bd5ba4f716ff60c6dbd67c5684b3d2 /intern
parentfb057153b05555606d801d1e942113d40ec15cec (diff)
Cycles: Add sample-based runtime profiler that measures time spent in various parts of the CPU kernel
This commit adds a sample-based profiler that runs during CPU rendering and collects statistics on time spent in different parts of the kernel (ray intersection, shader evaluation etc.) as well as time spent per material and object. The results are currently not exposed in the user interface or via Python yet; to see the stats on the console, pass the "--cycles-print-stats" argument to Cycles (e.g. "./blender -- --cycles-print-stats"). Unfortunately, there is no clear way to extend this functionality to CUDA or OpenCL, so it is CPU-only for now. Reviewers: brecht, sergey, swerner Reviewed By: brecht, swerner Differential Revision: https://developer.blender.org/D3892
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/blender/blender_session.cpp2
-rw-r--r--intern/cycles/blender/blender_sync.cpp6
-rw-r--r--intern/cycles/device/device.cpp2
-rw-r--r--intern/cycles/device/device.h2
-rw-r--r--intern/cycles/device/device_cpu.cpp24
-rw-r--r--intern/cycles/device/device_denoising.cpp1
-rw-r--r--intern/cycles/device/device_denoising.h4
-rw-r--r--intern/cycles/kernel/CMakeLists.txt1
-rw-r--r--intern/cycles/kernel/bvh/bvh.h10
-rw-r--r--intern/cycles/kernel/kernel_globals.h4
-rw-r--r--intern/cycles/kernel/kernel_passes.h3
-rw-r--r--intern/cycles/kernel/kernel_path.h14
-rw-r--r--intern/cycles/kernel/kernel_path_subsurface.h2
-rw-r--r--intern/cycles/kernel/kernel_path_surface.h4
-rw-r--r--intern/cycles/kernel/kernel_profiling.h40
-rw-r--r--intern/cycles/kernel/kernel_shader.h38
-rw-r--r--intern/cycles/render/object.cpp57
-rw-r--r--intern/cycles/render/object.h14
-rw-r--r--intern/cycles/render/session.cpp27
-rw-r--r--intern/cycles/render/session.h10
-rw-r--r--intern/cycles/render/stats.cpp194
-rw-r--r--intern/cycles/render/stats.h56
-rw-r--r--intern/cycles/util/CMakeLists.txt2
-rw-r--r--intern/cycles/util/util_profiling.cpp178
-rw-r--r--intern/cycles/util/util_profiling.h175
-rw-r--r--intern/cycles/util/util_stats.h3
26 files changed, 837 insertions, 36 deletions
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 75c7dcee05e..30ae0bc813d 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -516,7 +516,7 @@ void BlenderSession::render()
if(!b_engine.is_preview() && background && print_render_stats) {
RenderStats stats;
- session->scene->collect_statistics(&stats);
+ session->collect_statistics(&stats);
printf("Render statistics:\n%s\n", stats.full_report().c_str());
}
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 832847c179f..49b046d0a88 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -874,7 +874,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
}
/* tiles */
- if(params.device.type != DEVICE_CPU && !background) {
+ const bool is_cpu = (params.device.type == DEVICE_CPU);
+ if(!is_cpu && !background) {
/* currently GPU could be much slower than CPU when using tiles,
* still need to be investigated, but meanwhile make it possible
* to work in viewport smoothly
@@ -960,6 +961,9 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
params.progressive_update_timeout = 0.1;
}
+ params.use_profiling = params.device.has_profiling && !b_engine.is_preview() &&
+ background && BlenderSession::print_render_stats;
+
return params;
}
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 7e20bb449c3..54ffd4bc4df 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -362,6 +362,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int th
info.has_half_images = true;
info.has_volume_decoupled = true;
info.has_osl = true;
+ info.has_profiling = true;
foreach(const DeviceInfo &device, subdevices) {
/* Ensure CPU device does not slow down GPU. */
@@ -396,6 +397,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int th
info.has_half_images &= device.has_half_images;
info.has_volume_decoupled &= device.has_volume_decoupled;
info.has_osl &= device.has_osl;
+ info.has_profiling &= device.has_profiling;
}
return info;
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index f3fb338e638..071f61a7566 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -60,6 +60,7 @@ public:
bool has_volume_decoupled; /* Decoupled volume shading. */
bool has_osl; /* Support Open Shading Language. */
bool use_split_kernel; /* Use split or mega kernel. */
+ bool has_profiling; /* Supports runtime collection of profiling info. */
int cpu_threads;
vector<DeviceInfo> multi_devices;
@@ -75,6 +76,7 @@ public:
has_volume_decoupled = false;
has_osl = false;
use_split_kernel = false;
+ has_profiling = false;
}
bool operator==(const DeviceInfo &info) {
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 36fe9bfc92b..f0a6fd6e3f4 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -477,6 +477,8 @@ public:
bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
DenoisingTask *task)
{
+ ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS);
+
int4 rect = task->rect;
int r = task->nlm_state.r;
int f = task->nlm_state.f;
@@ -529,6 +531,8 @@ public:
bool denoising_construct_transform(DenoisingTask *task)
{
+ ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM);
+
for(int y = 0; y < task->filter_area.w; y++) {
for(int x = 0; x < task->filter_area.z; x++) {
filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
@@ -551,6 +555,8 @@ public:
device_ptr output_ptr,
DenoisingTask *task)
{
+ ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
+
mem_zero(task->storage.XtWX);
mem_zero(task->storage.XtWY);
@@ -609,8 +615,10 @@ public:
bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
device_ptr mean_ptr, device_ptr variance_ptr,
- int r, int4 rect, DenoisingTask * /*task*/)
+ int r, int4 rect, DenoisingTask *task)
{
+ ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES);
+
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
filter_combine_halves_kernel()(x, y,
@@ -629,6 +637,8 @@ public:
device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
device_ptr buffer_variance_ptr, DenoisingTask *task)
{
+ ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW);
+
for(int y = task->rect.y; y < task->rect.w; y++) {
for(int x = task->rect.x; x < task->rect.z; x++) {
filter_divide_shadow_kernel()(task->render_buffer.samples,
@@ -653,6 +663,8 @@ public:
device_ptr variance_ptr,
DenoisingTask *task)
{
+ ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);
+
for(int y = task->rect.y; y < task->rect.w; y++) {
for(int x = task->rect.x; x < task->rect.z; x++) {
filter_get_feature_kernel()(task->render_buffer.samples,
@@ -676,6 +688,8 @@ public:
device_ptr output_ptr,
DenoisingTask *task)
{
+ ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS);
+
for(int y = task->rect.y; y < task->rect.w; y++) {
for(int x = task->rect.x; x < task->rect.z; x++) {
filter_detect_outliers_kernel()(x, y,
@@ -735,6 +749,8 @@ public:
void denoise(DenoisingTask& denoising, RenderTile &tile)
{
+ ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
+
tile.sample = tile.start_sample + tile.num_samples;
denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
@@ -765,6 +781,8 @@ public:
KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
+ stats.profiler.add_state(&kg->profiler);
+
CPUSplitKernel *split_kernel = NULL;
if(use_split_kernel) {
split_kernel = new CPUSplitKernel(this);
@@ -778,6 +796,7 @@ public:
RenderTile tile;
DenoisingTask denoising(this, task);
+ denoising.profiler = &kg->profiler;
while(task.acquire_tile(this, tile)) {
if(tile.task == RenderTile::PATH_TRACE) {
@@ -802,6 +821,8 @@ public:
}
}
+ stats.profiler.remove_state(&kg->profiler);
+
thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
kg->~KernelGlobals();
kgbuffer.free();
@@ -1061,6 +1082,7 @@ void device_cpu_info(vector<DeviceInfo>& devices)
info.has_volume_decoupled = true;
info.has_osl = true;
info.has_half_images = true;
+ info.has_profiling = true;
devices.insert(devices.begin(), info);
}
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 78c65a3d22d..433cbd3c265 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -22,6 +22,7 @@ CCL_NAMESPACE_BEGIN
DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
: tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE),
+ profiler(NULL),
storage(device),
buffer(device),
device(device)
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index 8e0666d0e59..beae60c220f 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -23,6 +23,8 @@
#include "kernel/filter/filter_defines.h"
+#include "util/util_profiling.h"
+
CCL_NAMESPACE_BEGIN
class DenoisingTask {
@@ -51,6 +53,8 @@ public:
TileInfo *tile_info;
device_vector<int> tile_info_mem;
+ ProfilingState *profiler;
+
int4 rect;
int4 filter_area;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 92cb66bdec9..d4145225b77 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -110,6 +110,7 @@ set(SRC_HEADERS
kernel_path_surface.h
kernel_path_subsurface.h
kernel_path_volume.h
+ kernel_profiling.h
kernel_projection.h
kernel_queues.h
kernel_random.h
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 6708a3efac1..284b1e9208c 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -186,6 +186,8 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
float difl,
float extmax)
{
+ PROFILING_INIT(kg, PROFILING_INTERSECT);
+
if(!scene_intersect_valid(&ray)) {
return false;
}
@@ -248,6 +250,8 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
uint *lcg_state,
int max_hits)
{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL);
+
if(!scene_intersect_valid(&ray)) {
return false;
}
@@ -327,6 +331,8 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
uint max_hits,
uint *num_hits)
{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL);
+
if(!scene_intersect_valid(ray)) {
return false;
}
@@ -407,6 +413,8 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
Intersection *isect,
const uint visibility)
{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME);
+
if(!scene_intersect_valid(ray)) {
return false;
}
@@ -438,6 +446,8 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
const uint max_hits,
const uint visibility)
{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_ALL);
+
if(!scene_intersect_valid(ray)) {
return false;
}
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 37402f42863..59f1e252d21 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -19,6 +19,8 @@
#ifndef __KERNEL_GLOBALS_H__
#define __KERNEL_GLOBALS_H__
+#include "kernel/kernel_profiling.h"
+
#ifdef __KERNEL_CPU__
# include "util/util_vector.h"
# include "util/util_map.h"
@@ -82,6 +84,8 @@ typedef struct KernelGlobals {
int2 global_size;
int2 global_id;
+
+ ProfilingState profiler;
} KernelGlobals;
#endif /* __KERNEL_CPU__ */
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 1f5929e4938..08e9db05c39 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -376,6 +376,9 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg,
int sample,
PathRadiance *L)
{
+ PROFILING_INIT(kg, PROFILING_WRITE_RESULT);
+ PROFILING_OBJECT(PRIM_NONE);
+
float alpha;
float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha);
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index cb1f410b09f..a1fc6028293 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -57,6 +57,8 @@ ccl_device_forceinline bool kernel_path_scene_intersect(
Intersection *isect,
PathRadiance *L)
{
+ PROFILING_INIT(kg, PROFILING_SCENE_INTERSECT);
+
uint visibility = path_state_ray_visibility(kg, state);
if(path_state_ao_bounce(kg, state)) {
@@ -105,6 +107,8 @@ ccl_device_forceinline void kernel_path_lamp_emission(
ShaderData *emission_sd,
PathRadiance *L)
{
+ PROFILING_INIT(kg, PROFILING_INDIRECT_EMISSION);
+
#ifdef __LAMP_MIS__
if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
/* ray starting from previous non-transparent bounce */
@@ -172,6 +176,8 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(
ShaderData *emission_sd,
PathRadiance *L)
{
+ PROFILING_INIT(kg, PROFILING_VOLUME);
+
/* Sanitize volume stack. */
if(!hit) {
kernel_volume_clean_stack(kg, state->volume_stack);
@@ -278,6 +284,8 @@ ccl_device_forceinline bool kernel_path_shader_apply(
PathRadiance *L,
ccl_global float *buffer)
{
+ PROFILING_INIT(kg, PROFILING_SHADER_APPLY);
+
#ifdef __SHADOW_TRICKS__
if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) {
@@ -355,6 +363,8 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
float3 throughput,
float3 ao_alpha)
{
+ PROFILING_INIT(kg, PROFILING_AO);
+
/* todo: solve correlation */
float bsdf_u, bsdf_v;
@@ -568,6 +578,8 @@ ccl_device_forceinline void kernel_path_integrate(
ccl_global float *buffer,
ShaderData *emission_sd)
{
+ PROFILING_INIT(kg, PROFILING_PATH_INTEGRATE);
+
/* Shader data memory used for both volumes and surfaces, saves stack space. */
ShaderData sd;
@@ -719,6 +731,8 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
ccl_global float *buffer,
int sample, int x, int y, int offset, int stride)
{
+ PROFILING_INIT(kg, PROFILING_RAY_SETUP);
+
/* buffer offset */
int index = offset + x + y*stride;
int pass_stride = kernel_data.film.pass_stride;
diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h
index ff4a8a9d580..962776f21c1 100644
--- a/intern/cycles/kernel/kernel_path_subsurface.h
+++ b/intern/cycles/kernel/kernel_path_subsurface.h
@@ -32,6 +32,8 @@ bool kernel_path_subsurface_scatter(
ccl_addr_space float3 *throughput,
ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
{
+ PROFILING_INIT(kg, PROFILING_SUBSURFACE);
+
float bssrdf_u, bssrdf_v;
path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index c9202ccb16d..0d18a1e8c77 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -217,6 +217,8 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg,
ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state,
PathRadiance *L)
{
+ PROFILING_INIT(kg, PROFILING_CONNECT_LIGHT);
+
#ifdef __EMISSION__
if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
return;
@@ -274,6 +276,8 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
PathRadianceState *L_state,
ccl_addr_space Ray *ray)
{
+ PROFILING_INIT(kg, PROFILING_SURFACE_BOUNCE);
+
/* no BSDF? we can stop here */
if(sd->flag & SD_BSDF) {
/* sample BSDF */
diff --git a/intern/cycles/kernel/kernel_profiling.h b/intern/cycles/kernel/kernel_profiling.h
new file mode 100644
index 00000000000..a46d6376473
--- /dev/null
+++ b/intern/cycles/kernel/kernel_profiling.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_PROFILING_H__
+#define __KERNEL_PROFILING_H__
+
+#ifdef __KERNEL_CPU__
+# include "util/util_profiling.h"
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_CPU__
+# define PROFILING_INIT(kg, event) ProfilingHelper profiling_helper(&kg->profiler, event)
+# define PROFILING_EVENT(event) profiling_helper.set_event(event)
+# define PROFILING_SHADER(shader) if((shader) != SHADER_NONE) { profiling_helper.set_shader((shader) & SHADER_MASK); }
+# define PROFILING_OBJECT(object) if((object) != PRIM_NONE) { profiling_helper.set_object(object); }
+#else
+# define PROFILING_INIT(kg, event)
+# define PROFILING_EVENT(event)
+# define PROFILING_SHADER(shader)
+# define PROFILING_OBJECT(object)
+#endif /* __KERNEL_CPU__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_PROFILING_H__ */
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index af883aa715b..4b2e675bb21 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -54,6 +54,8 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
const Intersection *isect,
const Ray *ray)
{
+ PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
+
#ifdef __INSTANCING__
sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
#endif
@@ -147,6 +149,9 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
differential_incoming(&sd->dI, ray->dD);
differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
#endif
+
+ PROFILING_SHADER(sd->shader);
+ PROFILING_OBJECT(sd->object);
}
/* ShaderData setup from BSSRDF scatter */
@@ -163,6 +168,8 @@ void shader_setup_from_subsurface(
const Intersection *isect,
const Ray *ray)
{
+ PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
+
const bool backfacing = sd->flag & SD_BACKFACING;
/* object, matrices, time, ray_length stay the same */
@@ -233,6 +240,8 @@ void shader_setup_from_subsurface(
differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
/* don't modify dP and dI */
# endif
+
+ PROFILING_SHADER(sd->shader);
}
#endif
@@ -249,6 +258,8 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
bool object_space,
int lamp)
{
+ PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
+
/* vectors */
sd->P = P;
sd->N = Ng;
@@ -353,6 +364,9 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
sd->du = differential_zero();
sd->dv = differential_zero();
#endif
+
+ PROFILING_SHADER(sd->shader);
+ PROFILING_OBJECT(sd->object);
}
/* ShaderData setup for displacement */
@@ -380,6 +394,8 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray)
{
+ PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
+
/* vectors */
sd->P = ray->D;
sd->N = -ray->D;
@@ -414,6 +430,9 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat
sd->du = differential_zero();
sd->dv = differential_zero();
#endif
+
+ PROFILING_SHADER(sd->shader);
+ PROFILING_OBJECT(sd->object);
}
/* ShaderData setup from point inside volume */
@@ -421,6 +440,8 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat
#ifdef __VOLUME__
ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray)
{
+ PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
+
/* vectors */
sd->P = ray->P;
sd->N = -ray->D;
@@ -461,6 +482,9 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
/* for NDC coordinates */
sd->ray_P = ray->P;
sd->ray_dP = ray->dP;
+
+ PROFILING_SHADER(sd->shader);
+ PROFILING_OBJECT(sd->object);
}
#endif /* __VOLUME__ */
@@ -591,6 +615,8 @@ void shader_bsdf_eval(KernelGlobals *kg,
float light_pdf,
bool use_mis)
{
+ PROFILING_INIT(kg, PROFILING_CLOSURE_EVAL);
+
bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass);
#ifdef __BRANCHED_PATH__
@@ -720,6 +746,8 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
differential3 *domega_in,
float *pdf)
{
+ PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE);
+
const ShaderClosure *sc = shader_bsdf_pick(sd, &randu);
if(sc == NULL) {
*pdf = 0.0f;
@@ -751,6 +779,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, ShaderData *sd,
const ShaderClosure *sc, float randu, float randv, BsdfEval *bsdf_eval,
float3 *omega_in, differential3 *domega_in, float *pdf)
{
+ PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE);
+
int label;
float3 eval;
@@ -984,6 +1014,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
ccl_addr_space PathState *state, int path_flag)
{
+ PROFILING_INIT(kg, PROFILING_SHADER_EVAL);
+
/* If path is being terminated, we are tracing a shadow ray or evaluating
* emission, then we don't need to store closures. The emission and shadow
* shader data also do not have a closure array to save GPU memory. */
@@ -1084,6 +1116,8 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con
ccl_device void shader_volume_phase_eval(KernelGlobals *kg, const ShaderData *sd,
const float3 omega_in, BsdfEval *eval, float *pdf)
{
+ PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_EVAL);
+
bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass);
_shader_volume_phase_multi_eval(sd, omega_in, pdf, -1, eval, 0.0f, 0.0f);
@@ -1093,6 +1127,8 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, const ShaderData *s
float randu, float randv, BsdfEval *phase_eval,
float3 *omega_in, differential3 *domega_in, float *pdf)
{
+ PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE);
+
int sampled = 0;
if(sd->num_closure > 1) {
@@ -1151,6 +1187,8 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData *
const ShaderClosure *sc, float randu, float randv, BsdfEval *phase_eval,
float3 *omega_in, differential3 *domega_in, float *pdf)
{
+ PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE);
+
int label;
float3 eval;
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index dc7a1043208..b34d16c438b 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -335,6 +335,11 @@ uint Object::visibility_for_tracing() const {
return trace_visibility;
}
+int Object::get_device_index() const
+{
+ return index;
+}
+
/* Object Manager */
ObjectManager::ObjectManager()
@@ -348,10 +353,9 @@ ObjectManager::~ObjectManager()
}
void ObjectManager::device_update_object_transform(UpdateObjectTransformState *state,
- Object *ob,
- int object_index)
+ Object *ob)
{
- KernelObject& kobject = state->objects[object_index];
+ KernelObject& kobject = state->objects[ob->index];
Transform *object_motion_pass = state->object_motion_pass;
Mesh *mesh = ob->mesh;
@@ -457,13 +461,13 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
tfm_post = tfm_post * itfm;
}
- int motion_pass_offset = object_index*OBJECT_MOTION_PASS_SIZE;
+ int motion_pass_offset = ob->index*OBJECT_MOTION_PASS_SIZE;
object_motion_pass[motion_pass_offset + 0] = tfm_pre;
object_motion_pass[motion_pass_offset + 1] = tfm_post;
}
else if(state->need_motion == Scene::MOTION_BLUR) {
if(ob->use_motion()) {
- kobject.motion_offset = state->motion_offset[object_index];
+ kobject.motion_offset = state->motion_offset[ob->index];
/* Decompose transforms for interpolation. */
DecomposedTransform *decomp = state->object_motion + kobject.motion_offset;
@@ -494,7 +498,7 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
if(ob->use_holdout) {
flag |= SD_OBJECT_HOLDOUT_MASK;
}
- state->object_flag[object_index] = flag;
+ state->object_flag[ob->index] = flag;
/* Have curves. */
if(mesh->num_curves()) {
@@ -538,7 +542,7 @@ void ObjectManager::device_update_object_transform_task(
for(int i = 0; i < num_objects; ++i) {
const int object_index = start_index + i;
Object *ob = state->scene->objects[object_index];
- device_update_object_transform(state, ob, object_index);
+ device_update_object_transform(state, ob);
}
}
}
@@ -593,10 +597,8 @@ void ObjectManager::device_update_transforms(DeviceScene *dscene,
* need some tweaks to make mid-complex scenes optimal.
*/
if(scene->objects.size() < 64) {
- int object_index = 0;
foreach(Object *ob, scene->objects) {
- device_update_object_transform(&state, ob, object_index);
- object_index++;
+ device_update_object_transform(&state, ob);
if(progress.get_cancel()) {
return;
}
@@ -642,6 +644,12 @@ void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *sc
if(scene->objects.size() == 0)
return;
+ /* Assign object IDs. */
+ int index = 0;
+ foreach(Object *object, scene->objects) {
+ object->index = index++;
+ }
+
/* set object transform matrices, before applying static transforms */
progress.set_status("Updating Objects", "Copying Transformations to device");
device_update_transforms(dscene, scene, progress);
@@ -686,26 +694,25 @@ void ObjectManager::device_update_flags(Device *,
}
}
- int object_index = 0;
foreach(Object *object, scene->objects) {
if(object->mesh->has_volume) {
- object_flag[object_index] |= SD_OBJECT_HAS_VOLUME;
- object_flag[object_index] &= ~SD_OBJECT_HAS_VOLUME_ATTRIBUTES;
+ object_flag[object->index] |= SD_OBJECT_HAS_VOLUME;
+ object_flag[object->index] &= ~SD_OBJECT_HAS_VOLUME_ATTRIBUTES;
foreach(Attribute& attr, object->mesh->attributes.attributes) {
if(attr.element == ATTR_ELEMENT_VOXEL) {
- object_flag[object_index] |= SD_OBJECT_HAS_VOLUME_ATTRIBUTES;
+ object_flag[object->index] |= SD_OBJECT_HAS_VOLUME_ATTRIBUTES;
}
}
}
else {
- object_flag[object_index] &= ~(SD_OBJECT_HAS_VOLUME|SD_OBJECT_HAS_VOLUME_ATTRIBUTES);
+ object_flag[object->index] &= ~(SD_OBJECT_HAS_VOLUME|SD_OBJECT_HAS_VOLUME_ATTRIBUTES);
}
if(object->is_shadow_catcher) {
- object_flag[object_index] |= SD_OBJECT_SHADOW_CATCHER;
+ object_flag[object->index] |= SD_OBJECT_SHADOW_CATCHER;
}
else {
- object_flag[object_index] &= ~SD_OBJECT_SHADOW_CATCHER;
+ object_flag[object->index] &= ~SD_OBJECT_SHADOW_CATCHER;
}
if(bounds_valid) {
@@ -714,7 +721,7 @@ void ObjectManager::device_update_flags(Device *,
continue;
}
if(object->bounds.intersects(volume_object->bounds)) {
- object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME;
+ object_flag[object->index] |= SD_OBJECT_INTERSECTS_VOLUME;
break;
}
}
@@ -723,9 +730,8 @@ void ObjectManager::device_update_flags(Device *,
/* Not really valid, but can't make more reliable in the case
* of bounds not being up to date.
*/
- object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME;
+ object_flag[object->index] |= SD_OBJECT_INTERSECTS_VOLUME;
}
- ++object_index;
}
/* Copy object flag. */
@@ -741,7 +747,6 @@ void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Sc
KernelObject *kobjects = dscene->objects.data();
bool update = false;
- int object_index = 0;
foreach(Object *object, scene->objects) {
Mesh* mesh = object->mesh;
@@ -750,18 +755,16 @@ void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Sc
uint patch_map_offset = 2*(mesh->patch_table_offset + mesh->patch_table->total_size() -
mesh->patch_table->num_nodes * PATCH_NODE_SIZE) - mesh->patch_offset;
- if(kobjects[object_index].patch_map_offset != patch_map_offset) {
- kobjects[object_index].patch_map_offset = patch_map_offset;
+ if(kobjects[object->index].patch_map_offset != patch_map_offset) {
+ kobjects[object->index].patch_map_offset = patch_map_offset;
update = true;
}
}
- if(kobjects[object_index].attribute_map_offset != mesh->attr_map_offset) {
- kobjects[object_index].attribute_map_offset = mesh->attr_map_offset;
+ if(kobjects[object->index].attribute_map_offset != mesh->attr_map_offset) {
+ kobjects[object->index].attribute_map_offset = mesh->attr_map_offset;
update = true;
}
-
- object_index++;
}
if(update) {
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index 87e6e6652ad..134f0bc3577 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -38,6 +38,7 @@ class Progress;
class Scene;
struct Transform;
struct UpdateObjectTransformState;
+class ObjectManager;
/* Object */
@@ -88,6 +89,16 @@ public:
* determined flags which denotes trace-time visibility.
*/
uint visibility_for_tracing() const;
+
+ /* Returns the index that is used in the kernel for this object. */
+ int get_device_index() const;
+
+protected:
+ /* Specifies the position of the object in scene->objects and
+ * in the device vectors. Gets set in device_update. */
+ int index;
+
+ friend class ObjectManager;
};
/* Object Manager */
@@ -123,8 +134,7 @@ public:
protected:
void device_update_object_transform(UpdateObjectTransformState *state,
- Object *ob,
- const int object_index);
+ Object *ob);
void device_update_object_transform_task(UpdateObjectTransformState *state);
bool device_update_object_transform_pop_work(
UpdateObjectTransformState *state,
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index d6ecafa19b7..a8374b662d0 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -250,7 +250,9 @@ void Session::run_gpu()
if(!no_tiles) {
/* update scene */
scoped_timer update_timer;
- update_scene();
+ if(update_scene()) {
+ stats.profiler.reset(scene->shaders.size(), scene->objects.size());
+ }
progress.add_skip_time(update_timer, params.background);
if(!device->error_message().empty())
@@ -585,7 +587,9 @@ void Session::run_cpu()
/* update scene */
scoped_timer update_timer;
- update_scene();
+ if(update_scene()) {
+ stats.profiler.reset(scene->shaders.size(), scene->objects.size());
+ }
progress.add_skip_time(update_timer, params.background);
if(!device->error_message().empty())
@@ -729,6 +733,10 @@ void Session::run()
/* load kernels */
load_kernels();
+ if(params.use_profiling && (params.device.type == DEVICE_CPU)) {
+ stats.profiler.start();
+ }
+
/* session thread loop */
progress.set_status("Waiting for render to start");
@@ -743,6 +751,8 @@ void Session::run()
run_cpu();
}
+ stats.profiler.stop();
+
/* progress update */
if(progress.get_cancel())
progress.set_status("Cancel", progress.get_cancel_message());
@@ -825,7 +835,7 @@ void Session::wait()
session_thread = NULL;
}
-void Session::update_scene()
+bool Session::update_scene()
{
thread_scoped_lock scene_lock(scene->mutex);
@@ -876,7 +886,10 @@ void Session::update_scene()
progress.set_status("Updating Scene");
MEM_GUARDED_CALL(&progress, scene->device_update, device, progress);
+
+ return true;
}
+ return false;
}
void Session::update_status_time(bool show_pause, bool show_done)
@@ -1052,6 +1065,14 @@ void Session::device_free()
*/
}
+void Session::collect_statistics(RenderStats *render_stats)
+{
+ scene->collect_statistics(render_stats);
+ if(params.use_profiling && (params.device.type == DEVICE_CPU)) {
+ render_stats->collect_profiling(scene, &stats);
+ }
+}
+
int Session::get_max_closure_count()
{
int max_closures = 0;
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 56a69919a7a..db07c758427 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -20,6 +20,7 @@
#include "render/buffers.h"
#include "device/device.h"
#include "render/shader.h"
+#include "render/stats.h"
#include "render/tile.h"
#include "util/util_progress.h"
@@ -55,6 +56,8 @@ public:
int pixel_size;
int threads;
+ bool use_profiling;
+
bool display_buffer_linear;
bool use_denoising;
@@ -89,6 +92,8 @@ public:
pixel_size = 1;
threads = 0;
+ use_profiling = false;
+
use_denoising = false;
denoising_passes = false;
denoising_radius = 8;
@@ -118,6 +123,7 @@ public:
&& start_resolution == params.start_resolution
&& pixel_size == params.pixel_size
&& threads == params.threads
+ && use_profiling == params.use_profiling
&& display_buffer_linear == params.display_buffer_linear
&& cancel_timeout == params.cancel_timeout
&& reset_timeout == params.reset_timeout
@@ -159,7 +165,7 @@ public:
void set_samples(int samples);
void set_pause(bool pause);
- void update_scene();
+ bool update_scene();
void load_kernels(bool lock_scene=true);
void device_free();
@@ -168,6 +174,8 @@ public:
* (for example, when rendering with unlimited samples). */
float get_progress();
+ void collect_statistics(RenderStats *stats);
+
protected:
struct DelayedReset {
thread_mutex mutex;
diff --git a/intern/cycles/render/stats.cpp b/intern/cycles/render/stats.cpp
index 101d33fcf65..71d9353ad3d 100644
--- a/intern/cycles/render/stats.cpp
+++ b/intern/cycles/render/stats.cpp
@@ -15,6 +15,7 @@
*/
#include "render/stats.h"
+#include "render/object.h"
#include "util/util_algorithm.h"
#include "util/util_foreach.h"
#include "util/util_string.h"
@@ -33,6 +34,16 @@ bool namedSizeEntryComparator(const NamedSizeEntry& a, const NamedSizeEntry& b)
return a.size > b.size;
}
+bool namedTimeSampleEntryComparator(const NamedNestedSampleStats& a, const NamedNestedSampleStats& b)
+{
+ return a.sum_samples > b.sum_samples;
+}
+
+bool namedSampleCountPairComparator(const NamedSampleCountPair& a, const NamedSampleCountPair& b)
+{
+ return a.samples > b.samples;
+}
+
} // namespace
NamedSizeEntry::NamedSizeEntry()
@@ -77,6 +88,113 @@ string NamedSizeStats::full_report(int indent_level)
return result;
}
+/* Named time sample statistics. */
+
+NamedNestedSampleStats::NamedNestedSampleStats()
+ : name(""), self_samples(0), sum_samples(0)
+{}
+
+NamedNestedSampleStats::NamedNestedSampleStats(const string& name, uint64_t samples)
+ : name(name), self_samples(samples), sum_samples(samples)
+{}
+
+NamedNestedSampleStats& NamedNestedSampleStats::add_entry(const string& name_, uint64_t samples_)
+{
+ entries.push_back(NamedNestedSampleStats(name_, samples_));
+ return entries[entries.size()-1];
+}
+
+void NamedNestedSampleStats::update_sum()
+{
+ sum_samples = self_samples;
+ foreach(NamedNestedSampleStats& entry, entries) {
+ entry.update_sum();
+ sum_samples += entry.sum_samples;
+ }
+}
+
+string NamedNestedSampleStats::full_report(int indent_level, uint64_t total_samples)
+{
+ update_sum();
+
+ if(total_samples == 0) {
+ total_samples = sum_samples;
+ }
+
+ const string indent(indent_level * kIndentNumSpaces, ' ');
+
+ const double sum_percent = 100*((double) sum_samples) / total_samples;
+ const double sum_seconds = sum_samples * 0.001;
+ const double self_percent = 100*((double) self_samples) / total_samples;
+ const double self_seconds = self_samples * 0.001;
+ string info = string_printf("%-32s: Total %3.2f%% (%.2fs), Self %3.2f%% (%.2fs)\n",
+ name.c_str(),
+ sum_percent,
+ sum_seconds,
+ self_percent,
+ self_seconds);
+ string result = indent + info;
+
+ sort(entries.begin(), entries.end(), namedTimeSampleEntryComparator);
+ foreach(NamedNestedSampleStats& entry, entries) {
+ result += entry.full_report(indent_level + 1, total_samples);
+ }
+ return result;
+}
+
+/* Named sample count pairs. */
+
+NamedSampleCountPair::NamedSampleCountPair(const ustring& name, uint64_t samples, uint64_t hits)
+ : name(name), samples(samples), hits(hits)
+{}
+
+NamedSampleCountStats::NamedSampleCountStats()
+{}
+
+void NamedSampleCountStats::add(const ustring& name, uint64_t samples, uint64_t hits)
+{
+ entry_map::iterator entry = entries.find(name);
+ if(entry != entries.end()) {
+ entry->second.samples += samples;
+ entry->second.hits += hits;
+ return;
+ }
+ entries.emplace(name, NamedSampleCountPair(name, samples, hits));
+}
+
+string NamedSampleCountStats::full_report(int indent_level)
+{
+ const string indent(indent_level * kIndentNumSpaces, ' ');
+
+ vector<NamedSampleCountPair> sorted_entries;
+ sorted_entries.reserve(entries.size());
+
+ uint64_t total_hits = 0, total_samples = 0;
+ foreach(entry_map::const_reference entry, entries) {
+ const NamedSampleCountPair &pair = entry.second;
+
+ total_hits += pair.hits;
+ total_samples += pair.samples;
+
+ sorted_entries.push_back(pair);
+ }
+ const double avg_samples_per_hit = ((double) total_samples) / total_hits;
+
+ sort(sorted_entries.begin(), sorted_entries.end(), namedSampleCountPairComparator);
+
+ string result = "";
+ foreach(const NamedSampleCountPair& entry, sorted_entries) {
+ const double seconds = entry.samples * 0.001;
+ const double relative = ((double) entry.samples) / (entry.hits * avg_samples_per_hit);
+
+ result += indent + string_printf("%-32s: %.2fs (Relative cost: %.2f)\n",
+ entry.name.c_str(),
+ seconds,
+ relative);
+ }
+ return result;
+}
+
/* Mesh statistics. */
MeshStats::MeshStats() {
@@ -106,6 +224,74 @@ string ImageStats::full_report(int indent_level)
/* Overall statistics. */
RenderStats::RenderStats() {
+ has_profiling = false;
+}
+
+void RenderStats::collect_profiling(Scene *scene, Stats *stats)
+{
+ has_profiling = true;
+
+ Profiler &prof = stats->profiler;
+
+ kernel = NamedNestedSampleStats("Total render time", prof.get_event(PROFILING_UNKNOWN));
+
+ kernel.add_entry("Ray setup", prof.get_event(PROFILING_RAY_SETUP));
+ kernel.add_entry("Result writing", prof.get_event(PROFILING_WRITE_RESULT));
+
+ NamedNestedSampleStats &integrator = kernel.add_entry("Path integration", prof.get_event(PROFILING_PATH_INTEGRATE));
+ integrator.add_entry("Scene intersection", prof.get_event(PROFILING_SCENE_INTERSECT));
+ integrator.add_entry("Indirect emission", prof.get_event(PROFILING_INDIRECT_EMISSION));
+ integrator.add_entry("Volumes", prof.get_event(PROFILING_VOLUME));
+
+ NamedNestedSampleStats &shading = integrator.add_entry("Shading", 0);
+ shading.add_entry("Shader Setup", prof.get_event(PROFILING_SHADER_SETUP));
+ shading.add_entry("Shader Eval", prof.get_event(PROFILING_SHADER_EVAL));
+ shading.add_entry("Shader Apply", prof.get_event(PROFILING_SHADER_APPLY));
+ shading.add_entry("Ambient Occlusion", prof.get_event(PROFILING_AO));
+ shading.add_entry("Subsurface", prof.get_event(PROFILING_SUBSURFACE));
+
+ integrator.add_entry("Connect Light", prof.get_event(PROFILING_CONNECT_LIGHT));
+ integrator.add_entry("Surface Bounce", prof.get_event(PROFILING_SURFACE_BOUNCE));
+
+ NamedNestedSampleStats &intersection = kernel.add_entry("Intersection", 0);
+ intersection.add_entry("Full Intersection", prof.get_event(PROFILING_INTERSECT));
+ intersection.add_entry("Local Intersection", prof.get_event(PROFILING_INTERSECT_LOCAL));
+ intersection.add_entry("Shadow All Intersection", prof.get_event(PROFILING_INTERSECT_SHADOW_ALL));
+ intersection.add_entry("Volume Intersection", prof.get_event(PROFILING_INTERSECT_VOLUME));
+ intersection.add_entry("Volume All Intersection", prof.get_event(PROFILING_INTERSECT_VOLUME_ALL));
+
+ NamedNestedSampleStats &closure = kernel.add_entry("Closures", 0);
+ closure.add_entry("Surface Closure Evaluation", prof.get_event(PROFILING_CLOSURE_EVAL));
+ closure.add_entry("Surface Closure Sampling", prof.get_event(PROFILING_CLOSURE_SAMPLE));
+ closure.add_entry("Volume Closure Evaluation", prof.get_event(PROFILING_CLOSURE_VOLUME_EVAL));
+ closure.add_entry("Volume Closure Sampling", prof.get_event(PROFILING_CLOSURE_VOLUME_SAMPLE));
+
+ NamedNestedSampleStats &denoising = kernel.add_entry("Denoising", prof.get_event(PROFILING_DENOISING));
+ denoising.add_entry("Construct Transform", prof.get_event(PROFILING_DENOISING_CONSTRUCT_TRANSFORM));
+ denoising.add_entry("Reconstruct", prof.get_event(PROFILING_DENOISING_RECONSTRUCT));
+
+ NamedNestedSampleStats &prefilter = denoising.add_entry("Prefiltering", 0);
+ prefilter.add_entry("Divide Shadow", prof.get_event(PROFILING_DENOISING_DIVIDE_SHADOW));
+ prefilter.add_entry("Non-Local means", prof.get_event(PROFILING_DENOISING_NON_LOCAL_MEANS));
+ prefilter.add_entry("Get Feature", prof.get_event(PROFILING_DENOISING_GET_FEATURE));
+ prefilter.add_entry("Detect Outliers", prof.get_event(PROFILING_DENOISING_DETECT_OUTLIERS));
+ prefilter.add_entry("Combine Halves", prof.get_event(PROFILING_DENOISING_COMBINE_HALVES));
+
+ shaders.entries.clear();
+ foreach(Shader *shader, scene->shaders) {
+ uint64_t samples, hits;
+ if(prof.get_shader(shader->id, samples, hits)) {
+ shaders.add(shader->name, samples, hits);
+ }
+ }
+
+ objects.entries.clear();
+ foreach(Object *object, scene->objects) {
+ uint64_t samples, hits;
+ if(prof.get_object(object->get_device_index(), samples, hits)) {
+ objects.add(object->name, samples, hits);
+ }
+ }
}
string RenderStats::full_report()
@@ -113,6 +299,14 @@ string RenderStats::full_report()
string result = "";
result += "Mesh statistics:\n" + mesh.full_report(1);
result += "Image statistics:\n" + image.full_report(1);
+ if(has_profiling) {
+ result += "Kernel statistics:\n" + kernel.full_report(1);
+ result += "Shader statistics:\n" + shaders.full_report(1);
+ result += "Object statistics:\n" + objects.full_report(1);
+ }
+ else {
+ result += "Profiling information not available (only works with CPU rendering)";
+ }
return result;
}
diff --git a/intern/cycles/render/stats.h b/intern/cycles/render/stats.h
index 2ff0ec3e0e9..ce0529bb8bd 100644
--- a/intern/cycles/render/stats.h
+++ b/intern/cycles/render/stats.h
@@ -17,6 +17,9 @@
#ifndef __RENDER_STATS_H__
#define __RENDER_STATS_H__
+#include "render/scene.h"
+
+#include "util/util_stats.h"
#include "util/util_string.h"
#include "util/util_vector.h"
@@ -61,6 +64,51 @@ public:
vector<NamedSizeEntry> entries;
};
+class NamedNestedSampleStats {
+public:
+ NamedNestedSampleStats();
+ NamedNestedSampleStats(const string& name, uint64_t samples);
+
+ NamedNestedSampleStats& add_entry(const string& name, uint64_t samples);
+
+ /* Updates sum_samples recursively. */
+ void update_sum();
+
+ string full_report(int indent_level = 0, uint64_t total_samples = 0);
+
+ string name;
+
+ /* self_samples contains only the samples that this specific event got,
+ * while sum_samples also includes the samples of all sub-entries. */
+ uint64_t self_samples, sum_samples;
+
+ vector<NamedNestedSampleStats> entries;
+};
+
+/* Named entry containing both a time-sample count for objects of a type and a
+ * total count of processed items.
+ * This makes it possible to estimate the time spent per item. */
+class NamedSampleCountPair {
+public:
+ NamedSampleCountPair(const ustring& name, uint64_t samples, uint64_t hits);
+
+ ustring name;
+ uint64_t samples;
+ uint64_t hits;
+};
+
+/* Contains statistics about pairs of samples and counts as described above. */
+class NamedSampleCountStats {
+public:
+ NamedSampleCountStats();
+
+ string full_report(int indent_level = 0);
+ void add(const ustring& name, uint64_t samples, uint64_t hits);
+
+ typedef unordered_map<ustring, NamedSampleCountPair, ustringHash> entry_map;
+ entry_map entries;
+};
+
/* Statistics about mesh in the render database. */
class MeshStats {
public:
@@ -95,8 +143,16 @@ public:
/* Return full report as string. */
string full_report();
+ /* Collect kernel sampling information from Stats. */
+ void collect_profiling(Scene *scene, Stats *stats);
+
+ bool has_profiling;
+
MeshStats mesh;
ImageStats image;
+ NamedNestedSampleStats kernel;
+ NamedSampleCountStats shaders;
+ NamedSampleCountStats objects;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 77d47984ee7..92dfc9fa85d 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -17,6 +17,7 @@ set(SRC
util_md5.cpp
util_murmurhash.cpp
util_path.cpp
+ util_profiling.cpp
util_string.cpp
util_simd.cpp
util_system.cpp
@@ -71,6 +72,7 @@ set(SRC_HEADERS
util_optimization.h
util_param.h
util_path.h
+ util_profiling.h
util_progress.h
util_projection.h
util_queue.h
diff --git a/intern/cycles/util/util_profiling.cpp b/intern/cycles/util/util_profiling.cpp
new file mode 100644
index 00000000000..30aaef69310
--- /dev/null
+++ b/intern/cycles/util/util_profiling.cpp
@@ -0,0 +1,178 @@
+/*
+ * Copyright 2011-2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/util_algorithm.h"
+#include "util/util_profiling.h"
+#include "util/util_set.h"
+
+CCL_NAMESPACE_BEGIN
+
+Profiler::Profiler()
+ : do_stop_worker(true), worker(NULL)
+{
+}
+
+Profiler::~Profiler()
+{
+ assert(worker == NULL);
+}
+
+void Profiler::run()
+{
+ uint64_t updates = 0;
+ auto start_time = std::chrono::system_clock::now();
+ while(!do_stop_worker) {
+ thread_scoped_lock lock(mutex);
+ foreach(ProfilingState *state, states) {
+ uint32_t cur_event = state->event;
+ int32_t cur_shader = state->shader;
+ int32_t cur_object = state->object;
+
+			/* The state reads/writes should be atomic, but just to be sure
+			 * check the values for validity anyway. */
+ if(cur_event < PROFILING_NUM_EVENTS) {
+ event_samples[cur_event]++;
+ }
+
+ if(cur_shader >= 0 && cur_shader < shader_samples.size()) {
+ /* Only consider the active shader during events whose runtime significantly depends on it. */
+ if(((cur_event >= PROFILING_SHADER_EVAL ) && (cur_event <= PROFILING_SUBSURFACE)) ||
+ ((cur_event >= PROFILING_CLOSURE_EVAL) && (cur_event <= PROFILING_CLOSURE_VOLUME_SAMPLE))) {
+ shader_samples[cur_shader]++;
+ }
+ }
+
+ if(cur_object >= 0 && cur_object < object_samples.size()) {
+ object_samples[cur_object]++;
+ }
+ }
+ lock.unlock();
+
+ /* Relative waits always overshoot a bit, so just waiting 1ms every
+ * time would cause the sampling to drift over time.
+ * By keeping track of the absolute time, the wait times correct themselves -
+ * if one wait overshoots a lot, the next one will be shorter to compensate. */
+ updates++;
+ std::this_thread::sleep_until(start_time + updates*std::chrono::milliseconds(1));
+ }
+}
+
+void Profiler::reset(int num_shaders, int num_objects)
+{
+ bool running = (worker != NULL);
+ if(running) {
+ stop();
+ }
+
+ /* Resize and clear the accumulation vectors. */
+ shader_hits.assign(num_shaders, 0);
+ object_hits.assign(num_objects, 0);
+
+ event_samples.assign(PROFILING_NUM_EVENTS, 0);
+ shader_samples.assign(num_shaders, 0);
+ object_samples.assign(num_objects, 0);
+
+ if(running) {
+ start();
+ }
+}
+
+void Profiler::start()
+{
+ assert(worker == NULL);
+ do_stop_worker = false;
+ worker = new thread(function_bind(&Profiler::run, this));
+}
+
+void Profiler::stop()
+{
+ if(worker != NULL) {
+ do_stop_worker = true;
+
+ worker->join();
+ delete worker;
+ worker = NULL;
+ }
+}
+
+void Profiler::add_state(ProfilingState *state)
+{
+ thread_scoped_lock lock(mutex);
+
+	/* Add the ProfilingState to the list of sampled states. */
+ assert(std::find(states.begin(), states.end(), state) == states.end());
+ states.push_back(state);
+
+ /* Resize thread-local hit counters. */
+ state->shader_hits.assign(shader_hits.size(), 0);
+ state->object_hits.assign(object_hits.size(), 0);
+
+ /* Initialize the state. */
+ state->event = PROFILING_UNKNOWN;
+ state->shader = -1;
+ state->object = -1;
+ state->active = true;
+}
+
+void Profiler::remove_state(ProfilingState *state)
+{
+ thread_scoped_lock lock(mutex);
+
+ /* Remove the ProfilingState from the list of sampled states. */
+ states.erase(std::remove(states.begin(), states.end(), state), states.end());
+ state->active = false;
+
+ /* Merge thread-local hit counters. */
+ assert(shader_hits.size() == state->shader_hits.size());
+ for(int i = 0; i < shader_hits.size(); i++) {
+ shader_hits[i] += state->shader_hits[i];
+ }
+
+ assert(object_hits.size() == state->object_hits.size());
+ for(int i = 0; i < object_hits.size(); i++) {
+ object_hits[i] += state->object_hits[i];
+ }
+}
+
+uint64_t Profiler::get_event(ProfilingEvent event)
+{
+ assert(worker == NULL);
+ return event_samples[event];
+}
+
+bool Profiler::get_shader(int shader, uint64_t &samples, uint64_t &hits)
+{
+ assert(worker == NULL);
+ if(shader_samples[shader] == 0) {
+ return false;
+ }
+ samples = shader_samples[shader];
+ hits = shader_hits[shader];
+ return true;
+}
+
+bool Profiler::get_object(int object, uint64_t &samples, uint64_t &hits)
+{
+ assert(worker == NULL);
+ if(object_samples[object] == 0) {
+ return false;
+ }
+ samples = object_samples[object];
+ hits = object_hits[object];
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_profiling.h b/intern/cycles/util/util_profiling.h
new file mode 100644
index 00000000000..4460402b12d
--- /dev/null
+++ b/intern/cycles/util/util_profiling.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright 2011-2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_PROFILING_H__
+#define __UTIL_PROFILING_H__
+
+#include <atomic>
+
+#include "util/util_foreach.h"
+#include "util/util_map.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+enum ccl_try_align(16) ProfilingEvent : uint32_t {
+ PROFILING_UNKNOWN,
+ PROFILING_RAY_SETUP,
+ PROFILING_PATH_INTEGRATE,
+ PROFILING_SCENE_INTERSECT,
+ PROFILING_INDIRECT_EMISSION,
+ PROFILING_VOLUME,
+ PROFILING_SHADER_SETUP,
+ PROFILING_SHADER_EVAL,
+ PROFILING_SHADER_APPLY,
+ PROFILING_AO,
+ PROFILING_SUBSURFACE,
+ PROFILING_CONNECT_LIGHT,
+ PROFILING_SURFACE_BOUNCE,
+ PROFILING_WRITE_RESULT,
+
+ PROFILING_INTERSECT,
+ PROFILING_INTERSECT_LOCAL,
+ PROFILING_INTERSECT_SHADOW_ALL,
+ PROFILING_INTERSECT_VOLUME,
+ PROFILING_INTERSECT_VOLUME_ALL,
+
+ PROFILING_CLOSURE_EVAL,
+ PROFILING_CLOSURE_SAMPLE,
+ PROFILING_CLOSURE_VOLUME_EVAL,
+ PROFILING_CLOSURE_VOLUME_SAMPLE,
+
+ PROFILING_DENOISING,
+ PROFILING_DENOISING_CONSTRUCT_TRANSFORM,
+ PROFILING_DENOISING_RECONSTRUCT,
+ PROFILING_DENOISING_DIVIDE_SHADOW,
+ PROFILING_DENOISING_NON_LOCAL_MEANS,
+ PROFILING_DENOISING_COMBINE_HALVES,
+ PROFILING_DENOISING_GET_FEATURE,
+ PROFILING_DENOISING_DETECT_OUTLIERS,
+
+ PROFILING_NUM_EVENTS,
+};
+
+/* Contains the current execution state of a worker thread.
+ * These values are constantly updated by the worker.
+ * Periodically the profiler thread will wake up, read them
+ * and update its internal counters based on it.
+ *
+ * Atomics aren't needed here since we're only doing direct
+ * writes and reads to (4-byte-aligned) uint32_t, which is
+ * guaranteed to be atomic on x86 since the 486.
+ * Memory ordering is not guaranteed but does not matter.
+ *
+ * And even on other architectures, the extremely rare corner
+ * case of reading an intermediate state could at worst result
+ * in a single incorrect sample. */
+struct ProfilingState {
+ volatile uint32_t event = PROFILING_UNKNOWN;
+ volatile int32_t shader = -1;
+ volatile int32_t object = -1;
+ volatile bool active = false;
+
+ vector<uint64_t> shader_hits;
+ vector<uint64_t> object_hits;
+};
+
+class Profiler {
+public:
+ Profiler();
+ ~Profiler();
+
+ void reset(int num_shaders, int num_objects);
+
+ void start();
+ void stop();
+
+ void add_state(ProfilingState *state);
+ void remove_state(ProfilingState *state);
+
+ uint64_t get_event(ProfilingEvent event);
+ bool get_shader(int shader, uint64_t &samples, uint64_t &hits);
+ bool get_object(int object, uint64_t &samples, uint64_t &hits);
+
+protected:
+ void run();
+
+ /* Tracks how often the worker was in each ProfilingEvent while sampling,
+ * so multiplying the values by the sample frequency (currently 1ms)
+ * gives the approximate time spent in each state. */
+ vector<uint64_t> event_samples;
+ vector<uint64_t> shader_samples;
+ vector<uint64_t> object_samples;
+
+ /* Tracks the total amounts every object/shader was hit.
+ * Used to evaluate relative cost, written by the render thread.
+ * Indexed by the shader and object IDs that the kernel also uses
+ * to index __object_flag and __shaders. */
+ vector<uint64_t> shader_hits;
+ vector<uint64_t> object_hits;
+
+ volatile bool do_stop_worker;
+ thread *worker;
+
+ thread_mutex mutex;
+ vector<ProfilingState*> states;
+};
+
+class ProfilingHelper {
+public:
+ ProfilingHelper(ProfilingState *state, ProfilingEvent event)
+ : state(state)
+ {
+ previous_event = state->event;
+ state->event = event;
+ }
+
+ inline void set_event(ProfilingEvent event)
+ {
+ state->event = event;
+ }
+
+ inline void set_shader(int shader)
+ {
+ state->shader = shader;
+ if(state->active) {
+ assert(shader < state->shader_hits.size());
+ state->shader_hits[shader]++;
+ }
+ }
+
+ inline void set_object(int object)
+ {
+ state->object = object;
+ if(state->active) {
+ assert(object < state->object_hits.size());
+ state->object_hits[object]++;
+ }
+ }
+
+ ~ProfilingHelper()
+ {
+ state->event = previous_event;
+ }
+private:
+ ProfilingState *state;
+ uint32_t previous_event;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_PROFILING_H__ */
diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h
index 0ba58422a67..1128bc3ced1 100644
--- a/intern/cycles/util/util_stats.h
+++ b/intern/cycles/util/util_stats.h
@@ -18,6 +18,7 @@
#define __UTIL_STATS_H__
#include "util/util_atomic.h"
+#include "util/util_profiling.h"
CCL_NAMESPACE_BEGIN
@@ -40,6 +41,8 @@ public:
size_t mem_used;
size_t mem_peak;
+
+ Profiler profiler;
};
CCL_NAMESPACE_END