diff options
26 files changed, 837 insertions, 36 deletions
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 75c7dcee05e..30ae0bc813d 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -516,7 +516,7 @@ void BlenderSession::render() if(!b_engine.is_preview() && background && print_render_stats) { RenderStats stats; - session->scene->collect_statistics(&stats); + session->collect_statistics(&stats); printf("Render statistics:\n%s\n", stats.full_report().c_str()); } diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 832847c179f..49b046d0a88 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -874,7 +874,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine, } /* tiles */ - if(params.device.type != DEVICE_CPU && !background) { + const bool is_cpu = (params.device.type == DEVICE_CPU); + if(!is_cpu && !background) { /* currently GPU could be much slower than CPU when using tiles, * still need to be investigated, but meanwhile make it possible * to work in viewport smoothly @@ -960,6 +961,9 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine, params.progressive_update_timeout = 0.1; } + params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && + background && BlenderSession::print_render_stats; + return params; } diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 7e20bb449c3..54ffd4bc4df 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -362,6 +362,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int th info.has_half_images = true; info.has_volume_decoupled = true; info.has_osl = true; + info.has_profiling = true; foreach(const DeviceInfo &device, subdevices) { /* Ensure CPU device does not slow down GPU. 
*/ @@ -396,6 +397,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int th info.has_half_images &= device.has_half_images; info.has_volume_decoupled &= device.has_volume_decoupled; info.has_osl &= device.has_osl; + info.has_profiling &= device.has_profiling; } return info; diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index f3fb338e638..071f61a7566 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -60,6 +60,7 @@ public: bool has_volume_decoupled; /* Decoupled volume shading. */ bool has_osl; /* Support Open Shading Language. */ bool use_split_kernel; /* Use split or mega kernel. */ + bool has_profiling; /* Supports runtime collection of profiling info. */ int cpu_threads; vector<DeviceInfo> multi_devices; @@ -75,6 +76,7 @@ public: has_volume_decoupled = false; has_osl = false; use_split_kernel = false; + has_profiling = false; } bool operator==(const DeviceInfo &info) { diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 36fe9bfc92b..f0a6fd6e3f4 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -477,6 +477,8 @@ public: bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, DenoisingTask *task) { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS); + int4 rect = task->rect; int r = task->nlm_state.r; int f = task->nlm_state.f; @@ -529,6 +531,8 @@ public: bool denoising_construct_transform(DenoisingTask *task) { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM); + for(int y = 0; y < task->filter_area.w; y++) { for(int x = 0; x < task->filter_area.z; x++) { filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, @@ -551,6 +555,8 @@ public: device_ptr output_ptr, DenoisingTask *task) { + ProfilingHelper profiling(task->profiler, 
PROFILING_DENOISING_RECONSTRUCT); + mem_zero(task->storage.XtWX); mem_zero(task->storage.XtWY); @@ -609,8 +615,10 @@ public: bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, device_ptr mean_ptr, device_ptr variance_ptr, - int r, int4 rect, DenoisingTask * /*task*/) + int r, int4 rect, DenoisingTask *task) { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES); + for(int y = rect.y; y < rect.w; y++) { for(int x = rect.x; x < rect.z; x++) { filter_combine_halves_kernel()(x, y, @@ -629,6 +637,8 @@ public: device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, device_ptr buffer_variance_ptr, DenoisingTask *task) { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW); + for(int y = task->rect.y; y < task->rect.w; y++) { for(int x = task->rect.x; x < task->rect.z; x++) { filter_divide_shadow_kernel()(task->render_buffer.samples, @@ -653,6 +663,8 @@ public: device_ptr variance_ptr, DenoisingTask *task) { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE); + for(int y = task->rect.y; y < task->rect.w; y++) { for(int x = task->rect.x; x < task->rect.z; x++) { filter_get_feature_kernel()(task->render_buffer.samples, @@ -676,6 +688,8 @@ public: device_ptr output_ptr, DenoisingTask *task) { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS); + for(int y = task->rect.y; y < task->rect.w; y++) { for(int x = task->rect.x; x < task->rect.z; x++) { filter_detect_outliers_kernel()(x, y, @@ -735,6 +749,8 @@ public: void denoise(DenoisingTask& denoising, RenderTile &tile) { + ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); + tile.sample = tile.start_sample + tile.num_samples; denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); @@ -765,6 +781,8 @@ public: KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init()); + 
stats.profiler.add_state(&kg->profiler); + CPUSplitKernel *split_kernel = NULL; if(use_split_kernel) { split_kernel = new CPUSplitKernel(this); @@ -778,6 +796,7 @@ public: RenderTile tile; DenoisingTask denoising(this, task); + denoising.profiler = &kg->profiler; while(task.acquire_tile(this, tile)) { if(tile.task == RenderTile::PATH_TRACE) { @@ -802,6 +821,8 @@ public: } } + stats.profiler.remove_state(&kg->profiler); + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); kg->~KernelGlobals(); kgbuffer.free(); @@ -1061,6 +1082,7 @@ void device_cpu_info(vector<DeviceInfo>& devices) info.has_volume_decoupled = true; info.has_osl = true; info.has_half_images = true; + info.has_profiling = true; devices.insert(devices.begin(), info); } diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 78c65a3d22d..433cbd3c265 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -22,6 +22,7 @@ CCL_NAMESPACE_BEGIN DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE), + profiler(NULL), storage(device), buffer(device), device(device) diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index 8e0666d0e59..beae60c220f 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -23,6 +23,8 @@ #include "kernel/filter/filter_defines.h" +#include "util/util_profiling.h" + CCL_NAMESPACE_BEGIN class DenoisingTask { @@ -51,6 +53,8 @@ public: TileInfo *tile_info; device_vector<int> tile_info_mem; + ProfilingState *profiler; + int4 rect; int4 filter_area; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 92cb66bdec9..d4145225b77 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -110,6 +110,7 @@ set(SRC_HEADERS 
kernel_path_surface.h kernel_path_subsurface.h kernel_path_volume.h + kernel_profiling.h kernel_projection.h kernel_queues.h kernel_random.h diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index 6708a3efac1..284b1e9208c 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -186,6 +186,8 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, float difl, float extmax) { + PROFILING_INIT(kg, PROFILING_INTERSECT); + if(!scene_intersect_valid(&ray)) { return false; } @@ -248,6 +250,8 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg, uint *lcg_state, int max_hits) { + PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL); + if(!scene_intersect_valid(&ray)) { return false; } @@ -327,6 +331,8 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, uint max_hits, uint *num_hits) { + PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL); + if(!scene_intersect_valid(ray)) { return false; } @@ -407,6 +413,8 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg, Intersection *isect, const uint visibility) { + PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME); + if(!scene_intersect_valid(ray)) { return false; } @@ -438,6 +446,8 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg, const uint max_hits, const uint visibility) { + PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_ALL); + if(!scene_intersect_valid(ray)) { return false; } diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index 37402f42863..59f1e252d21 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -19,6 +19,8 @@ #ifndef __KERNEL_GLOBALS_H__ #define __KERNEL_GLOBALS_H__ +#include "kernel/kernel_profiling.h" + #ifdef __KERNEL_CPU__ # include "util/util_vector.h" # include "util/util_map.h" @@ -82,6 +84,8 @@ typedef struct KernelGlobals { int2 global_size; int2 global_id; + + ProfilingState profiler; } 
KernelGlobals; #endif /* __KERNEL_CPU__ */ diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 1f5929e4938..08e9db05c39 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -376,6 +376,9 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg, int sample, PathRadiance *L) { + PROFILING_INIT(kg, PROFILING_WRITE_RESULT); + PROFILING_OBJECT(PRIM_NONE); + float alpha; float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha); diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index cb1f410b09f..a1fc6028293 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -57,6 +57,8 @@ ccl_device_forceinline bool kernel_path_scene_intersect( Intersection *isect, PathRadiance *L) { + PROFILING_INIT(kg, PROFILING_SCENE_INTERSECT); + uint visibility = path_state_ray_visibility(kg, state); if(path_state_ao_bounce(kg, state)) { @@ -105,6 +107,8 @@ ccl_device_forceinline void kernel_path_lamp_emission( ShaderData *emission_sd, PathRadiance *L) { + PROFILING_INIT(kg, PROFILING_INDIRECT_EMISSION); + #ifdef __LAMP_MIS__ if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { /* ray starting from previous non-transparent bounce */ @@ -172,6 +176,8 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume( ShaderData *emission_sd, PathRadiance *L) { + PROFILING_INIT(kg, PROFILING_VOLUME); + /* Sanitize volume stack. 
*/ if(!hit) { kernel_volume_clean_stack(kg, state->volume_stack); @@ -278,6 +284,8 @@ ccl_device_forceinline bool kernel_path_shader_apply( PathRadiance *L, ccl_global float *buffer) { + PROFILING_INIT(kg, PROFILING_SHADER_APPLY); + #ifdef __SHADOW_TRICKS__ if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { if(state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) { @@ -355,6 +363,8 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, float3 throughput, float3 ao_alpha) { + PROFILING_INIT(kg, PROFILING_AO); + /* todo: solve correlation */ float bsdf_u, bsdf_v; @@ -568,6 +578,8 @@ ccl_device_forceinline void kernel_path_integrate( ccl_global float *buffer, ShaderData *emission_sd) { + PROFILING_INIT(kg, PROFILING_PATH_INTEGRATE); + /* Shader data memory used for both volumes and surfaces, saves stack space. */ ShaderData sd; @@ -719,6 +731,8 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride) { + PROFILING_INIT(kg, PROFILING_RAY_SETUP); + /* buffer offset */ int index = offset + x + y*stride; int pass_stride = kernel_data.film.pass_stride; diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h index ff4a8a9d580..962776f21c1 100644 --- a/intern/cycles/kernel/kernel_path_subsurface.h +++ b/intern/cycles/kernel/kernel_path_subsurface.h @@ -32,6 +32,8 @@ bool kernel_path_subsurface_scatter( ccl_addr_space float3 *throughput, ccl_addr_space SubsurfaceIndirectRays *ss_indirect) { + PROFILING_INIT(kg, PROFILING_SUBSURFACE); + float bssrdf_u, bssrdf_v; path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index c9202ccb16d..0d18a1e8c77 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -217,6 +217,8 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, 
ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L) { + PROFILING_INIT(kg, PROFILING_CONNECT_LIGHT); + #ifdef __EMISSION__ if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) return; @@ -274,6 +276,8 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, PathRadianceState *L_state, ccl_addr_space Ray *ray) { + PROFILING_INIT(kg, PROFILING_SURFACE_BOUNCE); + /* no BSDF? we can stop here */ if(sd->flag & SD_BSDF) { /* sample BSDF */ diff --git a/intern/cycles/kernel/kernel_profiling.h b/intern/cycles/kernel/kernel_profiling.h new file mode 100644 index 00000000000..a46d6376473 --- /dev/null +++ b/intern/cycles/kernel/kernel_profiling.h @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2018 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __KERNEL_PROFILING_H__ +#define __KERNEL_PROFILING_H__ + +#ifdef __KERNEL_CPU__ +# include "util/util_profiling.h" +#endif + +CCL_NAMESPACE_BEGIN + +#ifdef __KERNEL_CPU__ +# define PROFILING_INIT(kg, event) ProfilingHelper profiling_helper(&kg->profiler, event) +# define PROFILING_EVENT(event) profiling_helper.set_event(event) +# define PROFILING_SHADER(shader) if((shader) != SHADER_NONE) { profiling_helper.set_shader((shader) & SHADER_MASK); } +# define PROFILING_OBJECT(object) if((object) != PRIM_NONE) { profiling_helper.set_object(object); } +#else +# define PROFILING_INIT(kg, event) +# define PROFILING_EVENT(event) +# define PROFILING_SHADER(shader) +# define PROFILING_OBJECT(object) +#endif /* __KERNEL_CPU__ */ + +CCL_NAMESPACE_END + +#endif /* __KERNEL_PROFILING_H__ */ diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index af883aa715b..4b2e675bb21 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -54,6 +54,8 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, const Intersection *isect, const Ray *ray) { + PROFILING_INIT(kg, PROFILING_SHADER_SETUP); + #ifdef __INSTANCING__ sd->object = (isect->object == PRIM_NONE)? 
kernel_tex_fetch(__prim_object, isect->prim): isect->object; #endif @@ -147,6 +149,9 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, differential_incoming(&sd->dI, ray->dD); differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); #endif + + PROFILING_SHADER(sd->shader); + PROFILING_OBJECT(sd->object); } /* ShaderData setup from BSSRDF scatter */ @@ -163,6 +168,8 @@ void shader_setup_from_subsurface( const Intersection *isect, const Ray *ray) { + PROFILING_INIT(kg, PROFILING_SHADER_SETUP); + const bool backfacing = sd->flag & SD_BACKFACING; /* object, matrices, time, ray_length stay the same */ @@ -233,6 +240,8 @@ void shader_setup_from_subsurface( differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); /* don't modify dP and dI */ # endif + + PROFILING_SHADER(sd->shader); } #endif @@ -249,6 +258,8 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, bool object_space, int lamp) { + PROFILING_INIT(kg, PROFILING_SHADER_SETUP); + /* vectors */ sd->P = P; sd->N = Ng; @@ -353,6 +364,9 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, sd->du = differential_zero(); sd->dv = differential_zero(); #endif + + PROFILING_SHADER(sd->shader); + PROFILING_OBJECT(sd->object); } /* ShaderData setup for displacement */ @@ -380,6 +394,8 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd, ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray) { + PROFILING_INIT(kg, PROFILING_SHADER_SETUP); + /* vectors */ sd->P = ray->D; sd->N = -ray->D; @@ -414,6 +430,9 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat sd->du = differential_zero(); sd->dv = differential_zero(); #endif + + PROFILING_SHADER(sd->shader); + PROFILING_OBJECT(sd->object); } /* ShaderData setup from point inside volume */ @@ -421,6 +440,8 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat 
#ifdef __VOLUME__ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray) { + PROFILING_INIT(kg, PROFILING_SHADER_SETUP); + /* vectors */ sd->P = ray->P; sd->N = -ray->D; @@ -461,6 +482,9 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s /* for NDC coordinates */ sd->ray_P = ray->P; sd->ray_dP = ray->dP; + + PROFILING_SHADER(sd->shader); + PROFILING_OBJECT(sd->object); } #endif /* __VOLUME__ */ @@ -591,6 +615,8 @@ void shader_bsdf_eval(KernelGlobals *kg, float light_pdf, bool use_mis) { + PROFILING_INIT(kg, PROFILING_CLOSURE_EVAL); + bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass); #ifdef __BRANCHED_PATH__ @@ -720,6 +746,8 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, differential3 *domega_in, float *pdf) { + PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE); + const ShaderClosure *sc = shader_bsdf_pick(sd, &randu); if(sc == NULL) { *pdf = 0.0f; @@ -751,6 +779,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float randu, float randv, BsdfEval *bsdf_eval, float3 *omega_in, differential3 *domega_in, float *pdf) { + PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE); + int label; float3 eval; @@ -984,6 +1014,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, int path_flag) { + PROFILING_INIT(kg, PROFILING_SHADER_EVAL); + /* If path is being terminated, we are tracing a shadow ray or evaluating * emission, then we don't need to store closures. The emission and shadow * shader data also do not have a closure array to save GPU memory. 
*/ @@ -1084,6 +1116,8 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con ccl_device void shader_volume_phase_eval(KernelGlobals *kg, const ShaderData *sd, const float3 omega_in, BsdfEval *eval, float *pdf) { + PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_EVAL); + bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass); _shader_volume_phase_multi_eval(sd, omega_in, pdf, -1, eval, 0.0f, 0.0f); @@ -1093,6 +1127,8 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, const ShaderData *s float randu, float randv, BsdfEval *phase_eval, float3 *omega_in, differential3 *domega_in, float *pdf) { + PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE); + int sampled = 0; if(sd->num_closure > 1) { @@ -1151,6 +1187,8 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData * const ShaderClosure *sc, float randu, float randv, BsdfEval *phase_eval, float3 *omega_in, differential3 *domega_in, float *pdf) { + PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE); + int label; float3 eval; diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp index dc7a1043208..b34d16c438b 100644 --- a/intern/cycles/render/object.cpp +++ b/intern/cycles/render/object.cpp @@ -335,6 +335,11 @@ uint Object::visibility_for_tracing() const { return trace_visibility; } +int Object::get_device_index() const +{ + return index; +} + /* Object Manager */ ObjectManager::ObjectManager() @@ -348,10 +353,9 @@ ObjectManager::~ObjectManager() } void ObjectManager::device_update_object_transform(UpdateObjectTransformState *state, - Object *ob, - int object_index) + Object *ob) { - KernelObject& kobject = state->objects[object_index]; + KernelObject& kobject = state->objects[ob->index]; Transform *object_motion_pass = state->object_motion_pass; Mesh *mesh = ob->mesh; @@ -457,13 +461,13 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s tfm_post = 
tfm_post * itfm; } - int motion_pass_offset = object_index*OBJECT_MOTION_PASS_SIZE; + int motion_pass_offset = ob->index*OBJECT_MOTION_PASS_SIZE; object_motion_pass[motion_pass_offset + 0] = tfm_pre; object_motion_pass[motion_pass_offset + 1] = tfm_post; } else if(state->need_motion == Scene::MOTION_BLUR) { if(ob->use_motion()) { - kobject.motion_offset = state->motion_offset[object_index]; + kobject.motion_offset = state->motion_offset[ob->index]; /* Decompose transforms for interpolation. */ DecomposedTransform *decomp = state->object_motion + kobject.motion_offset; @@ -494,7 +498,7 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s if(ob->use_holdout) { flag |= SD_OBJECT_HOLDOUT_MASK; } - state->object_flag[object_index] = flag; + state->object_flag[ob->index] = flag; /* Have curves. */ if(mesh->num_curves()) { @@ -538,7 +542,7 @@ void ObjectManager::device_update_object_transform_task( for(int i = 0; i < num_objects; ++i) { const int object_index = start_index + i; Object *ob = state->scene->objects[object_index]; - device_update_object_transform(state, ob, object_index); + device_update_object_transform(state, ob); } } } @@ -593,10 +597,8 @@ void ObjectManager::device_update_transforms(DeviceScene *dscene, * need some tweaks to make mid-complex scenes optimal. */ if(scene->objects.size() < 64) { - int object_index = 0; foreach(Object *ob, scene->objects) { - device_update_object_transform(&state, ob, object_index); - object_index++; + device_update_object_transform(&state, ob); if(progress.get_cancel()) { return; } @@ -642,6 +644,12 @@ void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *sc if(scene->objects.size() == 0) return; + /* Assign object IDs. 
*/ + int index = 0; + foreach(Object *object, scene->objects) { + object->index = index++; + } + /* set object transform matrices, before applying static transforms */ progress.set_status("Updating Objects", "Copying Transformations to device"); device_update_transforms(dscene, scene, progress); @@ -686,26 +694,25 @@ void ObjectManager::device_update_flags(Device *, } } - int object_index = 0; foreach(Object *object, scene->objects) { if(object->mesh->has_volume) { - object_flag[object_index] |= SD_OBJECT_HAS_VOLUME; - object_flag[object_index] &= ~SD_OBJECT_HAS_VOLUME_ATTRIBUTES; + object_flag[object->index] |= SD_OBJECT_HAS_VOLUME; + object_flag[object->index] &= ~SD_OBJECT_HAS_VOLUME_ATTRIBUTES; foreach(Attribute& attr, object->mesh->attributes.attributes) { if(attr.element == ATTR_ELEMENT_VOXEL) { - object_flag[object_index] |= SD_OBJECT_HAS_VOLUME_ATTRIBUTES; + object_flag[object->index] |= SD_OBJECT_HAS_VOLUME_ATTRIBUTES; } } } else { - object_flag[object_index] &= ~(SD_OBJECT_HAS_VOLUME|SD_OBJECT_HAS_VOLUME_ATTRIBUTES); + object_flag[object->index] &= ~(SD_OBJECT_HAS_VOLUME|SD_OBJECT_HAS_VOLUME_ATTRIBUTES); } if(object->is_shadow_catcher) { - object_flag[object_index] |= SD_OBJECT_SHADOW_CATCHER; + object_flag[object->index] |= SD_OBJECT_SHADOW_CATCHER; } else { - object_flag[object_index] &= ~SD_OBJECT_SHADOW_CATCHER; + object_flag[object->index] &= ~SD_OBJECT_SHADOW_CATCHER; } if(bounds_valid) { @@ -714,7 +721,7 @@ void ObjectManager::device_update_flags(Device *, continue; } if(object->bounds.intersects(volume_object->bounds)) { - object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME; + object_flag[object->index] |= SD_OBJECT_INTERSECTS_VOLUME; break; } } @@ -723,9 +730,8 @@ void ObjectManager::device_update_flags(Device *, /* Not really valid, but can't make more reliable in the case * of bounds not being up to date. 
*/ - object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME; + object_flag[object->index] |= SD_OBJECT_INTERSECTS_VOLUME; } - ++object_index; } /* Copy object flag. */ @@ -741,7 +747,6 @@ void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Sc KernelObject *kobjects = dscene->objects.data(); bool update = false; - int object_index = 0; foreach(Object *object, scene->objects) { Mesh* mesh = object->mesh; @@ -750,18 +755,16 @@ void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Sc uint patch_map_offset = 2*(mesh->patch_table_offset + mesh->patch_table->total_size() - mesh->patch_table->num_nodes * PATCH_NODE_SIZE) - mesh->patch_offset; - if(kobjects[object_index].patch_map_offset != patch_map_offset) { - kobjects[object_index].patch_map_offset = patch_map_offset; + if(kobjects[object->index].patch_map_offset != patch_map_offset) { + kobjects[object->index].patch_map_offset = patch_map_offset; update = true; } } - if(kobjects[object_index].attribute_map_offset != mesh->attr_map_offset) { - kobjects[object_index].attribute_map_offset = mesh->attr_map_offset; + if(kobjects[object->index].attribute_map_offset != mesh->attr_map_offset) { + kobjects[object->index].attribute_map_offset = mesh->attr_map_offset; update = true; } - - object_index++; } if(update) { diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h index 87e6e6652ad..134f0bc3577 100644 --- a/intern/cycles/render/object.h +++ b/intern/cycles/render/object.h @@ -38,6 +38,7 @@ class Progress; class Scene; struct Transform; struct UpdateObjectTransformState; +class ObjectManager; /* Object */ @@ -88,6 +89,16 @@ public: * determined flags which denotes trace-time visibility. */ uint visibility_for_tracing() const; + + /* Returns the index that is used in the kernel for this object. */ + int get_device_index() const; + +protected: + /* Specifies the position of the object in scene->objects and + * in the device vectors. 
Gets set in device_update. */ + int index; + + friend class ObjectManager; }; /* Object Manager */ @@ -123,8 +134,7 @@ public: protected: void device_update_object_transform(UpdateObjectTransformState *state, - Object *ob, - const int object_index); + Object *ob); void device_update_object_transform_task(UpdateObjectTransformState *state); bool device_update_object_transform_pop_work( UpdateObjectTransformState *state, diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index d6ecafa19b7..a8374b662d0 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -250,7 +250,9 @@ void Session::run_gpu() if(!no_tiles) { /* update scene */ scoped_timer update_timer; - update_scene(); + if(update_scene()) { + stats.profiler.reset(scene->shaders.size(), scene->objects.size()); + } progress.add_skip_time(update_timer, params.background); if(!device->error_message().empty()) @@ -585,7 +587,9 @@ void Session::run_cpu() /* update scene */ scoped_timer update_timer; - update_scene(); + if(update_scene()) { + stats.profiler.reset(scene->shaders.size(), scene->objects.size()); + } progress.add_skip_time(update_timer, params.background); if(!device->error_message().empty()) @@ -729,6 +733,10 @@ void Session::run() /* load kernels */ load_kernels(); + if(params.use_profiling && (params.device.type == DEVICE_CPU)) { + stats.profiler.start(); + } + /* session thread loop */ progress.set_status("Waiting for render to start"); @@ -743,6 +751,8 @@ void Session::run() run_cpu(); } + stats.profiler.stop(); + /* progress update */ if(progress.get_cancel()) progress.set_status("Cancel", progress.get_cancel_message()); @@ -825,7 +835,7 @@ void Session::wait() session_thread = NULL; } -void Session::update_scene() +bool Session::update_scene() { thread_scoped_lock scene_lock(scene->mutex); @@ -876,7 +886,10 @@ void Session::update_scene() progress.set_status("Updating Scene"); MEM_GUARDED_CALL(&progress, scene->device_update, device, 
progress); + + return true; } + return false; } void Session::update_status_time(bool show_pause, bool show_done) @@ -1052,6 +1065,14 @@ void Session::device_free() */ } +void Session::collect_statistics(RenderStats *render_stats) +{ + scene->collect_statistics(render_stats); + if(params.use_profiling && (params.device.type == DEVICE_CPU)) { + render_stats->collect_profiling(scene, &stats); + } +} + int Session::get_max_closure_count() { int max_closures = 0; diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index 56a69919a7a..db07c758427 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -20,6 +20,7 @@ #include "render/buffers.h" #include "device/device.h" #include "render/shader.h" +#include "render/stats.h" #include "render/tile.h" #include "util/util_progress.h" @@ -55,6 +56,8 @@ public: int pixel_size; int threads; + bool use_profiling; + bool display_buffer_linear; bool use_denoising; @@ -89,6 +92,8 @@ public: pixel_size = 1; threads = 0; + use_profiling = false; + use_denoising = false; denoising_passes = false; denoising_radius = 8; @@ -118,6 +123,7 @@ public: && start_resolution == params.start_resolution && pixel_size == params.pixel_size && threads == params.threads + && use_profiling == params.use_profiling && display_buffer_linear == params.display_buffer_linear && cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout @@ -159,7 +165,7 @@ public: void set_samples(int samples); void set_pause(bool pause); - void update_scene(); + bool update_scene(); void load_kernels(bool lock_scene=true); void device_free(); @@ -168,6 +174,8 @@ public: * (for example, when rendering with unlimited samples). 
*/ float get_progress(); + void collect_statistics(RenderStats *stats); + protected: struct DelayedReset { thread_mutex mutex; diff --git a/intern/cycles/render/stats.cpp b/intern/cycles/render/stats.cpp index 101d33fcf65..71d9353ad3d 100644 --- a/intern/cycles/render/stats.cpp +++ b/intern/cycles/render/stats.cpp @@ -15,6 +15,7 @@ */ #include "render/stats.h" +#include "render/object.h" #include "util/util_algorithm.h" #include "util/util_foreach.h" #include "util/util_string.h" @@ -33,6 +34,16 @@ bool namedSizeEntryComparator(const NamedSizeEntry& a, const NamedSizeEntry& b) return a.size > b.size; } +bool namedTimeSampleEntryComparator(const NamedNestedSampleStats& a, const NamedNestedSampleStats& b) +{ + return a.sum_samples > b.sum_samples; +} + +bool namedSampleCountPairComparator(const NamedSampleCountPair& a, const NamedSampleCountPair& b) +{ + return a.samples > b.samples; +} + } // namespace NamedSizeEntry::NamedSizeEntry() @@ -77,6 +88,113 @@ string NamedSizeStats::full_report(int indent_level) return result; } +/* Named time sample statistics. 
*/ + +NamedNestedSampleStats::NamedNestedSampleStats() + : name(""), self_samples(0), sum_samples(0) +{} + +NamedNestedSampleStats::NamedNestedSampleStats(const string& name, uint64_t samples) + : name(name), self_samples(samples), sum_samples(samples) +{} + +NamedNestedSampleStats& NamedNestedSampleStats::add_entry(const string& name_, uint64_t samples_) +{ + entries.push_back(NamedNestedSampleStats(name_, samples_)); + return entries[entries.size()-1]; +} + +void NamedNestedSampleStats::update_sum() +{ + sum_samples = self_samples; + foreach(NamedNestedSampleStats& entry, entries) { + entry.update_sum(); + sum_samples += entry.sum_samples; + } +} + +string NamedNestedSampleStats::full_report(int indent_level, uint64_t total_samples) +{ + update_sum(); + + if(total_samples == 0) { + total_samples = sum_samples; + } + + const string indent(indent_level * kIndentNumSpaces, ' '); + + const double sum_percent = 100*((double) sum_samples) / total_samples; + const double sum_seconds = sum_samples * 0.001; + const double self_percent = 100*((double) self_samples) / total_samples; + const double self_seconds = self_samples * 0.001; + string info = string_printf("%-32s: Total %3.2f%% (%.2fs), Self %3.2f%% (%.2fs)\n", + name.c_str(), + sum_percent, + sum_seconds, + self_percent, + self_seconds); + string result = indent + info; + + sort(entries.begin(), entries.end(), namedTimeSampleEntryComparator); + foreach(NamedNestedSampleStats& entry, entries) { + result += entry.full_report(indent_level + 1, total_samples); + } + return result; +} + +/* Named sample count pairs. 
*/ + +NamedSampleCountPair::NamedSampleCountPair(const ustring& name, uint64_t samples, uint64_t hits) + : name(name), samples(samples), hits(hits) +{} + +NamedSampleCountStats::NamedSampleCountStats() +{} + +void NamedSampleCountStats::add(const ustring& name, uint64_t samples, uint64_t hits) +{ + entry_map::iterator entry = entries.find(name); + if(entry != entries.end()) { + entry->second.samples += samples; + entry->second.hits += hits; + return; + } + entries.emplace(name, NamedSampleCountPair(name, samples, hits)); +} + +string NamedSampleCountStats::full_report(int indent_level) +{ + const string indent(indent_level * kIndentNumSpaces, ' '); + + vector<NamedSampleCountPair> sorted_entries; + sorted_entries.reserve(entries.size()); + + uint64_t total_hits = 0, total_samples = 0; + foreach(entry_map::const_reference entry, entries) { + const NamedSampleCountPair &pair = entry.second; + + total_hits += pair.hits; + total_samples += pair.samples; + + sorted_entries.push_back(pair); + } + const double avg_samples_per_hit = ((double) total_samples) / total_hits; + + sort(sorted_entries.begin(), sorted_entries.end(), namedSampleCountPairComparator); + + string result = ""; + foreach(const NamedSampleCountPair& entry, sorted_entries) { + const double seconds = entry.samples * 0.001; + const double relative = ((double) entry.samples) / (entry.hits * avg_samples_per_hit); + + result += indent + string_printf("%-32s: %.2fs (Relative cost: %.2f)\n", + entry.name.c_str(), + seconds, + relative); + } + return result; +} + /* Mesh statistics. */ MeshStats::MeshStats() { @@ -106,6 +224,74 @@ string ImageStats::full_report(int indent_level) /* Overall statistics. 
*/ RenderStats::RenderStats() { + has_profiling = false; +} + +void RenderStats::collect_profiling(Scene *scene, Stats *stats) +{ + has_profiling = true; + + Profiler &prof = stats->profiler; + + kernel = NamedNestedSampleStats("Total render time", prof.get_event(PROFILING_UNKNOWN)); + + kernel.add_entry("Ray setup", prof.get_event(PROFILING_RAY_SETUP)); + kernel.add_entry("Result writing", prof.get_event(PROFILING_WRITE_RESULT)); + + NamedNestedSampleStats &integrator = kernel.add_entry("Path integration", prof.get_event(PROFILING_PATH_INTEGRATE)); + integrator.add_entry("Scene intersection", prof.get_event(PROFILING_SCENE_INTERSECT)); + integrator.add_entry("Indirect emission", prof.get_event(PROFILING_INDIRECT_EMISSION)); + integrator.add_entry("Volumes", prof.get_event(PROFILING_VOLUME)); + + NamedNestedSampleStats &shading = integrator.add_entry("Shading", 0); + shading.add_entry("Shader Setup", prof.get_event(PROFILING_SHADER_SETUP)); + shading.add_entry("Shader Eval", prof.get_event(PROFILING_SHADER_EVAL)); + shading.add_entry("Shader Apply", prof.get_event(PROFILING_SHADER_APPLY)); + shading.add_entry("Ambient Occlusion", prof.get_event(PROFILING_AO)); + shading.add_entry("Subsurface", prof.get_event(PROFILING_SUBSURFACE)); + + integrator.add_entry("Connect Light", prof.get_event(PROFILING_CONNECT_LIGHT)); + integrator.add_entry("Surface Bounce", prof.get_event(PROFILING_SURFACE_BOUNCE)); + + NamedNestedSampleStats &intersection = kernel.add_entry("Intersection", 0); + intersection.add_entry("Full Intersection", prof.get_event(PROFILING_INTERSECT)); + intersection.add_entry("Local Intersection", prof.get_event(PROFILING_INTERSECT_LOCAL)); + intersection.add_entry("Shadow All Intersection", prof.get_event(PROFILING_INTERSECT_SHADOW_ALL)); + intersection.add_entry("Volume Intersection", prof.get_event(PROFILING_INTERSECT_VOLUME)); + intersection.add_entry("Volume All Intersection", prof.get_event(PROFILING_INTERSECT_VOLUME_ALL)); + + NamedNestedSampleStats 
&closure = kernel.add_entry("Closures", 0); + closure.add_entry("Surface Closure Evaluation", prof.get_event(PROFILING_CLOSURE_EVAL)); + closure.add_entry("Surface Closure Sampling", prof.get_event(PROFILING_CLOSURE_SAMPLE)); + closure.add_entry("Volume Closure Evaluation", prof.get_event(PROFILING_CLOSURE_VOLUME_EVAL)); + closure.add_entry("Volume Closure Sampling", prof.get_event(PROFILING_CLOSURE_VOLUME_SAMPLE)); + + NamedNestedSampleStats &denoising = kernel.add_entry("Denoising", prof.get_event(PROFILING_DENOISING)); + denoising.add_entry("Construct Transform", prof.get_event(PROFILING_DENOISING_CONSTRUCT_TRANSFORM)); + denoising.add_entry("Reconstruct", prof.get_event(PROFILING_DENOISING_RECONSTRUCT)); + + NamedNestedSampleStats &prefilter = denoising.add_entry("Prefiltering", 0); + prefilter.add_entry("Divide Shadow", prof.get_event(PROFILING_DENOISING_DIVIDE_SHADOW)); + prefilter.add_entry("Non-Local means", prof.get_event(PROFILING_DENOISING_NON_LOCAL_MEANS)); + prefilter.add_entry("Get Feature", prof.get_event(PROFILING_DENOISING_GET_FEATURE)); + prefilter.add_entry("Detect Outliers", prof.get_event(PROFILING_DENOISING_DETECT_OUTLIERS)); + prefilter.add_entry("Combine Halves", prof.get_event(PROFILING_DENOISING_COMBINE_HALVES)); + + shaders.entries.clear(); + foreach(Shader *shader, scene->shaders) { + uint64_t samples, hits; + if(prof.get_shader(shader->id, samples, hits)) { + shaders.add(shader->name, samples, hits); + } + } + + objects.entries.clear(); + foreach(Object *object, scene->objects) { + uint64_t samples, hits; + if(prof.get_object(object->get_device_index(), samples, hits)) { + objects.add(object->name, samples, hits); + } + } } string RenderStats::full_report() @@ -113,6 +299,14 @@ string RenderStats::full_report() string result = ""; result += "Mesh statistics:\n" + mesh.full_report(1); result += "Image statistics:\n" + image.full_report(1); + if(has_profiling) { + result += "Kernel statistics:\n" + kernel.full_report(1); + result += 
"Shader statistics:\n" + shaders.full_report(1); + result += "Object statistics:\n" + objects.full_report(1); + } + else { + result += "Profiling information not available (only works with CPU rendering)"; + } return result; } diff --git a/intern/cycles/render/stats.h b/intern/cycles/render/stats.h index 2ff0ec3e0e9..ce0529bb8bd 100644 --- a/intern/cycles/render/stats.h +++ b/intern/cycles/render/stats.h @@ -17,6 +17,9 @@ #ifndef __RENDER_STATS_H__ #define __RENDER_STATS_H__ +#include "render/scene.h" + +#include "util/util_stats.h" #include "util/util_string.h" #include "util/util_vector.h" @@ -61,6 +64,51 @@ public: vector<NamedSizeEntry> entries; }; +class NamedNestedSampleStats { +public: + NamedNestedSampleStats(); + NamedNestedSampleStats(const string& name, uint64_t samples); + + NamedNestedSampleStats& add_entry(const string& name, uint64_t samples); + + /* Updates sum_samples recursively. */ + void update_sum(); + + string full_report(int indent_level = 0, uint64_t total_samples = 0); + + string name; + + /* self_samples contains only the samples that this specific event got, + * while sum_samples also includes the samples of all sub-entries. */ + uint64_t self_samples, sum_samples; + + vector<NamedNestedSampleStats> entries; +}; + +/* Named entry containing both a time-sample count for objects of a type and a + * total count of processed items. + * This allows to estimate the time spent per item. */ +class NamedSampleCountPair { +public: + NamedSampleCountPair(const ustring& name, uint64_t samples, uint64_t hits); + + ustring name; + uint64_t samples; + uint64_t hits; +}; + +/* Contains statistics about pairs of samples and counts as described above. 
*/ +class NamedSampleCountStats { +public: + NamedSampleCountStats(); + + string full_report(int indent_level = 0); + void add(const ustring& name, uint64_t samples, uint64_t hits); + + typedef unordered_map<ustring, NamedSampleCountPair, ustringHash> entry_map; + entry_map entries; +}; + /* Statistics about mesh in the render database. */ class MeshStats { public: @@ -95,8 +143,16 @@ public: /* Return full report as string. */ string full_report(); + /* Collect kernel sampling information from Stats. */ + void collect_profiling(Scene *scene, Stats *stats); + + bool has_profiling; + MeshStats mesh; ImageStats image; + NamedNestedSampleStats kernel; + NamedSampleCountStats shaders; + NamedSampleCountStats objects; }; CCL_NAMESPACE_END diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 77d47984ee7..92dfc9fa85d 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -17,6 +17,7 @@ set(SRC util_md5.cpp util_murmurhash.cpp util_path.cpp + util_profiling.cpp util_string.cpp util_simd.cpp util_system.cpp @@ -71,6 +72,7 @@ set(SRC_HEADERS util_optimization.h util_param.h util_path.h + util_profiling.h util_progress.h util_projection.h util_queue.h diff --git a/intern/cycles/util/util_profiling.cpp b/intern/cycles/util/util_profiling.cpp new file mode 100644 index 00000000000..30aaef69310 --- /dev/null +++ b/intern/cycles/util/util_profiling.cpp @@ -0,0 +1,178 @@ +/* + * Copyright 2011-2018 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "util/util_algorithm.h" +#include "util/util_profiling.h" +#include "util/util_set.h" + +CCL_NAMESPACE_BEGIN + +Profiler::Profiler() + : do_stop_worker(true), worker(NULL) +{ +} + +Profiler::~Profiler() +{ + assert(worker == NULL); +} + +void Profiler::run() +{ + uint64_t updates = 0; + auto start_time = std::chrono::system_clock::now(); + while(!do_stop_worker) { + thread_scoped_lock lock(mutex); + foreach(ProfilingState *state, states) { + uint32_t cur_event = state->event; + int32_t cur_shader = state->shader; + int32_t cur_object = state->object; + + /* The state reads/writes should be atomic, but just to be sure + * check the values for validity anyways. */ + if(cur_event < PROFILING_NUM_EVENTS) { + event_samples[cur_event]++; + } + + if(cur_shader >= 0 && cur_shader < shader_samples.size()) { + /* Only consider the active shader during events whose runtime significantly depends on it. */ + if(((cur_event >= PROFILING_SHADER_EVAL ) && (cur_event <= PROFILING_SUBSURFACE)) || + ((cur_event >= PROFILING_CLOSURE_EVAL) && (cur_event <= PROFILING_CLOSURE_VOLUME_SAMPLE))) { + shader_samples[cur_shader]++; + } + } + + if(cur_object >= 0 && cur_object < object_samples.size()) { + object_samples[cur_object]++; + } + } + lock.unlock(); + + /* Relative waits always overshoot a bit, so just waiting 1ms every + * time would cause the sampling to drift over time. + * By keeping track of the absolute time, the wait times correct themselves - + * if one wait overshoots a lot, the next one will be shorter to compensate. */ + updates++; + std::this_thread::sleep_until(start_time + updates*std::chrono::milliseconds(1)); + } +} + +void Profiler::reset(int num_shaders, int num_objects) +{ + bool running = (worker != NULL); + if(running) { + stop(); + } + + /* Resize and clear the accumulation vectors. 
 */ + shader_hits.assign(num_shaders, 0); + object_hits.assign(num_objects, 0); + + event_samples.assign(PROFILING_NUM_EVENTS, 0); + shader_samples.assign(num_shaders, 0); + object_samples.assign(num_objects, 0); + + if(running) { + start(); + } +} + +void Profiler::start() +{ + assert(worker == NULL); + do_stop_worker = false; + worker = new thread(function_bind(&Profiler::run, this)); +} + +void Profiler::stop() +{ + if(worker != NULL) { + do_stop_worker = true; + + worker->join(); + delete worker; + worker = NULL; + } +} + +void Profiler::add_state(ProfilingState *state) +{ + thread_scoped_lock lock(mutex); + + /* Add the ProfilingState to the list of sampled states. */ + assert(std::find(states.begin(), states.end(), state) == states.end()); + states.push_back(state); + + /* Resize thread-local hit counters. */ + state->shader_hits.assign(shader_hits.size(), 0); + state->object_hits.assign(object_hits.size(), 0); + + /* Initialize the state. */ + state->event = PROFILING_UNKNOWN; + state->shader = -1; + state->object = -1; + state->active = true; +} + +void Profiler::remove_state(ProfilingState *state) +{ + thread_scoped_lock lock(mutex); + + /* Remove the ProfilingState from the list of sampled states. */ + states.erase(std::remove(states.begin(), states.end(), state), states.end()); + state->active = false; + + /* Merge thread-local hit counters. 
*/ + assert(shader_hits.size() == state->shader_hits.size()); + for(int i = 0; i < shader_hits.size(); i++) { + shader_hits[i] += state->shader_hits[i]; + } + + assert(object_hits.size() == state->object_hits.size()); + for(int i = 0; i < object_hits.size(); i++) { + object_hits[i] += state->object_hits[i]; + } +} + +uint64_t Profiler::get_event(ProfilingEvent event) +{ + assert(worker == NULL); + return event_samples[event]; +} + +bool Profiler::get_shader(int shader, uint64_t &samples, uint64_t &hits) +{ + assert(worker == NULL); + if(shader_samples[shader] == 0) { + return false; + } + samples = shader_samples[shader]; + hits = shader_hits[shader]; + return true; +} + +bool Profiler::get_object(int object, uint64_t &samples, uint64_t &hits) +{ + assert(worker == NULL); + if(object_samples[object] == 0) { + return false; + } + samples = object_samples[object]; + hits = object_hits[object]; + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_profiling.h b/intern/cycles/util/util_profiling.h new file mode 100644 index 00000000000..4460402b12d --- /dev/null +++ b/intern/cycles/util/util_profiling.h @@ -0,0 +1,175 @@ +/* + * Copyright 2011-2018 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __UTIL_PROFILING_H__ +#define __UTIL_PROFILING_H__ + +#include <atomic> + +#include "util/util_foreach.h" +#include "util/util_map.h" +#include "util/util_thread.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +enum ccl_try_align(16) ProfilingEvent : uint32_t { + PROFILING_UNKNOWN, + PROFILING_RAY_SETUP, + PROFILING_PATH_INTEGRATE, + PROFILING_SCENE_INTERSECT, + PROFILING_INDIRECT_EMISSION, + PROFILING_VOLUME, + PROFILING_SHADER_SETUP, + PROFILING_SHADER_EVAL, + PROFILING_SHADER_APPLY, + PROFILING_AO, + PROFILING_SUBSURFACE, + PROFILING_CONNECT_LIGHT, + PROFILING_SURFACE_BOUNCE, + PROFILING_WRITE_RESULT, + + PROFILING_INTERSECT, + PROFILING_INTERSECT_LOCAL, + PROFILING_INTERSECT_SHADOW_ALL, + PROFILING_INTERSECT_VOLUME, + PROFILING_INTERSECT_VOLUME_ALL, + + PROFILING_CLOSURE_EVAL, + PROFILING_CLOSURE_SAMPLE, + PROFILING_CLOSURE_VOLUME_EVAL, + PROFILING_CLOSURE_VOLUME_SAMPLE, + + PROFILING_DENOISING, + PROFILING_DENOISING_CONSTRUCT_TRANSFORM, + PROFILING_DENOISING_RECONSTRUCT, + PROFILING_DENOISING_DIVIDE_SHADOW, + PROFILING_DENOISING_NON_LOCAL_MEANS, + PROFILING_DENOISING_COMBINE_HALVES, + PROFILING_DENOISING_GET_FEATURE, + PROFILING_DENOISING_DETECT_OUTLIERS, + + PROFILING_NUM_EVENTS, +}; + +/* Contains the current execution state of a worker thread. + * These values are constantly updated by the worker. + * Periodically the profiler thread will wake up, read them + * and update its internal counters based on it. + * + * Atomics aren't needed here since we're only doing direct + * writes and reads to (4-byte-aligned) uint32_t, which is + * guaranteed to be atomic on x86 since the 486. + * Memory ordering is not guaranteed but does not matter. + * + * And even on other architectures, the extremely rare corner + * case of reading an intermediate state could at worst result + * in a single incorrect sample. 
*/ +struct ProfilingState { + volatile uint32_t event = PROFILING_UNKNOWN; + volatile int32_t shader = -1; + volatile int32_t object = -1; + volatile bool active = false; + + vector<uint64_t> shader_hits; + vector<uint64_t> object_hits; +}; + +class Profiler { +public: + Profiler(); + ~Profiler(); + + void reset(int num_shaders, int num_objects); + + void start(); + void stop(); + + void add_state(ProfilingState *state); + void remove_state(ProfilingState *state); + + uint64_t get_event(ProfilingEvent event); + bool get_shader(int shader, uint64_t &samples, uint64_t &hits); + bool get_object(int object, uint64_t &samples, uint64_t &hits); + +protected: + void run(); + + /* Tracks how often the worker was in each ProfilingEvent while sampling, + * so multiplying the values by the sample frequency (currently 1ms) + * gives the approximate time spent in each state. */ + vector<uint64_t> event_samples; + vector<uint64_t> shader_samples; + vector<uint64_t> object_samples; + + /* Tracks the total amounts every object/shader was hit. + * Used to evaluate relative cost, written by the render thread. + * Indexed by the shader and object IDs that the kernel also uses + * to index __object_flag and __shaders. 
*/ + vector<uint64_t> shader_hits; + vector<uint64_t> object_hits; + + volatile bool do_stop_worker; + thread *worker; + + thread_mutex mutex; + vector<ProfilingState*> states; +}; + +class ProfilingHelper { +public: + ProfilingHelper(ProfilingState *state, ProfilingEvent event) + : state(state) + { + previous_event = state->event; + state->event = event; + } + + inline void set_event(ProfilingEvent event) + { + state->event = event; + } + + inline void set_shader(int shader) + { + state->shader = shader; + if(state->active) { + assert(shader < state->shader_hits.size()); + state->shader_hits[shader]++; + } + } + + inline void set_object(int object) + { + state->object = object; + if(state->active) { + assert(object < state->object_hits.size()); + state->object_hits[object]++; + } + } + + ~ProfilingHelper() + { + state->event = previous_event; + } +private: + ProfilingState *state; + uint32_t previous_event; +}; + +CCL_NAMESPACE_END + +#endif /* __UTIL_PROFILING_H__ */ diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h index 0ba58422a67..1128bc3ced1 100644 --- a/intern/cycles/util/util_stats.h +++ b/intern/cycles/util/util_stats.h @@ -18,6 +18,7 @@ #define __UTIL_STATS_H__ #include "util/util_atomic.h" +#include "util/util_profiling.h" CCL_NAMESPACE_BEGIN @@ -40,6 +41,8 @@ public: size_t mem_used; size_t mem_peak; + + Profiler profiler; }; CCL_NAMESPACE_END |