diff options
Diffstat (limited to 'intern/cycles/device')
23 files changed, 11227 insertions, 11040 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index d804a07bcab..75f4a72bee3 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -1,61 +1,61 @@ set(INC - .. - ../../glew-mx + .. + ../../glew-mx ) set(INC_SYS - ${GLEW_INCLUDE_DIR} - ../../../extern/clew/include + ${GLEW_INCLUDE_DIR} + ../../../extern/clew/include ) if(WITH_CUDA_DYNLOAD) - list(APPEND INC - ../../../extern/cuew/include - ) - add_definitions(-DWITH_CUDA_DYNLOAD) + list(APPEND INC + ../../../extern/cuew/include + ) + add_definitions(-DWITH_CUDA_DYNLOAD) else() - list(APPEND INC_SYS - ${CUDA_TOOLKIT_INCLUDE} - ) - add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}") + list(APPEND INC_SYS + ${CUDA_TOOLKIT_INCLUDE} + ) + add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}") endif() set(SRC - device.cpp - device_cpu.cpp - device_cuda.cpp - device_denoising.cpp - device_memory.cpp - device_multi.cpp - device_opencl.cpp - device_split_kernel.cpp - device_task.cpp + device.cpp + device_cpu.cpp + device_cuda.cpp + device_denoising.cpp + device_memory.cpp + device_multi.cpp + device_opencl.cpp + device_split_kernel.cpp + device_task.cpp ) set(SRC_OPENCL - opencl/opencl.h - opencl/memory_manager.h + opencl/opencl.h + opencl/memory_manager.h - opencl/opencl_split.cpp - opencl/opencl_util.cpp - opencl/memory_manager.cpp + opencl/opencl_split.cpp + opencl/opencl_util.cpp + opencl/memory_manager.cpp ) if(WITH_CYCLES_NETWORK) - list(APPEND SRC - device_network.cpp - ) + list(APPEND SRC + device_network.cpp + ) endif() set(SRC_HEADERS - device.h - device_denoising.h - device_memory.h - device_intern.h - device_network.h - device_split_kernel.h - device_task.h + device.h + device_denoising.h + device_memory.h + device_intern.h + device_network.h + device_split_kernel.h + device_task.h ) set(LIB @@ -63,27 +63,27 @@ set(LIB ) if(WITH_CUDA_DYNLOAD) - list(APPEND LIB - extern_cuew - ) + list(APPEND LIB 
+ extern_cuew + ) else() - list(APPEND LIB - ${CUDA_CUDA_LIBRARY} - ) + list(APPEND LIB + ${CUDA_CUDA_LIBRARY} + ) endif() add_definitions(${GL_DEFINITIONS}) if(WITH_CYCLES_NETWORK) - add_definitions(-DWITH_NETWORK) + add_definitions(-DWITH_NETWORK) endif() if(WITH_CYCLES_DEVICE_OPENCL) - add_definitions(-DWITH_OPENCL) + add_definitions(-DWITH_OPENCL) endif() if(WITH_CYCLES_DEVICE_CUDA) - add_definitions(-DWITH_CUDA) + add_definitions(-DWITH_CUDA) endif() if(WITH_CYCLES_DEVICE_MULTI) - add_definitions(-DWITH_MULTI) + add_definitions(-DWITH_MULTI) endif() include_directories(${INC}) diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index e74637472ef..16a68e8b855 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -44,572 +44,577 @@ uint Device::devices_initialized_mask = 0; /* Device Requested Features */ -std::ostream& operator <<(std::ostream &os, - const DeviceRequestedFeatures& requested_features) +std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features) { - os << "Experimental features: " - << (requested_features.experimental ? "On" : "Off") << std::endl; - os << "Max nodes group: " << requested_features.max_nodes_group << std::endl; - /* TODO(sergey): Decode bitflag into list of names. 
*/ - os << "Nodes features: " << requested_features.nodes_features << std::endl; - os << "Use Hair: " - << string_from_bool(requested_features.use_hair) << std::endl; - os << "Use Object Motion: " - << string_from_bool(requested_features.use_object_motion) << std::endl; - os << "Use Camera Motion: " - << string_from_bool(requested_features.use_camera_motion) << std::endl; - os << "Use Baking: " - << string_from_bool(requested_features.use_baking) << std::endl; - os << "Use Subsurface: " - << string_from_bool(requested_features.use_subsurface) << std::endl; - os << "Use Volume: " - << string_from_bool(requested_features.use_volume) << std::endl; - os << "Use Branched Integrator: " - << string_from_bool(requested_features.use_integrator_branched) << std::endl; - os << "Use Patch Evaluation: " - << string_from_bool(requested_features.use_patch_evaluation) << std::endl; - os << "Use Transparent Shadows: " - << string_from_bool(requested_features.use_transparent) << std::endl; - os << "Use Principled BSDF: " - << string_from_bool(requested_features.use_principled) << std::endl; - os << "Use Denoising: " - << string_from_bool(requested_features.use_denoising) << std::endl; - os << "Use Displacement: " - << string_from_bool(requested_features.use_true_displacement) << std::endl; - os << "Use Background Light: " - << string_from_bool(requested_features.use_background_light) << std::endl; - return os; + os << "Experimental features: " << (requested_features.experimental ? "On" : "Off") << std::endl; + os << "Max nodes group: " << requested_features.max_nodes_group << std::endl; + /* TODO(sergey): Decode bitflag into list of names. 
*/ + os << "Nodes features: " << requested_features.nodes_features << std::endl; + os << "Use Hair: " << string_from_bool(requested_features.use_hair) << std::endl; + os << "Use Object Motion: " << string_from_bool(requested_features.use_object_motion) + << std::endl; + os << "Use Camera Motion: " << string_from_bool(requested_features.use_camera_motion) + << std::endl; + os << "Use Baking: " << string_from_bool(requested_features.use_baking) << std::endl; + os << "Use Subsurface: " << string_from_bool(requested_features.use_subsurface) << std::endl; + os << "Use Volume: " << string_from_bool(requested_features.use_volume) << std::endl; + os << "Use Branched Integrator: " << string_from_bool(requested_features.use_integrator_branched) + << std::endl; + os << "Use Patch Evaluation: " << string_from_bool(requested_features.use_patch_evaluation) + << std::endl; + os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent) + << std::endl; + os << "Use Principled BSDF: " << string_from_bool(requested_features.use_principled) + << std::endl; + os << "Use Denoising: " << string_from_bool(requested_features.use_denoising) << std::endl; + os << "Use Displacement: " << string_from_bool(requested_features.use_true_displacement) + << std::endl; + os << "Use Background Light: " << string_from_bool(requested_features.use_background_light) + << std::endl; + return os; } /* Device */ Device::~Device() { - if(!background) { - if(vertex_buffer != 0) { - glDeleteBuffers(1, &vertex_buffer); - } - if(fallback_shader_program != 0) { - glDeleteProgram(fallback_shader_program); - } - } + if (!background) { + if (vertex_buffer != 0) { + glDeleteBuffers(1, &vertex_buffer); + } + if (fallback_shader_program != 0) { + glDeleteProgram(fallback_shader_program); + } + } } /* TODO move shaders to standalone .glsl file. 
*/ const char *FALLBACK_VERTEX_SHADER = -"#version 330\n" -"uniform vec2 fullscreen;\n" -"in vec2 texCoord;\n" -"in vec2 pos;\n" -"out vec2 texCoord_interp;\n" -"\n" -"vec2 normalize_coordinates()\n" -"{\n" -" return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n" -"}\n" -"\n" -"void main()\n" -"{\n" -" gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n" -" texCoord_interp = texCoord;\n" -"}\n\0"; + "#version 330\n" + "uniform vec2 fullscreen;\n" + "in vec2 texCoord;\n" + "in vec2 pos;\n" + "out vec2 texCoord_interp;\n" + "\n" + "vec2 normalize_coordinates()\n" + "{\n" + " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n" + "}\n" + "\n" + "void main()\n" + "{\n" + " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n" + " texCoord_interp = texCoord;\n" + "}\n\0"; const char *FALLBACK_FRAGMENT_SHADER = -"#version 330\n" -"uniform sampler2D image_texture;\n" -"in vec2 texCoord_interp;\n" -"out vec4 fragColor;\n" -"\n" -"void main()\n" -"{\n" -" fragColor = texture(image_texture, texCoord_interp);\n" -"}\n\0"; + "#version 330\n" + "uniform sampler2D image_texture;\n" + "in vec2 texCoord_interp;\n" + "out vec4 fragColor;\n" + "\n" + "void main()\n" + "{\n" + " fragColor = texture(image_texture, texCoord_interp);\n" + "}\n\0"; static void shader_print_errors(const char *task, const char *log, const char *code) { - LOG(ERROR) << "Shader: " << task << " error:"; - LOG(ERROR) << "===== shader string ===="; - - stringstream stream(code); - string partial; - - int line = 1; - while(getline(stream, partial, '\n')) { - if(line < 10) { - LOG(ERROR) << " " << line << " " << partial; - } - else { - LOG(ERROR) << line << " " << partial; - } - line++; - } - LOG(ERROR) << log; + LOG(ERROR) << "Shader: " << task << " error:"; + LOG(ERROR) << "===== shader string ===="; + + stringstream stream(code); + string partial; + + int line = 1; + while (getline(stream, partial, '\n')) { + if (line < 10) { + LOG(ERROR) << " " << line << " " << partial; + } + else { + 
LOG(ERROR) << line << " " << partial; + } + line++; + } + LOG(ERROR) << log; } static int bind_fallback_shader(void) { - GLint status; - GLchar log[5000]; - GLsizei length = 0; - GLuint program = 0; + GLint status; + GLchar log[5000]; + GLsizei length = 0; + GLuint program = 0; - struct Shader { - const char *source; - GLenum type; - } shaders[2] = { - {FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER}, - {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER} - }; + struct Shader { + const char *source; + GLenum type; + } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER}, + {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}}; - program = glCreateProgram(); + program = glCreateProgram(); - for(int i = 0; i < 2; i++) { - GLuint shader = glCreateShader(shaders[i].type); + for (int i = 0; i < 2; i++) { + GLuint shader = glCreateShader(shaders[i].type); - string source_str = shaders[i].source; - const char *c_str = source_str.c_str(); + string source_str = shaders[i].source; + const char *c_str = source_str.c_str(); - glShaderSource(shader, 1, &c_str, NULL); - glCompileShader(shader); + glShaderSource(shader, 1, &c_str, NULL); + glCompileShader(shader); - glGetShaderiv(shader, GL_COMPILE_STATUS, &status); + glGetShaderiv(shader, GL_COMPILE_STATUS, &status); - if(!status) { - glGetShaderInfoLog(shader, sizeof(log), &length, log); - shader_print_errors("compile", log, c_str); - return 0; - } + if (!status) { + glGetShaderInfoLog(shader, sizeof(log), &length, log); + shader_print_errors("compile", log, c_str); + return 0; + } - glAttachShader(program, shader); - } + glAttachShader(program, shader); + } - /* Link output. */ - glBindFragDataLocation(program, 0, "fragColor"); + /* Link output. */ + glBindFragDataLocation(program, 0, "fragColor"); - /* Link and error check. */ - glLinkProgram(program); + /* Link and error check. 
*/ + glLinkProgram(program); - glGetProgramiv(program, GL_LINK_STATUS, &status); - if(!status) { - glGetShaderInfoLog(program, sizeof(log), &length, log); - shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER); - shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER); - return 0; - } + glGetProgramiv(program, GL_LINK_STATUS, &status); + if (!status) { + glGetShaderInfoLog(program, sizeof(log), &length, log); + shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER); + shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER); + return 0; + } - return program; + return program; } bool Device::bind_fallback_display_space_shader(const float width, const float height) { - if(fallback_status == FALLBACK_SHADER_STATUS_ERROR) { - return false; - } - - if(fallback_status == FALLBACK_SHADER_STATUS_NONE) { - fallback_shader_program = bind_fallback_shader(); - fallback_status = FALLBACK_SHADER_STATUS_ERROR; - - if(fallback_shader_program == 0) { - return false; - } - - glUseProgram(fallback_shader_program); - image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture"); - if(image_texture_location < 0) { - LOG(ERROR) << "Shader doesn't containt the 'image_texture' uniform."; - return false; - } - - fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen"); - if(fullscreen_location < 0) { - LOG(ERROR) << "Shader doesn't containt the 'fullscreen' uniform."; - return false; - } - - fallback_status = FALLBACK_SHADER_STATUS_SUCCESS; - } - - /* Run this every time. 
*/ - glUseProgram(fallback_shader_program); - glUniform1i(image_texture_location, 0); - glUniform2f(fullscreen_location, width, height); - return true; + if (fallback_status == FALLBACK_SHADER_STATUS_ERROR) { + return false; + } + + if (fallback_status == FALLBACK_SHADER_STATUS_NONE) { + fallback_shader_program = bind_fallback_shader(); + fallback_status = FALLBACK_SHADER_STATUS_ERROR; + + if (fallback_shader_program == 0) { + return false; + } + + glUseProgram(fallback_shader_program); + image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture"); + if (image_texture_location < 0) { + LOG(ERROR) << "Shader doesn't containt the 'image_texture' uniform."; + return false; + } + + fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen"); + if (fullscreen_location < 0) { + LOG(ERROR) << "Shader doesn't containt the 'fullscreen' uniform."; + return false; + } + + fallback_status = FALLBACK_SHADER_STATUS_SUCCESS; + } + + /* Run this every time. 
*/ + glUseProgram(fallback_shader_program); + glUniform1i(image_texture_location, 0); + glUniform2f(fullscreen_location, width, height); + return true; } -void Device::draw_pixels( - device_memory& rgba, int y, - int w, int h, int width, int height, - int dx, int dy, int dw, int dh, - bool transparent, const DeviceDrawParams &draw_params) +void Device::draw_pixels(device_memory &rgba, + int y, + int w, + int h, + int width, + int height, + int dx, + int dy, + int dw, + int dh, + bool transparent, + const DeviceDrawParams &draw_params) { - const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); - - assert(rgba.type == MEM_PIXELS); - mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1)); - - GLuint texid; - glActiveTexture(GL_TEXTURE0); - glGenTextures(1, &texid); - glBindTexture(GL_TEXTURE_2D, texid); - - if(rgba.data_type == TYPE_HALF) { - GLhalf *data_pointer = (GLhalf*)rgba.host_pointer; - data_pointer += 4 * y * w; - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer); - } - else { - uint8_t *data_pointer = (uint8_t*)rgba.host_pointer; - data_pointer += 4 * y * w; - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer); - } - - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - - if(transparent) { - glEnable(GL_BLEND); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } - - GLint shader_program; - if(use_fallback_shader) { - if(!bind_fallback_display_space_shader(dw, dh)) { - return; - } - shader_program = fallback_shader_program; - } - else { - draw_params.bind_display_space_shader_cb(); - glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); - } - - if(!vertex_buffer) { - glGenBuffers(1, &vertex_buffer); - } - - glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); - /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */ - 
glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); - - float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - - if(vpointer) { - /* texture coordinate - vertex pair */ - vpointer[0] = 0.0f; - vpointer[1] = 0.0f; - vpointer[2] = dx; - vpointer[3] = dy; - - vpointer[4] = 1.0f; - vpointer[5] = 0.0f; - vpointer[6] = (float)width + dx; - vpointer[7] = dy; - - vpointer[8] = 1.0f; - vpointer[9] = 1.0f; - vpointer[10] = (float)width + dx; - vpointer[11] = (float)height + dy; - - vpointer[12] = 0.0f; - vpointer[13] = 1.0f; - vpointer[14] = dx; - vpointer[15] = (float)height + dy; - - if(vertex_buffer) { - glUnmapBuffer(GL_ARRAY_BUFFER); - } - } - - GLuint vertex_array_object; - GLuint position_attribute, texcoord_attribute; - - glGenVertexArrays(1, &vertex_array_object); - glBindVertexArray(vertex_array_object); - - texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); - position_attribute = glGetAttribLocation(shader_program, "pos"); - - glEnableVertexAttribArray(texcoord_attribute); - glEnableVertexAttribArray(position_attribute); - - glVertexAttribPointer(texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); - glVertexAttribPointer(position_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)(sizeof(float) * 2)); - - glDrawArrays(GL_TRIANGLE_FAN, 0, 4); - - if(vertex_buffer) { - glBindBuffer(GL_ARRAY_BUFFER, 0); - } - - if(use_fallback_shader) { - glUseProgram(0); - } - else { - draw_params.unbind_display_space_shader_cb(); - } - - glDeleteVertexArrays(1, &vertex_array_object); - glBindTexture(GL_TEXTURE_2D, 0); - glDeleteTextures(1, &texid); - - if(transparent) { - glDisable(GL_BLEND); - } + const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); + + assert(rgba.type == MEM_PIXELS); + mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1)); + + GLuint texid; + glActiveTexture(GL_TEXTURE0); + glGenTextures(1, &texid); + 
glBindTexture(GL_TEXTURE_2D, texid); + + if (rgba.data_type == TYPE_HALF) { + GLhalf *data_pointer = (GLhalf *)rgba.host_pointer; + data_pointer += 4 * y * w; + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer); + } + else { + uint8_t *data_pointer = (uint8_t *)rgba.host_pointer; + data_pointer += 4 * y * w; + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer); + } + + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + + if (transparent) { + glEnable(GL_BLEND); + glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); + } + + GLint shader_program; + if (use_fallback_shader) { + if (!bind_fallback_display_space_shader(dw, dh)) { + return; + } + shader_program = fallback_shader_program; + } + else { + draw_params.bind_display_space_shader_cb(); + glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); + } + + if (!vertex_buffer) { + glGenBuffers(1, &vertex_buffer); + } + + glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); + /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */ + glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); + + float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); + + if (vpointer) { + /* texture coordinate - vertex pair */ + vpointer[0] = 0.0f; + vpointer[1] = 0.0f; + vpointer[2] = dx; + vpointer[3] = dy; + + vpointer[4] = 1.0f; + vpointer[5] = 0.0f; + vpointer[6] = (float)width + dx; + vpointer[7] = dy; + + vpointer[8] = 1.0f; + vpointer[9] = 1.0f; + vpointer[10] = (float)width + dx; + vpointer[11] = (float)height + dy; + + vpointer[12] = 0.0f; + vpointer[13] = 1.0f; + vpointer[14] = dx; + vpointer[15] = (float)height + dy; + + if (vertex_buffer) { + glUnmapBuffer(GL_ARRAY_BUFFER); + } + } + + GLuint vertex_array_object; + GLuint position_attribute, texcoord_attribute; + + glGenVertexArrays(1, 
&vertex_array_object); + glBindVertexArray(vertex_array_object); + + texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); + position_attribute = glGetAttribLocation(shader_program, "pos"); + + glEnableVertexAttribArray(texcoord_attribute); + glEnableVertexAttribArray(position_attribute); + + glVertexAttribPointer( + texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); + glVertexAttribPointer(position_attribute, + 2, + GL_FLOAT, + GL_FALSE, + 4 * sizeof(float), + (const GLvoid *)(sizeof(float) * 2)); + + glDrawArrays(GL_TRIANGLE_FAN, 0, 4); + + if (vertex_buffer) { + glBindBuffer(GL_ARRAY_BUFFER, 0); + } + + if (use_fallback_shader) { + glUseProgram(0); + } + else { + draw_params.unbind_display_space_shader_cb(); + } + + glDeleteVertexArrays(1, &vertex_array_object); + glBindTexture(GL_TEXTURE_2D, 0); + glDeleteTextures(1, &texid); + + if (transparent) { + glDisable(GL_BLEND); + } } -Device *Device::create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background) +Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) { - Device *device; + Device *device; - switch(info.type) { - case DEVICE_CPU: - device = device_cpu_create(info, stats, profiler, background); - break; + switch (info.type) { + case DEVICE_CPU: + device = device_cpu_create(info, stats, profiler, background); + break; #ifdef WITH_CUDA - case DEVICE_CUDA: - if(device_cuda_init()) - device = device_cuda_create(info, stats, profiler, background); - else - device = NULL; - break; + case DEVICE_CUDA: + if (device_cuda_init()) + device = device_cuda_create(info, stats, profiler, background); + else + device = NULL; + break; #endif #ifdef WITH_MULTI - case DEVICE_MULTI: - device = device_multi_create(info, stats, profiler, background); - break; + case DEVICE_MULTI: + device = device_multi_create(info, stats, profiler, background); + break; #endif #ifdef WITH_NETWORK - case DEVICE_NETWORK: - device = 
device_network_create(info, stats, profiler, "127.0.0.1"); - break; + case DEVICE_NETWORK: + device = device_network_create(info, stats, profiler, "127.0.0.1"); + break; #endif #ifdef WITH_OPENCL - case DEVICE_OPENCL: - if(device_opencl_init()) - device = device_opencl_create(info, stats, profiler, background); - else - device = NULL; - break; + case DEVICE_OPENCL: + if (device_opencl_init()) + device = device_opencl_create(info, stats, profiler, background); + else + device = NULL; + break; #endif - default: - return NULL; - } + default: + return NULL; + } - return device; + return device; } DeviceType Device::type_from_string(const char *name) { - if(strcmp(name, "CPU") == 0) - return DEVICE_CPU; - else if(strcmp(name, "CUDA") == 0) - return DEVICE_CUDA; - else if(strcmp(name, "OPENCL") == 0) - return DEVICE_OPENCL; - else if(strcmp(name, "NETWORK") == 0) - return DEVICE_NETWORK; - else if(strcmp(name, "MULTI") == 0) - return DEVICE_MULTI; - - return DEVICE_NONE; + if (strcmp(name, "CPU") == 0) + return DEVICE_CPU; + else if (strcmp(name, "CUDA") == 0) + return DEVICE_CUDA; + else if (strcmp(name, "OPENCL") == 0) + return DEVICE_OPENCL; + else if (strcmp(name, "NETWORK") == 0) + return DEVICE_NETWORK; + else if (strcmp(name, "MULTI") == 0) + return DEVICE_MULTI; + + return DEVICE_NONE; } string Device::string_from_type(DeviceType type) { - if(type == DEVICE_CPU) - return "CPU"; - else if(type == DEVICE_CUDA) - return "CUDA"; - else if(type == DEVICE_OPENCL) - return "OPENCL"; - else if(type == DEVICE_NETWORK) - return "NETWORK"; - else if(type == DEVICE_MULTI) - return "MULTI"; - - return ""; + if (type == DEVICE_CPU) + return "CPU"; + else if (type == DEVICE_CUDA) + return "CUDA"; + else if (type == DEVICE_OPENCL) + return "OPENCL"; + else if (type == DEVICE_NETWORK) + return "NETWORK"; + else if (type == DEVICE_MULTI) + return "MULTI"; + + return ""; } vector<DeviceType> Device::available_types() { - vector<DeviceType> types; - types.push_back(DEVICE_CPU); + 
vector<DeviceType> types; + types.push_back(DEVICE_CPU); #ifdef WITH_CUDA - types.push_back(DEVICE_CUDA); + types.push_back(DEVICE_CUDA); #endif #ifdef WITH_OPENCL - types.push_back(DEVICE_OPENCL); + types.push_back(DEVICE_OPENCL); #endif #ifdef WITH_NETWORK - types.push_back(DEVICE_NETWORK); + types.push_back(DEVICE_NETWORK); #endif - return types; + return types; } vector<DeviceInfo> Device::available_devices(uint mask) { - /* Lazy initialize devices. On some platforms OpenCL or CUDA drivers can - * be broken and cause crashes when only trying to get device info, so - * we don't want to do any initialization until the user chooses to. */ - thread_scoped_lock lock(device_mutex); - vector<DeviceInfo> devices; + /* Lazy initialize devices. On some platforms OpenCL or CUDA drivers can + * be broken and cause crashes when only trying to get device info, so + * we don't want to do any initialization until the user chooses to. */ + thread_scoped_lock lock(device_mutex); + vector<DeviceInfo> devices; #ifdef WITH_OPENCL - if(mask & DEVICE_MASK_OPENCL) { - if(!(devices_initialized_mask & DEVICE_MASK_OPENCL)) { - if(device_opencl_init()) { - device_opencl_info(opencl_devices); - } - devices_initialized_mask |= DEVICE_MASK_OPENCL; - } - foreach(DeviceInfo& info, opencl_devices) { - devices.push_back(info); - } - } + if (mask & DEVICE_MASK_OPENCL) { + if (!(devices_initialized_mask & DEVICE_MASK_OPENCL)) { + if (device_opencl_init()) { + device_opencl_info(opencl_devices); + } + devices_initialized_mask |= DEVICE_MASK_OPENCL; + } + foreach (DeviceInfo &info, opencl_devices) { + devices.push_back(info); + } + } #endif #ifdef WITH_CUDA - if(mask & DEVICE_MASK_CUDA) { - if(!(devices_initialized_mask & DEVICE_MASK_CUDA)) { - if(device_cuda_init()) { - device_cuda_info(cuda_devices); - } - devices_initialized_mask |= DEVICE_MASK_CUDA; - } - foreach(DeviceInfo& info, cuda_devices) { - devices.push_back(info); - } - } + if (mask & DEVICE_MASK_CUDA) { + if (!(devices_initialized_mask 
& DEVICE_MASK_CUDA)) { + if (device_cuda_init()) { + device_cuda_info(cuda_devices); + } + devices_initialized_mask |= DEVICE_MASK_CUDA; + } + foreach (DeviceInfo &info, cuda_devices) { + devices.push_back(info); + } + } #endif - if(mask & DEVICE_MASK_CPU) { - if(!(devices_initialized_mask & DEVICE_MASK_CPU)) { - device_cpu_info(cpu_devices); - devices_initialized_mask |= DEVICE_MASK_CPU; - } - foreach(DeviceInfo& info, cpu_devices) { - devices.push_back(info); - } - } + if (mask & DEVICE_MASK_CPU) { + if (!(devices_initialized_mask & DEVICE_MASK_CPU)) { + device_cpu_info(cpu_devices); + devices_initialized_mask |= DEVICE_MASK_CPU; + } + foreach (DeviceInfo &info, cpu_devices) { + devices.push_back(info); + } + } #ifdef WITH_NETWORK - if(mask & DEVICE_MASK_NETWORK) { - if(!(devices_initialized_mask & DEVICE_MASK_NETWORK)) { - device_network_info(network_devices); - devices_initialized_mask |= DEVICE_MASK_NETWORK; - } - foreach(DeviceInfo& info, network_devices) { - devices.push_back(info); - } - } + if (mask & DEVICE_MASK_NETWORK) { + if (!(devices_initialized_mask & DEVICE_MASK_NETWORK)) { + device_network_info(network_devices); + devices_initialized_mask |= DEVICE_MASK_NETWORK; + } + foreach (DeviceInfo &info, network_devices) { + devices.push_back(info); + } + } #endif - return devices; + return devices; } string Device::device_capabilities(uint mask) { - thread_scoped_lock lock(device_mutex); - string capabilities = ""; + thread_scoped_lock lock(device_mutex); + string capabilities = ""; - if(mask & DEVICE_MASK_CPU) { - capabilities += "\nCPU device capabilities: "; - capabilities += device_cpu_capabilities() + "\n"; - } + if (mask & DEVICE_MASK_CPU) { + capabilities += "\nCPU device capabilities: "; + capabilities += device_cpu_capabilities() + "\n"; + } #ifdef WITH_OPENCL - if(mask & DEVICE_MASK_OPENCL) { - if(device_opencl_init()) { - capabilities += "\nOpenCL device capabilities:\n"; - capabilities += device_opencl_capabilities(); - } - } + if (mask & 
DEVICE_MASK_OPENCL) { + if (device_opencl_init()) { + capabilities += "\nOpenCL device capabilities:\n"; + capabilities += device_opencl_capabilities(); + } + } #endif #ifdef WITH_CUDA - if(mask & DEVICE_MASK_CUDA) { - if(device_cuda_init()) { - capabilities += "\nCUDA device capabilities:\n"; - capabilities += device_cuda_capabilities(); - } - } + if (mask & DEVICE_MASK_CUDA) { + if (device_cuda_init()) { + capabilities += "\nCUDA device capabilities:\n"; + capabilities += device_cuda_capabilities(); + } + } #endif - return capabilities; + return capabilities; } -DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int threads, bool background) +DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, + int threads, + bool background) { - assert(subdevices.size() > 0); - - if(subdevices.size() == 1) { - /* No multi device needed. */ - return subdevices.front(); - } - - DeviceInfo info; - info.type = DEVICE_MULTI; - info.id = "MULTI"; - info.description = "Multi Device"; - info.num = 0; - - info.has_half_images = true; - info.has_volume_decoupled = true; - info.has_osl = true; - info.has_profiling = true; - - foreach(const DeviceInfo &device, subdevices) { - /* Ensure CPU device does not slow down GPU. */ - if(device.type == DEVICE_CPU && subdevices.size() > 1) { - if(background) { - int orig_cpu_threads = (threads)? threads: system_cpu_thread_count(); - int cpu_threads = max(orig_cpu_threads - (subdevices.size() - 1), 0); - - VLOG(1) << "CPU render threads reduced from " - << orig_cpu_threads << " to " << cpu_threads - << ", to dedicate to GPU."; - - if(cpu_threads >= 1) { - DeviceInfo cpu_device = device; - cpu_device.cpu_threads = cpu_threads; - info.multi_devices.push_back(cpu_device); - } - else { - continue; - } - } - else { - VLOG(1) << "CPU render threads disabled for interactive render."; - continue; - } - } - else { - info.multi_devices.push_back(device); - } - - /* Accumulate device info. 
*/ - info.has_half_images &= device.has_half_images; - info.has_volume_decoupled &= device.has_volume_decoupled; - info.has_osl &= device.has_osl; - info.has_profiling &= device.has_profiling; - } - - return info; + assert(subdevices.size() > 0); + + if (subdevices.size() == 1) { + /* No multi device needed. */ + return subdevices.front(); + } + + DeviceInfo info; + info.type = DEVICE_MULTI; + info.id = "MULTI"; + info.description = "Multi Device"; + info.num = 0; + + info.has_half_images = true; + info.has_volume_decoupled = true; + info.has_osl = true; + info.has_profiling = true; + + foreach (const DeviceInfo &device, subdevices) { + /* Ensure CPU device does not slow down GPU. */ + if (device.type == DEVICE_CPU && subdevices.size() > 1) { + if (background) { + int orig_cpu_threads = (threads) ? threads : system_cpu_thread_count(); + int cpu_threads = max(orig_cpu_threads - (subdevices.size() - 1), 0); + + VLOG(1) << "CPU render threads reduced from " << orig_cpu_threads << " to " << cpu_threads + << ", to dedicate to GPU."; + + if (cpu_threads >= 1) { + DeviceInfo cpu_device = device; + cpu_device.cpu_threads = cpu_threads; + info.multi_devices.push_back(cpu_device); + } + else { + continue; + } + } + else { + VLOG(1) << "CPU render threads disabled for interactive render."; + continue; + } + } + else { + info.multi_devices.push_back(device); + } + + /* Accumulate device info. 
*/ + info.has_half_images &= device.has_half_images; + info.has_volume_decoupled &= device.has_volume_decoupled; + info.has_osl &= device.has_osl; + info.has_profiling &= device.has_profiling; + } + + return info; } void Device::tag_update() { - free_memory(); + free_memory(); } void Device::free_memory() { - devices_initialized_mask = 0; - cuda_devices.free_memory(); - opencl_devices.free_memory(); - cpu_devices.free_memory(); - network_devices.free_memory(); + devices_initialized_mask = 0; + cuda_devices.free_memory(); + opencl_devices.free_memory(); + cpu_devices.free_memory(); + network_devices.free_memory(); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index aa0a8e434d2..15a0ceb4a19 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -40,384 +40,428 @@ class RenderTile; /* Device Types */ enum DeviceType { - DEVICE_NONE = 0, - DEVICE_CPU, - DEVICE_OPENCL, - DEVICE_CUDA, - DEVICE_NETWORK, - DEVICE_MULTI + DEVICE_NONE = 0, + DEVICE_CPU, + DEVICE_OPENCL, + DEVICE_CUDA, + DEVICE_NETWORK, + DEVICE_MULTI }; enum DeviceTypeMask { - DEVICE_MASK_CPU = (1 << DEVICE_CPU), - DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL), - DEVICE_MASK_CUDA = (1 << DEVICE_CUDA), - DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK), - DEVICE_MASK_ALL = ~0 + DEVICE_MASK_CPU = (1 << DEVICE_CPU), + DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL), + DEVICE_MASK_CUDA = (1 << DEVICE_CUDA), + DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK), + DEVICE_MASK_ALL = ~0 }; enum DeviceKernelStatus { - DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL = 0, - DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE, - DEVICE_KERNEL_USING_FEATURE_KERNEL, - DEVICE_KERNEL_FEATURE_KERNEL_INVALID, - DEVICE_KERNEL_UNKNOWN, + DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL = 0, + DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE, + DEVICE_KERNEL_USING_FEATURE_KERNEL, + DEVICE_KERNEL_FEATURE_KERNEL_INVALID, + DEVICE_KERNEL_UNKNOWN, }; #define DEVICE_MASK(type) (DeviceTypeMask)(1 << type) class 
DeviceInfo { -public: - DeviceType type; - string description; - string id; /* used for user preferences, should stay fixed with changing hardware config */ - int num; - bool display_device; /* GPU is used as a display device. */ - bool has_half_images; /* Support half-float textures. */ - bool has_volume_decoupled; /* Decoupled volume shading. */ - bool has_osl; /* Support Open Shading Language. */ - bool use_split_kernel; /* Use split or mega kernel. */ - bool has_profiling; /* Supports runtime collection of profiling info. */ - int cpu_threads; - vector<DeviceInfo> multi_devices; - - DeviceInfo() - { - type = DEVICE_CPU; - id = "CPU"; - num = 0; - cpu_threads = 0; - display_device = false; - has_half_images = false; - has_volume_decoupled = false; - has_osl = false; - use_split_kernel = false; - has_profiling = false; - } - - bool operator==(const DeviceInfo &info) { - /* Multiple Devices with the same ID would be very bad. */ - assert(id != info.id || (type == info.type && num == info.num && description == info.description)); - return id == info.id; - } + public: + DeviceType type; + string description; + string id; /* used for user preferences, should stay fixed with changing hardware config */ + int num; + bool display_device; /* GPU is used as a display device. */ + bool has_half_images; /* Support half-float textures. */ + bool has_volume_decoupled; /* Decoupled volume shading. */ + bool has_osl; /* Support Open Shading Language. */ + bool use_split_kernel; /* Use split or mega kernel. */ + bool has_profiling; /* Supports runtime collection of profiling info. 
*/ + int cpu_threads; + vector<DeviceInfo> multi_devices; + + DeviceInfo() + { + type = DEVICE_CPU; + id = "CPU"; + num = 0; + cpu_threads = 0; + display_device = false; + has_half_images = false; + has_volume_decoupled = false; + has_osl = false; + use_split_kernel = false; + has_profiling = false; + } + + bool operator==(const DeviceInfo &info) + { + /* Multiple Devices with the same ID would be very bad. */ + assert(id != info.id || + (type == info.type && num == info.num && description == info.description)); + return id == info.id; + } }; class DeviceRequestedFeatures { -public: - /* Use experimental feature set. */ - bool experimental; - - /* Selective nodes compilation. */ - - /* Identifier of a node group up to which all the nodes needs to be - * compiled in. Nodes from higher group indices will be ignores. - */ - int max_nodes_group; - - /* Features bitfield indicating which features from the requested group - * will be compiled in. Nodes which corresponds to features which are not - * in this bitfield will be ignored even if they're in the requested group. - */ - int nodes_features; - - /* BVH/sampling kernel features. */ - bool use_hair; - bool use_object_motion; - bool use_camera_motion; - - /* Denotes whether baking functionality is needed. */ - bool use_baking; - - /* Use subsurface scattering materials. */ - bool use_subsurface; - - /* Use volume materials. */ - bool use_volume; - - /* Use branched integrator. */ - bool use_integrator_branched; - - /* Use OpenSubdiv patch evaluation */ - bool use_patch_evaluation; - - /* Use Transparent shadows */ - bool use_transparent; - - /* Use various shadow tricks, such as shadow catcher. */ - bool use_shadow_tricks; - - /* Per-uber shader usage flags. */ - bool use_principled; - - /* Denoising features. */ - bool use_denoising; - - /* Use raytracing in shaders. 
*/ - bool use_shader_raytrace; - - /* Use true displacement */ - bool use_true_displacement; - - /* Use background lights */ - bool use_background_light; - - DeviceRequestedFeatures() - { - /* TODO(sergey): Find more meaningful defaults. */ - experimental = false; - max_nodes_group = 0; - nodes_features = 0; - use_hair = false; - use_object_motion = false; - use_camera_motion = false; - use_baking = false; - use_subsurface = false; - use_volume = false; - use_integrator_branched = false; - use_patch_evaluation = false; - use_transparent = false; - use_shadow_tricks = false; - use_principled = false; - use_denoising = false; - use_shader_raytrace = false; - use_true_displacement = false; - use_background_light = false; - } - - bool modified(const DeviceRequestedFeatures& requested_features) - { - return !(experimental == requested_features.experimental && - max_nodes_group == requested_features.max_nodes_group && - nodes_features == requested_features.nodes_features && - use_hair == requested_features.use_hair && - use_object_motion == requested_features.use_object_motion && - use_camera_motion == requested_features.use_camera_motion && - use_baking == requested_features.use_baking && - use_subsurface == requested_features.use_subsurface && - use_volume == requested_features.use_volume && - use_integrator_branched == requested_features.use_integrator_branched && - use_patch_evaluation == requested_features.use_patch_evaluation && - use_transparent == requested_features.use_transparent && - use_shadow_tricks == requested_features.use_shadow_tricks && - use_principled == requested_features.use_principled && - use_denoising == requested_features.use_denoising && - use_shader_raytrace == requested_features.use_shader_raytrace && - use_true_displacement == requested_features.use_true_displacement && - use_background_light == requested_features.use_background_light); - } - - /* Convert the requested features structure to a build options, - * which could then be passed to 
compilers. - */ - string get_build_options() const - { - string build_options = ""; - if(experimental) { - build_options += "-D__KERNEL_EXPERIMENTAL__ "; - } - build_options += "-D__NODES_MAX_GROUP__=" + - string_printf("%d", max_nodes_group); - build_options += " -D__NODES_FEATURES__=" + - string_printf("%d", nodes_features); - if(!use_hair) { - build_options += " -D__NO_HAIR__"; - } - if(!use_object_motion) { - build_options += " -D__NO_OBJECT_MOTION__"; - } - if(!use_camera_motion) { - build_options += " -D__NO_CAMERA_MOTION__"; - } - if(!use_baking) { - build_options += " -D__NO_BAKING__"; - } - if(!use_volume) { - build_options += " -D__NO_VOLUME__"; - } - if(!use_subsurface) { - build_options += " -D__NO_SUBSURFACE__"; - } - if(!use_integrator_branched) { - build_options += " -D__NO_BRANCHED_PATH__"; - } - if(!use_patch_evaluation) { - build_options += " -D__NO_PATCH_EVAL__"; - } - if(!use_transparent && !use_volume) { - build_options += " -D__NO_TRANSPARENT__"; - } - if(!use_shadow_tricks) { - build_options += " -D__NO_SHADOW_TRICKS__"; - } - if(!use_principled) { - build_options += " -D__NO_PRINCIPLED__"; - } - if(!use_denoising) { - build_options += " -D__NO_DENOISING__"; - } - if(!use_shader_raytrace) { - build_options += " -D__NO_SHADER_RAYTRACE__"; - } - return build_options; - } + public: + /* Use experimental feature set. */ + bool experimental; + + /* Selective nodes compilation. */ + + /* Identifier of a node group up to which all the nodes needs to be + * compiled in. Nodes from higher group indices will be ignores. + */ + int max_nodes_group; + + /* Features bitfield indicating which features from the requested group + * will be compiled in. Nodes which corresponds to features which are not + * in this bitfield will be ignored even if they're in the requested group. + */ + int nodes_features; + + /* BVH/sampling kernel features. 
*/ + bool use_hair; + bool use_object_motion; + bool use_camera_motion; + + /* Denotes whether baking functionality is needed. */ + bool use_baking; + + /* Use subsurface scattering materials. */ + bool use_subsurface; + + /* Use volume materials. */ + bool use_volume; + + /* Use branched integrator. */ + bool use_integrator_branched; + + /* Use OpenSubdiv patch evaluation */ + bool use_patch_evaluation; + + /* Use Transparent shadows */ + bool use_transparent; + + /* Use various shadow tricks, such as shadow catcher. */ + bool use_shadow_tricks; + + /* Per-uber shader usage flags. */ + bool use_principled; + + /* Denoising features. */ + bool use_denoising; + + /* Use raytracing in shaders. */ + bool use_shader_raytrace; + + /* Use true displacement */ + bool use_true_displacement; + + /* Use background lights */ + bool use_background_light; + + DeviceRequestedFeatures() + { + /* TODO(sergey): Find more meaningful defaults. */ + experimental = false; + max_nodes_group = 0; + nodes_features = 0; + use_hair = false; + use_object_motion = false; + use_camera_motion = false; + use_baking = false; + use_subsurface = false; + use_volume = false; + use_integrator_branched = false; + use_patch_evaluation = false; + use_transparent = false; + use_shadow_tricks = false; + use_principled = false; + use_denoising = false; + use_shader_raytrace = false; + use_true_displacement = false; + use_background_light = false; + } + + bool modified(const DeviceRequestedFeatures &requested_features) + { + return !(experimental == requested_features.experimental && + max_nodes_group == requested_features.max_nodes_group && + nodes_features == requested_features.nodes_features && + use_hair == requested_features.use_hair && + use_object_motion == requested_features.use_object_motion && + use_camera_motion == requested_features.use_camera_motion && + use_baking == requested_features.use_baking && + use_subsurface == requested_features.use_subsurface && + use_volume == 
requested_features.use_volume && + use_integrator_branched == requested_features.use_integrator_branched && + use_patch_evaluation == requested_features.use_patch_evaluation && + use_transparent == requested_features.use_transparent && + use_shadow_tricks == requested_features.use_shadow_tricks && + use_principled == requested_features.use_principled && + use_denoising == requested_features.use_denoising && + use_shader_raytrace == requested_features.use_shader_raytrace && + use_true_displacement == requested_features.use_true_displacement && + use_background_light == requested_features.use_background_light); + } + + /* Convert the requested features structure to a build options, + * which could then be passed to compilers. + */ + string get_build_options() const + { + string build_options = ""; + if (experimental) { + build_options += "-D__KERNEL_EXPERIMENTAL__ "; + } + build_options += "-D__NODES_MAX_GROUP__=" + string_printf("%d", max_nodes_group); + build_options += " -D__NODES_FEATURES__=" + string_printf("%d", nodes_features); + if (!use_hair) { + build_options += " -D__NO_HAIR__"; + } + if (!use_object_motion) { + build_options += " -D__NO_OBJECT_MOTION__"; + } + if (!use_camera_motion) { + build_options += " -D__NO_CAMERA_MOTION__"; + } + if (!use_baking) { + build_options += " -D__NO_BAKING__"; + } + if (!use_volume) { + build_options += " -D__NO_VOLUME__"; + } + if (!use_subsurface) { + build_options += " -D__NO_SUBSURFACE__"; + } + if (!use_integrator_branched) { + build_options += " -D__NO_BRANCHED_PATH__"; + } + if (!use_patch_evaluation) { + build_options += " -D__NO_PATCH_EVAL__"; + } + if (!use_transparent && !use_volume) { + build_options += " -D__NO_TRANSPARENT__"; + } + if (!use_shadow_tricks) { + build_options += " -D__NO_SHADOW_TRICKS__"; + } + if (!use_principled) { + build_options += " -D__NO_PRINCIPLED__"; + } + if (!use_denoising) { + build_options += " -D__NO_DENOISING__"; + } + if (!use_shader_raytrace) { + build_options += " 
-D__NO_SHADER_RAYTRACE__"; + } + return build_options; + } }; -std::ostream& operator <<(std::ostream &os, - const DeviceRequestedFeatures& requested_features); +std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features); /* Device */ struct DeviceDrawParams { - function<void()> bind_display_space_shader_cb; - function<void()> unbind_display_space_shader_cb; + function<void()> bind_display_space_shader_cb; + function<void()> unbind_display_space_shader_cb; }; class Device { - friend class device_sub_ptr; -protected: - enum { - FALLBACK_SHADER_STATUS_NONE = 0, - FALLBACK_SHADER_STATUS_ERROR, - FALLBACK_SHADER_STATUS_SUCCESS, - }; - - Device(DeviceInfo& info_, Stats &stats_, Profiler &profiler_, bool background) : background(background), - vertex_buffer(0), - fallback_status(FALLBACK_SHADER_STATUS_NONE), fallback_shader_program(0), - info(info_), stats(stats_), profiler(profiler_) {} - - bool background; - string error_msg; - - /* used for real time display */ - unsigned int vertex_buffer; - int fallback_status, fallback_shader_program; - int image_texture_location, fullscreen_location; - - bool bind_fallback_display_space_shader(const float width, const float height); - - virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/) - { - /* Only required for devices that implement denoising. 
*/ - assert(false); - return (device_ptr) 0; - } - virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {}; - -public: - virtual ~Device(); - - /* info */ - DeviceInfo info; - virtual const string& error_message() { return error_msg; } - bool have_error() { return !error_message().empty(); } - virtual void set_error(const string& error) - { - if(!have_error()) { - error_msg = error; - } - fprintf(stderr, "%s\n", error.c_str()); - fflush(stderr); - } - virtual bool show_samples() const { return false; } - virtual BVHLayoutMask get_bvh_layout_mask() const = 0; - - /* statistics */ - Stats &stats; - Profiler &profiler; - - /* memory alignment */ - virtual int mem_sub_ptr_alignment() { return MIN_ALIGNMENT_CPU_DATA_TYPES; } - - /* constant memory */ - virtual void const_copy_to(const char *name, void *host, size_t size) = 0; - - /* open shading language, only for CPU device */ - virtual void *osl_memory() { return NULL; } - - /* load/compile kernels, must be called before adding tasks */ - virtual bool load_kernels( - const DeviceRequestedFeatures& /*requested_features*/) - { return true; } - - /* Wait for device to become available to upload data and receive tasks - * This method is used by the OpenCL device to load the - * optimized kernels or when not (yet) available load the - * generic kernels (only during foreground rendering) */ - virtual bool wait_for_availability( - const DeviceRequestedFeatures& /*requested_features*/) - { return true; } - /* Check if there are 'better' kernels available to be used - * We can switch over to these kernels - * This method is used to determine if we can switch the preview kernels - * to regular kernels */ - virtual DeviceKernelStatus get_active_kernel_switch_state() - { return DEVICE_KERNEL_USING_FEATURE_KERNEL; } - - /* tasks */ - virtual int get_split_task_count(DeviceTask& task) = 0; - virtual void task_add(DeviceTask& task) = 0; - virtual void task_wait() = 0; - virtual void task_cancel() = 0; - - /* opengl drawing */ - virtual 
void draw_pixels(device_memory& mem, int y, - int w, int h, int width, int height, - int dx, int dy, int dw, int dh, - bool transparent, const DeviceDrawParams &draw_params); + friend class device_sub_ptr; + + protected: + enum { + FALLBACK_SHADER_STATUS_NONE = 0, + FALLBACK_SHADER_STATUS_ERROR, + FALLBACK_SHADER_STATUS_SUCCESS, + }; + + Device(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background) + : background(background), + vertex_buffer(0), + fallback_status(FALLBACK_SHADER_STATUS_NONE), + fallback_shader_program(0), + info(info_), + stats(stats_), + profiler(profiler_) + { + } + + bool background; + string error_msg; + + /* used for real time display */ + unsigned int vertex_buffer; + int fallback_status, fallback_shader_program; + int image_texture_location, fullscreen_location; + + bool bind_fallback_display_space_shader(const float width, const float height); + + virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/) + { + /* Only required for devices that implement denoising. 
*/ + assert(false); + return (device_ptr)0; + } + virtual void mem_free_sub_ptr(device_ptr /*ptr*/){}; + + public: + virtual ~Device(); + + /* info */ + DeviceInfo info; + virtual const string &error_message() + { + return error_msg; + } + bool have_error() + { + return !error_message().empty(); + } + virtual void set_error(const string &error) + { + if (!have_error()) { + error_msg = error; + } + fprintf(stderr, "%s\n", error.c_str()); + fflush(stderr); + } + virtual bool show_samples() const + { + return false; + } + virtual BVHLayoutMask get_bvh_layout_mask() const = 0; + + /* statistics */ + Stats &stats; + Profiler &profiler; + + /* memory alignment */ + virtual int mem_sub_ptr_alignment() + { + return MIN_ALIGNMENT_CPU_DATA_TYPES; + } + + /* constant memory */ + virtual void const_copy_to(const char *name, void *host, size_t size) = 0; + + /* open shading language, only for CPU device */ + virtual void *osl_memory() + { + return NULL; + } + + /* load/compile kernels, must be called before adding tasks */ + virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/) + { + return true; + } + + /* Wait for device to become available to upload data and receive tasks + * This method is used by the OpenCL device to load the + * optimized kernels or when not (yet) available load the + * generic kernels (only during foreground rendering) */ + virtual bool wait_for_availability(const DeviceRequestedFeatures & /*requested_features*/) + { + return true; + } + /* Check if there are 'better' kernels available to be used + * We can switch over to these kernels + * This method is used to determine if we can switch the preview kernels + * to regular kernels */ + virtual DeviceKernelStatus get_active_kernel_switch_state() + { + return DEVICE_KERNEL_USING_FEATURE_KERNEL; + } + + /* tasks */ + virtual int get_split_task_count(DeviceTask &task) = 0; + virtual void task_add(DeviceTask &task) = 0; + virtual void task_wait() = 0; + virtual void task_cancel() = 
0; + + /* opengl drawing */ + virtual void draw_pixels(device_memory &mem, + int y, + int w, + int h, + int width, + int height, + int dx, + int dy, + int dw, + int dh, + bool transparent, + const DeviceDrawParams &draw_params); #ifdef WITH_NETWORK - /* networking */ - void server_run(); + /* networking */ + void server_run(); #endif - /* multi device */ - virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {} - virtual int device_number(Device * /*sub_device*/) { return 0; } - virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {} - virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {} - - /* static */ - static Device *create(DeviceInfo& info, Stats &stats, Profiler& profiler, bool background = true); - - static DeviceType type_from_string(const char *name); - static string string_from_type(DeviceType type); - static vector<DeviceType> available_types(); - static vector<DeviceInfo> available_devices(uint device_type_mask = DEVICE_MASK_ALL); - static string device_capabilities(uint device_type_mask = DEVICE_MASK_ALL); - static DeviceInfo get_multi_device(const vector<DeviceInfo>& subdevices, - int threads, - bool background); - - /* Tag devices lists for update. */ - static void tag_update(); - - static void free_memory(); - -protected: - /* Memory allocation, only accessed through device_memory. */ - friend class MultiDevice; - friend class DeviceServer; - friend class device_memory; - - virtual void mem_alloc(device_memory& mem) = 0; - virtual void mem_copy_to(device_memory& mem) = 0; - virtual void mem_copy_from(device_memory& mem, - int y, int w, int h, int elem) = 0; - virtual void mem_zero(device_memory& mem) = 0; - virtual void mem_free(device_memory& mem) = 0; - -private: - /* Indicted whether device types and devices lists were initialized. 
*/ - static bool need_types_update, need_devices_update; - static thread_mutex device_mutex; - static vector<DeviceInfo> cuda_devices; - static vector<DeviceInfo> opencl_devices; - static vector<DeviceInfo> cpu_devices; - static vector<DeviceInfo> network_devices; - static uint devices_initialized_mask; + /* multi device */ + virtual void map_tile(Device * /*sub_device*/, RenderTile & /*tile*/) + { + } + virtual int device_number(Device * /*sub_device*/) + { + return 0; + } + virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) + { + } + virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) + { + } + + /* static */ + static Device *create(DeviceInfo &info, + Stats &stats, + Profiler &profiler, + bool background = true); + + static DeviceType type_from_string(const char *name); + static string string_from_type(DeviceType type); + static vector<DeviceType> available_types(); + static vector<DeviceInfo> available_devices(uint device_type_mask = DEVICE_MASK_ALL); + static string device_capabilities(uint device_type_mask = DEVICE_MASK_ALL); + static DeviceInfo get_multi_device(const vector<DeviceInfo> &subdevices, + int threads, + bool background); + + /* Tag devices lists for update. */ + static void tag_update(); + + static void free_memory(); + + protected: + /* Memory allocation, only accessed through device_memory. */ + friend class MultiDevice; + friend class DeviceServer; + friend class device_memory; + + virtual void mem_alloc(device_memory &mem) = 0; + virtual void mem_copy_to(device_memory &mem) = 0; + virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) = 0; + virtual void mem_zero(device_memory &mem) = 0; + virtual void mem_free(device_memory &mem) = 0; + + private: + /* Indicted whether device types and devices lists were initialized. 
*/ + static bool need_types_update, need_devices_update; + static thread_mutex device_mutex; + static vector<DeviceInfo> cuda_devices; + static vector<DeviceInfo> opencl_devices; + static vector<DeviceInfo> cpu_devices; + static vector<DeviceInfo> network_devices; + static uint devices_initialized_mask; }; CCL_NAMESPACE_END -#endif /* __DEVICE_H__ */ +#endif /* __DEVICE_H__ */ diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 73f1fc02b08..837a8186064 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -61,1087 +61,1183 @@ class CPUDevice; /* Has to be outside of the class to be shared across template instantiations. */ static const char *logged_architecture = ""; -template<typename F> -class KernelFunctions { -public: - KernelFunctions() - { - kernel = (F)NULL; - } - - KernelFunctions(F kernel_default, - F kernel_sse2, - F kernel_sse3, - F kernel_sse41, - F kernel_avx, - F kernel_avx2) - { - const char *architecture_name = "default"; - kernel = kernel_default; - - /* Silence potential warnings about unused variables - * when compiling without some architectures. */ - (void) kernel_sse2; - (void) kernel_sse3; - (void) kernel_sse41; - (void) kernel_avx; - (void) kernel_avx2; +template<typename F> class KernelFunctions { + public: + KernelFunctions() + { + kernel = (F)NULL; + } + + KernelFunctions( + F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2) + { + const char *architecture_name = "default"; + kernel = kernel_default; + + /* Silence potential warnings about unused variables + * when compiling without some architectures. 
*/ + (void)kernel_sse2; + (void)kernel_sse3; + (void)kernel_sse41; + (void)kernel_avx; + (void)kernel_avx2; #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { - architecture_name = "AVX2"; - kernel = kernel_avx2; - } - else + if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { + architecture_name = "AVX2"; + kernel = kernel_avx2; + } + else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(DebugFlags().cpu.has_avx() && system_cpu_support_avx()) { - architecture_name = "AVX"; - kernel = kernel_avx; - } - else + if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) { + architecture_name = "AVX"; + kernel = kernel_avx; + } + else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) { - architecture_name = "SSE4.1"; - kernel = kernel_sse41; - } - else + if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) { + architecture_name = "SSE4.1"; + kernel = kernel_sse41; + } + else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) { - architecture_name = "SSE3"; - kernel = kernel_sse3; - } - else + if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) { + architecture_name = "SSE3"; + kernel = kernel_sse3; + } + else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { - architecture_name = "SSE2"; - kernel = kernel_sse2; - } + if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { + architecture_name = "SSE2"; + kernel = kernel_sse2; + } #endif - if(strcmp(architecture_name, logged_architecture) != 0) { - VLOG(1) << "Will be using " << architecture_name << " kernels."; - logged_architecture = architecture_name; - } - } - - inline F operator()() const { - assert(kernel); - return kernel; - } -protected: - F kernel; + if (strcmp(architecture_name, logged_architecture) != 0) { + VLOG(1) << "Will be 
using " << architecture_name << " kernels."; + logged_architecture = architecture_name; + } + } + + inline F operator()() const + { + assert(kernel); + return kernel; + } + + protected: + F kernel; }; class CPUSplitKernel : public DeviceSplitKernel { - CPUDevice *device; -public: - explicit CPUSplitKernel(CPUDevice *device); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data_, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs); - - virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures&); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); + CPUDevice *device; + + public: + explicit CPUSplitKernel(CPUDevice *device); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory &kernel_globals, + device_memory &kernel_data_, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs); + + virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, + const DeviceRequestedFeatures &); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task); + virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); }; -class CPUDevice : public Device -{ -public: - TaskPool task_pool; - KernelGlobals kernel_globals; +class CPUDevice : public Device { + public: + TaskPool task_pool; + KernelGlobals 
kernel_globals; - device_vector<TextureInfo> texture_info; - bool need_texture_info; + device_vector<TextureInfo> texture_info; + bool need_texture_info; #ifdef WITH_OSL - OSLGlobals osl_globals; + OSLGlobals osl_globals; #endif - bool use_split_kernel; - - DeviceRequestedFeatures requested_features; - - KernelFunctions<void(*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel; - KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel; - KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel; - KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> shader_kernel; - - KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel; - KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, float, int*, int, int)> filter_get_feature_kernel; - KernelFunctions<void(*)(int, int, int, int*, float*, float*, int, int*)> filter_write_feature_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; - - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, int, float, float)> filter_nlm_calc_difference_kernel; - KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; - KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)> filter_nlm_update_output_kernel; - KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; - - KernelFunctions<void(*)(float*, TileInfo*, int, int, int, float*, int*, int*, int, int, bool, int, float)> 
filter_construct_transform_kernel; - KernelFunctions<void(*)(int, int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int, bool)> filter_nlm_construct_gramian_kernel; - KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; - - KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*, - int, int, int, int, int, int, int, int, ccl_global int*, int, - ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel; - unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels; + bool use_split_kernel; + + DeviceRequestedFeatures requested_features; + + KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel; + KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> + convert_to_half_float_kernel; + KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> + convert_to_byte_kernel; + KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> + shader_kernel; + + KernelFunctions<void (*)( + int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)> + filter_divide_shadow_kernel; + KernelFunctions<void (*)( + int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)> + filter_get_feature_kernel; + KernelFunctions<void (*)(int, int, int, int *, float *, float *, int, int *)> + filter_write_feature_kernel; + KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)> + filter_detect_outliers_kernel; + KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)> + filter_combine_halves_kernel; + + KernelFunctions<void (*)( + int, int, float *, float *, float *, float *, int *, int, int, int, float, float)> + filter_nlm_calc_difference_kernel; + 
KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel; + KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel; + KernelFunctions<void (*)( + int, int, float *, float *, float *, float *, float *, int *, int, int, int)> + filter_nlm_update_output_kernel; + KernelFunctions<void (*)(float *, float *, int *, int)> filter_nlm_normalize_kernel; + + KernelFunctions<void (*)( + float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)> + filter_construct_transform_kernel; + KernelFunctions<void (*)(int, + int, + int, + float *, + float *, + float *, + int *, + float *, + float3 *, + int *, + int *, + int, + int, + int, + int, + bool)> + filter_nlm_construct_gramian_kernel; + KernelFunctions<void (*)(int, int, int, float *, int *, float *, float3 *, int *, int)> + filter_finalize_kernel; + + KernelFunctions<void (*)(KernelGlobals *, + ccl_constant KernelData *, + ccl_global void *, + int, + ccl_global char *, + int, + int, + int, + int, + int, + int, + int, + int, + ccl_global int *, + int, + ccl_global char *, + ccl_global unsigned int *, + unsigned int, + ccl_global float *)> + data_init_kernel; + unordered_map<string, KernelFunctions<void (*)(KernelGlobals *, KernelData *)>> split_kernels; #define KERNEL_FUNCTIONS(name) \ - KERNEL_NAME_EVAL(cpu, name), \ - KERNEL_NAME_EVAL(cpu_sse2, name), \ - KERNEL_NAME_EVAL(cpu_sse3, name), \ - KERNEL_NAME_EVAL(cpu_sse41, name), \ - KERNEL_NAME_EVAL(cpu_avx, name), \ - KERNEL_NAME_EVAL(cpu_avx2, name) - - CPUDevice(DeviceInfo& info_, Stats &stats_, Profiler &profiler_, bool background_) - : Device(info_, stats_, profiler_, background_), - texture_info(this, "__texture_info", MEM_TEXTURE), -#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name)) - REGISTER_KERNEL(path_trace), - REGISTER_KERNEL(convert_to_half_float), - REGISTER_KERNEL(convert_to_byte), - REGISTER_KERNEL(shader), - REGISTER_KERNEL(filter_divide_shadow), - 
REGISTER_KERNEL(filter_get_feature), - REGISTER_KERNEL(filter_write_feature), - REGISTER_KERNEL(filter_detect_outliers), - REGISTER_KERNEL(filter_combine_halves), - REGISTER_KERNEL(filter_nlm_calc_difference), - REGISTER_KERNEL(filter_nlm_blur), - REGISTER_KERNEL(filter_nlm_calc_weight), - REGISTER_KERNEL(filter_nlm_update_output), - REGISTER_KERNEL(filter_nlm_normalize), - REGISTER_KERNEL(filter_construct_transform), - REGISTER_KERNEL(filter_nlm_construct_gramian), - REGISTER_KERNEL(filter_finalize), - REGISTER_KERNEL(data_init) + KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \ + KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \ + KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name) + + CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) + : Device(info_, stats_, profiler_, background_), + texture_info(this, "__texture_info", MEM_TEXTURE), +#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name)) + REGISTER_KERNEL(path_trace), + REGISTER_KERNEL(convert_to_half_float), + REGISTER_KERNEL(convert_to_byte), + REGISTER_KERNEL(shader), + REGISTER_KERNEL(filter_divide_shadow), + REGISTER_KERNEL(filter_get_feature), + REGISTER_KERNEL(filter_write_feature), + REGISTER_KERNEL(filter_detect_outliers), + REGISTER_KERNEL(filter_combine_halves), + REGISTER_KERNEL(filter_nlm_calc_difference), + REGISTER_KERNEL(filter_nlm_blur), + REGISTER_KERNEL(filter_nlm_calc_weight), + REGISTER_KERNEL(filter_nlm_update_output), + REGISTER_KERNEL(filter_nlm_normalize), + REGISTER_KERNEL(filter_construct_transform), + REGISTER_KERNEL(filter_nlm_construct_gramian), + REGISTER_KERNEL(filter_finalize), + REGISTER_KERNEL(data_init) #undef REGISTER_KERNEL - { - if(info.cpu_threads == 0) { - info.cpu_threads = TaskScheduler::num_threads(); - } + { + if (info.cpu_threads == 0) { + info.cpu_threads = TaskScheduler::num_threads(); + } #ifdef WITH_OSL - kernel_globals.osl = &osl_globals; + kernel_globals.osl = 
&osl_globals; #endif - use_split_kernel = DebugFlags().cpu.split_kernel; - if(use_split_kernel) { - VLOG(1) << "Will be using split kernel."; - } - need_texture_info = false; - -#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name)) - REGISTER_SPLIT_KERNEL(path_init); - REGISTER_SPLIT_KERNEL(scene_intersect); - REGISTER_SPLIT_KERNEL(lamp_emission); - REGISTER_SPLIT_KERNEL(do_volume); - REGISTER_SPLIT_KERNEL(queue_enqueue); - REGISTER_SPLIT_KERNEL(indirect_background); - REGISTER_SPLIT_KERNEL(shader_setup); - REGISTER_SPLIT_KERNEL(shader_sort); - REGISTER_SPLIT_KERNEL(shader_eval); - REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao); - REGISTER_SPLIT_KERNEL(subsurface_scatter); - REGISTER_SPLIT_KERNEL(direct_lighting); - REGISTER_SPLIT_KERNEL(shadow_blocked_ao); - REGISTER_SPLIT_KERNEL(shadow_blocked_dl); - REGISTER_SPLIT_KERNEL(enqueue_inactive); - REGISTER_SPLIT_KERNEL(next_iteration_setup); - REGISTER_SPLIT_KERNEL(indirect_subsurface); - REGISTER_SPLIT_KERNEL(buffer_update); + use_split_kernel = DebugFlags().cpu.split_kernel; + if (use_split_kernel) { + VLOG(1) << "Will be using split kernel."; + } + need_texture_info = false; + +#define REGISTER_SPLIT_KERNEL(name) \ + split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \ + KERNEL_FUNCTIONS(name)) + REGISTER_SPLIT_KERNEL(path_init); + REGISTER_SPLIT_KERNEL(scene_intersect); + REGISTER_SPLIT_KERNEL(lamp_emission); + REGISTER_SPLIT_KERNEL(do_volume); + REGISTER_SPLIT_KERNEL(queue_enqueue); + REGISTER_SPLIT_KERNEL(indirect_background); + REGISTER_SPLIT_KERNEL(shader_setup); + REGISTER_SPLIT_KERNEL(shader_sort); + REGISTER_SPLIT_KERNEL(shader_eval); + REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao); + REGISTER_SPLIT_KERNEL(subsurface_scatter); + REGISTER_SPLIT_KERNEL(direct_lighting); + REGISTER_SPLIT_KERNEL(shadow_blocked_ao); + REGISTER_SPLIT_KERNEL(shadow_blocked_dl); 
+ REGISTER_SPLIT_KERNEL(enqueue_inactive); + REGISTER_SPLIT_KERNEL(next_iteration_setup); + REGISTER_SPLIT_KERNEL(indirect_subsurface); + REGISTER_SPLIT_KERNEL(buffer_update); #undef REGISTER_SPLIT_KERNEL #undef KERNEL_FUNCTIONS - } - - ~CPUDevice() - { - task_pool.stop(); - texture_info.free(); - } - - virtual bool show_samples() const - { - return (info.cpu_threads == 1); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const { - BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; - if(DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { - bvh_layout_mask |= BVH_LAYOUT_BVH4; - } - if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { - bvh_layout_mask |= BVH_LAYOUT_BVH8; - } + } + + ~CPUDevice() + { + task_pool.stop(); + texture_info.free(); + } + + virtual bool show_samples() const + { + return (info.cpu_threads == 1); + } + + virtual BVHLayoutMask get_bvh_layout_mask() const + { + BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; + if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { + bvh_layout_mask |= BVH_LAYOUT_BVH4; + } + if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { + bvh_layout_mask |= BVH_LAYOUT_BVH8; + } #ifdef WITH_EMBREE - bvh_layout_mask |= BVH_LAYOUT_EMBREE; -#endif /* WITH_EMBREE */ - return bvh_layout_mask; - } - - void load_texture_info() - { - if(need_texture_info) { - texture_info.copy_to_device(); - need_texture_info = false; - } - } - - void mem_alloc(device_memory& mem) - { - if(mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - } - else { - if(mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. 
(" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - if(mem.type == MEM_DEVICE_ONLY) { - assert(!mem.host_pointer); - size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; - void *data = util_aligned_malloc(mem.memory_size(), alignment); - mem.device_pointer = (device_ptr)data; - } - else { - mem.device_pointer = (device_ptr)mem.host_pointer; - } - - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - } - } - - void mem_copy_to(device_memory& mem) - { - if(mem.type == MEM_TEXTURE) { - tex_free(mem); - tex_alloc(mem); - } - else if(mem.type == MEM_PIXELS) { - assert(!"mem_copy_to not supported for pixels."); - } - else { - if(!mem.device_pointer) { - mem_alloc(mem); - } - - /* copy is no-op */ - } - } - - void mem_copy_from(device_memory& /*mem*/, - int /*y*/, int /*w*/, int /*h*/, - int /*elem*/) - { - /* no-op */ - } - - void mem_zero(device_memory& mem) - { - if(!mem.device_pointer) { - mem_alloc(mem); - } - - if(mem.device_pointer) { - memset((void*)mem.device_pointer, 0, mem.memory_size()); - } - } - - void mem_free(device_memory& mem) - { - if(mem.type == MEM_TEXTURE) { - tex_free(mem); - } - else if(mem.device_pointer) { - if(mem.type == MEM_DEVICE_ONLY) { - util_aligned_free((void*)mem.device_pointer); - } - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } - - virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/) - { - return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); - } - - void const_copy_to(const char *name, void *host, size_t size) - { - kernel_const_copy(&kernel_globals, name, host, size); - } - - void tex_alloc(device_memory& mem) - { - VLOG(1) << "Texture allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - if(mem.interpolation == INTERPOLATION_NONE) { - /* Data texture. 
*/ - kernel_tex_copy(&kernel_globals, - mem.name, - mem.host_pointer, - mem.data_size); - } - else { - /* Image Texture. */ - int flat_slot = 0; - if(string_startswith(mem.name, "__tex_image")) { - int pos = string(mem.name).rfind("_"); - flat_slot = atoi(mem.name + pos + 1); - } - else { - assert(0); - } - - if(flat_slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount - * of re-allocations. */ - texture_info.resize(flat_slot + 128); - } - - TextureInfo& info = texture_info[flat_slot]; - info.data = (uint64_t)mem.host_pointer; - info.cl_buffer = 0; - info.interpolation = mem.interpolation; - info.extension = mem.extension; - info.width = mem.data_width; - info.height = mem.data_height; - info.depth = mem.data_depth; - - need_texture_info = true; - } - - mem.device_pointer = (device_ptr)mem.host_pointer; - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - } - - void tex_free(device_memory& mem) - { - if(mem.device_pointer) { - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - need_texture_info = true; - } - } - - void *osl_memory() - { + bvh_layout_mask |= BVH_LAYOUT_EMBREE; +#endif /* WITH_EMBREE */ + return bvh_layout_mask; + } + + void load_texture_info() + { + if (need_texture_info) { + texture_info.copy_to_device(); + need_texture_info = false; + } + } + + void mem_alloc(device_memory &mem) + { + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else { + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + if (mem.type == MEM_DEVICE_ONLY) { + assert(!mem.host_pointer); + size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; + void *data = util_aligned_malloc(mem.memory_size(), alignment); + mem.device_pointer = (device_ptr)data; + } + else { + mem.device_pointer = (device_ptr)mem.host_pointer; + } + + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + } + } + + void mem_copy_to(device_memory &mem) + { + if (mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else if (mem.type == MEM_PIXELS) { + assert(!"mem_copy_to not supported for pixels."); + } + else { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + /* copy is no-op */ + } + } + + void mem_copy_from(device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) + { + /* no-op */ + } + + void mem_zero(device_memory &mem) + { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + if (mem.device_pointer) { + memset((void *)mem.device_pointer, 0, mem.memory_size()); + } + } + + void mem_free(device_memory &mem) + { + if (mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else if (mem.device_pointer) { + if (mem.type == MEM_DEVICE_ONLY) { + util_aligned_free((void *)mem.device_pointer); + } + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + } + + virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) + { + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); + } + + void const_copy_to(const char *name, void *host, size_t size) + { + kernel_const_copy(&kernel_globals, name, host, size); + } + + void tex_alloc(device_memory &mem) + { + VLOG(1) << "Texture allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + if (mem.interpolation == INTERPOLATION_NONE) { + /* Data texture. 
*/ + kernel_tex_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size); + } + else { + /* Image Texture. */ + int flat_slot = 0; + if (string_startswith(mem.name, "__tex_image")) { + int pos = string(mem.name).rfind("_"); + flat_slot = atoi(mem.name + pos + 1); + } + else { + assert(0); + } + + if (flat_slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(flat_slot + 128); + } + + TextureInfo &info = texture_info[flat_slot]; + info.data = (uint64_t)mem.host_pointer; + info.cl_buffer = 0; + info.interpolation = mem.interpolation; + info.extension = mem.extension; + info.width = mem.data_width; + info.height = mem.data_height; + info.depth = mem.data_depth; + + need_texture_info = true; + } + + mem.device_pointer = (device_ptr)mem.host_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + } + + void tex_free(device_memory &mem) + { + if (mem.device_pointer) { + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + need_texture_info = true; + } + } + + void *osl_memory() + { #ifdef WITH_OSL - return &osl_globals; + return &osl_globals; #else - return NULL; + return NULL; #endif - } - - void thread_run(DeviceTask *task) - { - if(task->type == DeviceTask::RENDER) { - thread_render(*task); - } - else if(task->type == DeviceTask::FILM_CONVERT) - thread_film_convert(*task); - else if(task->type == DeviceTask::SHADER) - thread_shader(*task); - } - - class CPUDeviceTask : public DeviceTask { - public: - CPUDeviceTask(CPUDevice *device, DeviceTask& task) - : DeviceTask(task) - { - run = function_bind(&CPUDevice::thread_run, device, this); - } - }; - - bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS); - - int4 rect = task->rect; - int r = 
task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int w = align_up(rect.z-rect.x, 4); - int h = rect.w-rect.y; - int stride = task->buffer.stride; - int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; - - float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer; - float *blurDifference = temporary_mem; - float *difference = temporary_mem + task->buffer.pass_stride; - float *weightAccum = temporary_mem + 2*task->buffer.pass_stride; - - memset(weightAccum, 0, sizeof(float)*w*h); - memset((float*) out_ptr, 0, sizeof(float)*w*h); - - for(int i = 0; i < (2*r+1)*(2*r+1); i++) { - int dy = i / (2*r+1) - r; - int dx = i % (2*r+1) - r; - - int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, dy, - (float*) guide_ptr, - (float*) variance_ptr, - NULL, - difference, - local_rect, - w, channel_offset, - 0, a, k_2); - - filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); - filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); - filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); - - filter_nlm_update_output_kernel()(dx, dy, - blurDifference, - (float*) image_ptr, - difference, - (float*) out_ptr, - weightAccum, - local_rect, - channel_offset, - stride, f); - } - - int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y}; - filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w); - - return true; - } - - bool denoising_construct_transform(DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM); - - for(int y = 0; y < task->filter_area.w; y++) { - for(int x = 0; x < task->filter_area.z; x++) { - filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, - task->tile_info, - x + task->filter_area.x, - y + task->filter_area.y, 
- y*task->filter_area.z + x, - (float*) task->storage.transform.device_pointer, - (int*) task->storage.rank.device_pointer, - &task->rect.x, - task->buffer.pass_stride, - task->buffer.frame_stride, - task->buffer.use_time, - task->radius, - task->pca_threshold); - } - } - return true; - } - - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT); - - float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer; - float *difference = temporary_mem; - float *blurDifference = temporary_mem + task->buffer.pass_stride; - - int r = task->radius; - int frame_offset = frame * task->buffer.frame_stride; - for(int i = 0; i < (2*r+1)*(2*r+1); i++) { - int dy = i / (2*r+1) - r; - int dx = i % (2*r+1) - r; - - int local_rect[4] = {max(0, -dx), max(0, -dy), - task->reconstruction_state.source_w - max(0, dx), - task->reconstruction_state.source_h - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, dy, - (float*) color_ptr, - (float*) color_variance_ptr, - (float*) scale_ptr, - difference, - local_rect, - task->buffer.stride, - task->buffer.pass_stride, - frame_offset, - 1.0f, - task->nlm_k_2); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); - filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); - filter_nlm_construct_gramian_kernel()(dx, dy, - task->tile_info->frames[frame], - blurDifference, - (float*) task->buffer.mem.device_pointer, - (float*) task->storage.transform.device_pointer, - (int*) task->storage.rank.device_pointer, - (float*) task->storage.XtWX.device_pointer, - (float3*) task->storage.XtWY.device_pointer, - local_rect, - &task->reconstruction_state.filter_window.x, - task->buffer.stride, - 4, - 
task->buffer.pass_stride, - frame_offset, - task->buffer.use_time); - } - - return true; - } - - bool denoising_solve(device_ptr output_ptr, - DenoisingTask *task) - { - for(int y = 0; y < task->filter_area.w; y++) { - for(int x = 0; x < task->filter_area.z; x++) { - filter_finalize_kernel()(x, - y, - y*task->filter_area.z + x, - (float*) output_ptr, - (int*) task->storage.rank.device_pointer, - (float*) task->storage.XtWX.device_pointer, - (float3*) task->storage.XtWY.device_pointer, - &task->reconstruction_state.buffer_params.x, - task->render_buffer.samples); - } - } - return true; - } - - bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, - device_ptr mean_ptr, device_ptr variance_ptr, - int r, int4 rect, DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES); - - for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - filter_combine_halves_kernel()(x, y, - (float*) mean_ptr, - (float*) variance_ptr, - (float*) a_ptr, - (float*) b_ptr, - &rect.x, - r); - } - } - return true; - } - - bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, - device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW); - - for(int y = task->rect.y; y < task->rect.w; y++) { - for(int x = task->rect.x; x < task->rect.z; x++) { - filter_divide_shadow_kernel()(task->render_buffer.samples, - task->tile_info, - x, y, - (float*) a_ptr, - (float*) b_ptr, - (float*) sample_variance_ptr, - (float*) sv_variance_ptr, - (float*) buffer_variance_ptr, - &task->rect.x, - task->render_buffer.pass_stride, - task->render_buffer.offset); - } - } - return true; - } - - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, 
PROFILING_DENOISING_GET_FEATURE); - - for(int y = task->rect.y; y < task->rect.w; y++) { - for(int x = task->rect.x; x < task->rect.z; x++) { - filter_get_feature_kernel()(task->render_buffer.samples, - task->tile_info, - mean_offset, - variance_offset, - x, y, - (float*) mean_ptr, - (float*) variance_ptr, - scale, - &task->rect.x, - task->render_buffer.pass_stride, - task->render_buffer.offset); - } - } - return true; - } - - bool denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) - { - for(int y = 0; y < task->filter_area.w; y++) { - for(int x = 0; x < task->filter_area.z; x++) { - filter_write_feature_kernel()(task->render_buffer.samples, - x + task->filter_area.x, - y + task->filter_area.y, - &task->reconstruction_state.buffer_params.x, - (float*) from_ptr, - (float*) buffer_ptr, - out_offset, - &task->rect.x); - } - } - return true; - } - - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS); - - for(int y = task->rect.y; y < task->rect.w; y++) { - for(int x = task->rect.x; x < task->rect.z; x++) { - filter_detect_outliers_kernel()(x, y, - (float*) image_ptr, - (float*) variance_ptr, - (float*) depth_ptr, - (float*) output_ptr, - &task->rect.x, - task->buffer.pass_stride); - } - } - return true; - } - - void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) - { - const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; - - scoped_timer timer(&tile.buffers->render_time); - - Coverage coverage(kg, tile); - if(use_coverage) { - coverage.init_path_trace(); - } - - float *render_buffer = (float*)tile.buffer; - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - /* Needed for Embree. 
*/ - SIMD_SET_FLUSH_TO_ZERO; - - for(int sample = start_sample; sample < end_sample; sample++) { - if(task.get_cancel() || task_pool.canceled()) { - if(task.need_finish_queue == false) - break; - } - - for(int y = tile.y; y < tile.y + tile.h; y++) { - for(int x = tile.x; x < tile.x + tile.w; x++) { - if(use_coverage) { - coverage.init_pixel(x, y); - } - path_trace_kernel()(kg, render_buffer, - sample, x, y, tile.offset, tile.stride); - } - } - - tile.sample = sample + 1; - - task.update_progress(&tile, tile.w*tile.h); - } - if(use_coverage) { - coverage.finalize(); - } - } - - void denoise(DenoisingTask& denoising, RenderTile &tile) - { - ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); - - tile.sample = tile.start_sample + tile.num_samples; - - denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind(&CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); - denoising.render_buffer.samples = tile.sample; - 
denoising.buffer.gpu_temporary_mem = false; - - denoising.run_denoising(&tile); - } - - void thread_render(DeviceTask& task) - { - if(task_pool.canceled()) { - if(task.need_finish_queue == false) - return; - } - - /* allocate buffer for kernel globals */ - device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals"); - kgbuffer.alloc_to_device(1); - - KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init()); - - profiler.add_state(&kg->profiler); - - CPUSplitKernel *split_kernel = NULL; - if(use_split_kernel) { - split_kernel = new CPUSplitKernel(this); - if(!split_kernel->load_kernels(requested_features)) { - thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); - kgbuffer.free(); - delete split_kernel; - return; - } - } - - RenderTile tile; - DenoisingTask denoising(this, task); - denoising.profiler = &kg->profiler; - - while(task.acquire_tile(this, tile)) { - if(tile.task == RenderTile::PATH_TRACE) { - if(use_split_kernel) { - device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(&task, tile, kgbuffer, void_buffer); - } - else { - path_trace(task, tile, kg); - } - } - else if(tile.task == RenderTile::DENOISE) { - denoise(denoising, tile); - task.update_progress(&tile, tile.w*tile.h); - } - - task.release_tile(tile); - - if(task_pool.canceled()) { - if(task.need_finish_queue == false) - break; - } - } - - profiler.remove_state(&kg->profiler); - - thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); - kg->~KernelGlobals(); - kgbuffer.free(); - delete split_kernel; - } - - void thread_film_convert(DeviceTask& task) - { - float sample_scale = 1.0f/(task.sample + 1); - - if(task.rgba_half) { - for(int y = task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); - } - else { - for(int y = 
task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); - - } - } - - void thread_shader(DeviceTask& task) - { - KernelGlobals kg = kernel_globals; + } + + void thread_run(DeviceTask *task) + { + if (task->type == DeviceTask::RENDER) { + thread_render(*task); + } + else if (task->type == DeviceTask::FILM_CONVERT) + thread_film_convert(*task); + else if (task->type == DeviceTask::SHADER) + thread_shader(*task); + } + + class CPUDeviceTask : public DeviceTask { + public: + CPUDeviceTask(CPUDevice *device, DeviceTask &task) : DeviceTask(task) + { + run = function_bind(&CPUDevice::thread_run, device, this); + } + }; + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task) + { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS); + + int4 rect = task->rect; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int w = align_up(rect.z - rect.x, 4); + int h = rect.w - rect.y; + int stride = task->buffer.stride; + int channel_offset = task->nlm_state.is_color ? 
task->buffer.pass_stride : 0; + + float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer; + float *blurDifference = temporary_mem; + float *difference = temporary_mem + task->buffer.pass_stride; + float *weightAccum = temporary_mem + 2 * task->buffer.pass_stride; + + memset(weightAccum, 0, sizeof(float) * w * h); + memset((float *)out_ptr, 0, sizeof(float) * w * h); + + for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) { + int dy = i / (2 * r + 1) - r; + int dx = i % (2 * r + 1) - r; + + int local_rect[4] = { + max(0, -dx), max(0, -dy), rect.z - rect.x - max(0, dx), rect.w - rect.y - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, + dy, + (float *)guide_ptr, + (float *)variance_ptr, + NULL, + difference, + local_rect, + w, + channel_offset, + 0, + a, + k_2); + + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); + + filter_nlm_update_output_kernel()(dx, + dy, + blurDifference, + (float *)image_ptr, + difference, + (float *)out_ptr, + weightAccum, + local_rect, + channel_offset, + stride, + f); + } + + int local_rect[4] = {0, 0, rect.z - rect.x, rect.w - rect.y}; + filter_nlm_normalize_kernel()((float *)out_ptr, weightAccum, local_rect, w); + + return true; + } + + bool denoising_construct_transform(DenoisingTask *task) + { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM); + + for (int y = 0; y < task->filter_area.w; y++) { + for (int x = 0; x < task->filter_area.z; x++) { + filter_construct_transform_kernel()((float *)task->buffer.mem.device_pointer, + task->tile_info, + x + task->filter_area.x, + y + task->filter_area.y, + y * task->filter_area.z + x, + (float *)task->storage.transform.device_pointer, + (int *)task->storage.rank.device_pointer, + &task->rect.x, + task->buffer.pass_stride, + task->buffer.frame_stride, + 
task->buffer.use_time, + task->radius, + task->pca_threshold); + } + } + return true; + } + + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task) + { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT); + + float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer; + float *difference = temporary_mem; + float *blurDifference = temporary_mem + task->buffer.pass_stride; + + int r = task->radius; + int frame_offset = frame * task->buffer.frame_stride; + for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) { + int dy = i / (2 * r + 1) - r; + int dx = i % (2 * r + 1) - r; + + int local_rect[4] = {max(0, -dx), + max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, + dy, + (float *)color_ptr, + (float *)color_variance_ptr, + (float *)scale_ptr, + difference, + local_rect, + task->buffer.stride, + task->buffer.pass_stride, + frame_offset, + 1.0f, + task->nlm_k_2); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); + filter_nlm_calc_weight_kernel()( + blurDifference, difference, local_rect, task->buffer.stride, 4); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); + filter_nlm_construct_gramian_kernel()(dx, + dy, + task->tile_info->frames[frame], + blurDifference, + (float *)task->buffer.mem.device_pointer, + (float *)task->storage.transform.device_pointer, + (int *)task->storage.rank.device_pointer, + (float *)task->storage.XtWX.device_pointer, + (float3 *)task->storage.XtWY.device_pointer, + local_rect, + &task->reconstruction_state.filter_window.x, + task->buffer.stride, + 4, + task->buffer.pass_stride, + frame_offset, + task->buffer.use_time); + } + + return true; + } + + bool denoising_solve(device_ptr output_ptr, DenoisingTask *task) + { + for (int y = 0; 
y < task->filter_area.w; y++) { + for (int x = 0; x < task->filter_area.z; x++) { + filter_finalize_kernel()(x, + y, + y * task->filter_area.z + x, + (float *)output_ptr, + (int *)task->storage.rank.device_pointer, + (float *)task->storage.XtWX.device_pointer, + (float3 *)task->storage.XtWY.device_pointer, + &task->reconstruction_state.buffer_params.x, + task->render_buffer.samples); + } + } + return true; + } + + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect, + DenoisingTask *task) + { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES); + + for (int y = rect.y; y < rect.w; y++) { + for (int x = rect.x; x < rect.z; x++) { + filter_combine_halves_kernel()(x, + y, + (float *)mean_ptr, + (float *)variance_ptr, + (float *)a_ptr, + (float *)b_ptr, + &rect.x, + r); + } + } + return true; + } + + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task) + { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW); + + for (int y = task->rect.y; y < task->rect.w; y++) { + for (int x = task->rect.x; x < task->rect.z; x++) { + filter_divide_shadow_kernel()(task->render_buffer.samples, + task->tile_info, + x, + y, + (float *)a_ptr, + (float *)b_ptr, + (float *)sample_variance_ptr, + (float *)sv_variance_ptr, + (float *)buffer_variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.offset); + } + } + return true; + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale, + DenoisingTask *task) + { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE); + + for (int y = task->rect.y; y < task->rect.w; y++) { + for (int x = task->rect.x; x < task->rect.z; x++) { + 
filter_get_feature_kernel()(task->render_buffer.samples, + task->tile_info, + mean_offset, + variance_offset, + x, + y, + (float *)mean_ptr, + (float *)variance_ptr, + scale, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.offset); + } + } + return true; + } + + bool denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) + { + for (int y = 0; y < task->filter_area.w; y++) { + for (int x = 0; x < task->filter_area.z; x++) { + filter_write_feature_kernel()(task->render_buffer.samples, + x + task->filter_area.x, + y + task->filter_area.y, + &task->reconstruction_state.buffer_params.x, + (float *)from_ptr, + (float *)buffer_ptr, + out_offset, + &task->rect.x); + } + } + return true; + } + + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS); + + for (int y = task->rect.y; y < task->rect.w; y++) { + for (int x = task->rect.x; x < task->rect.z; x++) { + filter_detect_outliers_kernel()(x, + y, + (float *)image_ptr, + (float *)variance_ptr, + (float *)depth_ptr, + (float *)output_ptr, + &task->rect.x, + task->buffer.pass_stride); + } + } + return true; + } + + void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) + { + const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; + + scoped_timer timer(&tile.buffers->render_time); + + Coverage coverage(kg, tile); + if (use_coverage) { + coverage.init_path_trace(); + } + + float *render_buffer = (float *)tile.buffer; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + + /* Needed for Embree. 
*/ + SIMD_SET_FLUSH_TO_ZERO; + + for (int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || task_pool.canceled()) { + if (task.need_finish_queue == false) + break; + } + + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + if (use_coverage) { + coverage.init_pixel(x, y); + } + path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); + } + } + + tile.sample = sample + 1; + + task.update_progress(&tile, tile.w * tile.h); + } + if (use_coverage) { + coverage.finalize(); + } + } + + void denoise(DenoisingTask &denoising, RenderTile &tile) + { + ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); + + tile.sample = tile.start_sample + tile.num_samples; + + denoising.functions.construct_transform = function_bind( + &CPUDevice::denoising_construct_transform, this, &denoising); + denoising.functions.accumulate = function_bind( + &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising); + denoising.functions.divide_shadow = function_bind( + &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind( + &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind( + &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind( + &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind( + &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising); + denoising.functions.detect_outliers = function_bind( + &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); + 
denoising.render_buffer.samples = tile.sample; + denoising.buffer.gpu_temporary_mem = false; + + denoising.run_denoising(&tile); + } + + void thread_render(DeviceTask &task) + { + if (task_pool.canceled()) { + if (task.need_finish_queue == false) + return; + } + + /* allocate buffer for kernel globals */ + device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals"); + kgbuffer.alloc_to_device(1); + + KernelGlobals *kg = new ((void *)kgbuffer.device_pointer) + KernelGlobals(thread_kernel_globals_init()); + + profiler.add_state(&kg->profiler); + + CPUSplitKernel *split_kernel = NULL; + if (use_split_kernel) { + split_kernel = new CPUSplitKernel(this); + if (!split_kernel->load_kernels(requested_features)) { + thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); + kgbuffer.free(); + delete split_kernel; + return; + } + } + + RenderTile tile; + DenoisingTask denoising(this, task); + denoising.profiler = &kg->profiler; + + while (task.acquire_tile(this, tile)) { + if (tile.task == RenderTile::PATH_TRACE) { + if (use_split_kernel) { + device_only_memory<uchar> void_buffer(this, "void_buffer"); + split_kernel->path_trace(&task, tile, kgbuffer, void_buffer); + } + else { + path_trace(task, tile, kg); + } + } + else if (tile.task == RenderTile::DENOISE) { + denoise(denoising, tile); + task.update_progress(&tile, tile.w * tile.h); + } + + task.release_tile(tile); + + if (task_pool.canceled()) { + if (task.need_finish_queue == false) + break; + } + } + + profiler.remove_state(&kg->profiler); + + thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); + kg->~KernelGlobals(); + kgbuffer.free(); + delete split_kernel; + } + + void thread_film_convert(DeviceTask &task) + { + float sample_scale = 1.0f / (task.sample + 1); + + if (task.rgba_half) { + for (int y = task.y; y < task.y + task.h; y++) + for (int x = task.x; x < task.x + task.w; x++) + convert_to_half_float_kernel()(&kernel_globals, + (uchar4 *)task.rgba_half, + (float *)task.buffer, 
+ sample_scale, + x, + y, + task.offset, + task.stride); + } + else { + for (int y = task.y; y < task.y + task.h; y++) + for (int x = task.x; x < task.x + task.w; x++) + convert_to_byte_kernel()(&kernel_globals, + (uchar4 *)task.rgba_byte, + (float *)task.buffer, + sample_scale, + x, + y, + task.offset, + task.stride); + } + } + + void thread_shader(DeviceTask &task) + { + KernelGlobals kg = kernel_globals; #ifdef WITH_OSL - OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); + OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif - for(int sample = 0; sample < task.num_samples; sample++) { - for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) - shader_kernel()(&kg, - (uint4*)task.shader_input, - (float4*)task.shader_output, - task.shader_eval_type, - task.shader_filter, - x, - task.offset, - sample); - - if(task.get_cancel() || task_pool.canceled()) - break; - - task.update_progress(NULL); - - } + for (int sample = 0; sample < task.num_samples; sample++) { + for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++) + shader_kernel()(&kg, + (uint4 *)task.shader_input, + (float4 *)task.shader_output, + task.shader_eval_type, + task.shader_filter, + x, + task.offset, + sample); + + if (task.get_cancel() || task_pool.canceled()) + break; + + task.update_progress(NULL); + } #ifdef WITH_OSL - OSLShader::thread_free(&kg); + OSLShader::thread_free(&kg); #endif - } - - int get_split_task_count(DeviceTask& task) - { - if(task.type == DeviceTask::SHADER) - return task.get_subtask_count(info.cpu_threads, 256); - else - return task.get_subtask_count(info.cpu_threads); - } - - void task_add(DeviceTask& task) - { - /* Load texture info. 
*/ - load_texture_info(); - - /* split task into smaller ones */ - list<DeviceTask> tasks; - - if(task.type == DeviceTask::SHADER) - task.split(tasks, info.cpu_threads, 256); - else - task.split(tasks, info.cpu_threads); - - foreach(DeviceTask& task, tasks) - task_pool.push(new CPUDeviceTask(this, task)); - } - - void task_wait() - { - task_pool.wait_work(); - } - - void task_cancel() - { - task_pool.cancel(); - } - -protected: - inline KernelGlobals thread_kernel_globals_init() - { - KernelGlobals kg = kernel_globals; - kg.transparent_shadow_intersections = NULL; - const int decoupled_count = sizeof(kg.decoupled_volume_steps) / - sizeof(*kg.decoupled_volume_steps); - for(int i = 0; i < decoupled_count; ++i) { - kg.decoupled_volume_steps[i] = NULL; - } - kg.decoupled_volume_steps_index = 0; - kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL; + } + + int get_split_task_count(DeviceTask &task) + { + if (task.type == DeviceTask::SHADER) + return task.get_subtask_count(info.cpu_threads, 256); + else + return task.get_subtask_count(info.cpu_threads); + } + + void task_add(DeviceTask &task) + { + /* Load texture info. 
*/ + load_texture_info(); + + /* split task into smaller ones */ + list<DeviceTask> tasks; + + if (task.type == DeviceTask::SHADER) + task.split(tasks, info.cpu_threads, 256); + else + task.split(tasks, info.cpu_threads); + + foreach (DeviceTask &task, tasks) + task_pool.push(new CPUDeviceTask(this, task)); + } + + void task_wait() + { + task_pool.wait_work(); + } + + void task_cancel() + { + task_pool.cancel(); + } + + protected: + inline KernelGlobals thread_kernel_globals_init() + { + KernelGlobals kg = kernel_globals; + kg.transparent_shadow_intersections = NULL; + const int decoupled_count = sizeof(kg.decoupled_volume_steps) / + sizeof(*kg.decoupled_volume_steps); + for (int i = 0; i < decoupled_count; ++i) { + kg.decoupled_volume_steps[i] = NULL; + } + kg.decoupled_volume_steps_index = 0; + kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL; #ifdef WITH_OSL - OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); + OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif - return kg; - } - - inline void thread_kernel_globals_free(KernelGlobals *kg) - { - if(kg == NULL) { - return; - } - - if(kg->transparent_shadow_intersections != NULL) { - free(kg->transparent_shadow_intersections); - } - const int decoupled_count = sizeof(kg->decoupled_volume_steps) / - sizeof(*kg->decoupled_volume_steps); - for(int i = 0; i < decoupled_count; ++i) { - if(kg->decoupled_volume_steps[i] != NULL) { - free(kg->decoupled_volume_steps[i]); - } - } + return kg; + } + + inline void thread_kernel_globals_free(KernelGlobals *kg) + { + if (kg == NULL) { + return; + } + + if (kg->transparent_shadow_intersections != NULL) { + free(kg->transparent_shadow_intersections); + } + const int decoupled_count = sizeof(kg->decoupled_volume_steps) / + sizeof(*kg->decoupled_volume_steps); + for (int i = 0; i < decoupled_count; ++i) { + if (kg->decoupled_volume_steps[i] != NULL) { + free(kg->decoupled_volume_steps[i]); + } + } #ifdef WITH_OSL - 
OSLShader::thread_free(kg); + OSLShader::thread_free(kg); #endif - } + } - virtual bool load_kernels(const DeviceRequestedFeatures& requested_features_) { - requested_features = requested_features_; + virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_) + { + requested_features = requested_features_; - return true; - } + return true; + } }; /* split kernel */ class CPUSplitKernelFunction : public SplitKernelFunction { -public: - CPUDevice* device; - void (*func)(KernelGlobals *kg, KernelData *data); - - CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {} - ~CPUSplitKernelFunction() {} - - virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data) - { - if(!func) { - return false; - } - - KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; - kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); - - for(int y = 0; y < dim.global_size[1]; y++) { - for(int x = 0; x < dim.global_size[0]; x++) { - kg->global_id = make_int2(x, y); - - func(kg, (KernelData*)data.device_pointer); - } - } - - return true; - } + public: + CPUDevice *device; + void (*func)(KernelGlobals *kg, KernelData *data); + + CPUSplitKernelFunction(CPUDevice *device) : device(device), func(NULL) + { + } + ~CPUSplitKernelFunction() + { + } + + virtual bool enqueue(const KernelDimensions &dim, + device_memory &kernel_globals, + device_memory &data) + { + if (!func) { + return false; + } + + KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; + kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); + + for (int y = 0; y < dim.global_size[1]; y++) { + for (int x = 0; x < dim.global_size[0]; x++) { + kg->global_id = make_int2(x, y); + + func(kg, (KernelData *)data.device_pointer); + } + } + + return true; + } }; CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device) { } -bool 
CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, +bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, int num_global_elements, - device_memory& kernel_globals, - device_memory& data, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flags, - device_memory& work_pool_wgs) + device_memory &kernel_globals, + device_memory &data, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flags, + device_memory &work_pool_wgs) { - KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; - kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); - - for(int y = 0; y < dim.global_size[1]; y++) { - for(int x = 0; x < dim.global_size[0]; x++) { - kg->global_id = make_int2(x, y); - - device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer, - (KernelData*)data.device_pointer, - (void*)split_data.device_pointer, - num_global_elements, - (char*)ray_state.device_pointer, - rtile.start_sample, - rtile.start_sample + rtile.num_samples, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - (int*)queue_index.device_pointer, - dim.global_size[0] * dim.global_size[1], - (char*)use_queues_flags.device_pointer, - (uint*)work_pool_wgs.device_pointer, - rtile.num_samples, - (float*)rtile.buffer); - } - } - - return true; + KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; + kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); + + for (int y = 0; y < dim.global_size[1]; y++) { + for (int x = 0; x < dim.global_size[0]; x++) { + kg->global_id = make_int2(x, y); + + device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer, + (KernelData *)data.device_pointer, + (void *)split_data.device_pointer, + num_global_elements, + (char *)ray_state.device_pointer, + 
rtile.start_sample, + rtile.start_sample + rtile.num_samples, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + (int *)queue_index.device_pointer, + dim.global_size[0] * dim.global_size[1], + (char *)use_queues_flags.device_pointer, + (uint *)work_pool_wgs.device_pointer, + rtile.num_samples, + (float *)rtile.buffer); + } + } + + return true; } -SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures&) +SplitKernelFunction *CPUSplitKernel::get_split_kernel_function(const string &kernel_name, + const DeviceRequestedFeatures &) { - CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); + CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); - kernel->func = device->split_kernels[kernel_name](); - if(!kernel->func) { - delete kernel; - return NULL; - } + kernel->func = device->split_kernels[kernel_name](); + if (!kernel->func) { + delete kernel; + return NULL; + } - return kernel; + return kernel; } int2 CPUSplitKernel::split_kernel_local_size() { - return make_int2(1, 1); + return make_int2(1, 1); } -int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) { - return make_int2(1, 1); +int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/, + device_memory & /*data*/, + DeviceTask * /*task*/) +{ + return make_int2(1, 1); } -uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) { - KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; +uint64_t CPUSplitKernel::state_buffer_size(device_memory &kernel_globals, + device_memory & /*data*/, + size_t num_threads) +{ + KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; - return split_data_buffer_size(kg, num_threads); + return split_data_buffer_size(kg, num_threads); } -Device *device_cpu_create(DeviceInfo& info, Stats &stats, Profiler 
&profiler, bool background) +Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) { - return new CPUDevice(info, stats, profiler, background); + return new CPUDevice(info, stats, profiler, background); } -void device_cpu_info(vector<DeviceInfo>& devices) +void device_cpu_info(vector<DeviceInfo> &devices) { - DeviceInfo info; - - info.type = DEVICE_CPU; - info.description = system_cpu_brand_string(); - info.id = "CPU"; - info.num = 0; - info.has_volume_decoupled = true; - info.has_osl = true; - info.has_half_images = true; - info.has_profiling = true; - - devices.insert(devices.begin(), info); + DeviceInfo info; + + info.type = DEVICE_CPU; + info.description = system_cpu_brand_string(); + info.id = "CPU"; + info.num = 0; + info.has_volume_decoupled = true; + info.has_osl = true; + info.has_half_images = true; + info.has_profiling = true; + + devices.insert(devices.begin(), info); } string device_cpu_capabilities() { - string capabilities = ""; - capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; - capabilities += system_cpu_support_sse3() ? "SSE3 " : ""; - capabilities += system_cpu_support_sse41() ? "SSE41 " : ""; - capabilities += system_cpu_support_avx() ? "AVX " : ""; - capabilities += system_cpu_support_avx2() ? "AVX2" : ""; - if(capabilities[capabilities.size() - 1] == ' ') - capabilities.resize(capabilities.size() - 1); - return capabilities; + string capabilities = ""; + capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; + capabilities += system_cpu_support_sse3() ? "SSE3 " : ""; + capabilities += system_cpu_support_sse41() ? "SSE41 " : ""; + capabilities += system_cpu_support_avx() ? "AVX " : ""; + capabilities += system_cpu_support_avx2() ? 
"AVX2" : ""; + if (capabilities[capabilities.size() - 1] == ' ') + capabilities.resize(capabilities.size() - 1); + return capabilities; } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 3aa6bce155e..68bc3bd4045 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -62,2144 +62,2242 @@ namespace { const char *cuewErrorString(CUresult result) { - /* We can only give error code here without major code duplication, that - * should be enough since dynamic loading is only being disabled by folks - * who knows what they're doing anyway. - * - * NOTE: Avoid call from several threads. - */ - static string error; - error = string_printf("%d", result); - return error.c_str(); + /* We can only give error code here without major code duplication, that + * should be enough since dynamic loading is only being disabled by folks + * who knows what they're doing anyway. + * + * NOTE: Avoid call from several threads. 
+ */ + static string error; + error = string_printf("%d", result); + return error.c_str(); } const char *cuewCompilerPath() { - return CYCLES_CUDA_NVCC_EXECUTABLE; + return CYCLES_CUDA_NVCC_EXECUTABLE; } int cuewCompilerVersion() { - return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); + return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); } -} /* namespace */ -#endif /* WITH_CUDA_DYNLOAD */ +} /* namespace */ +#endif /* WITH_CUDA_DYNLOAD */ class CUDADevice; class CUDASplitKernel : public DeviceSplitKernel { - CUDADevice *device; -public: - explicit CUDASplitKernel(CUDADevice *device); - - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data_, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs); - - virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures&); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); + CUDADevice *device; + + public: + explicit CUDASplitKernel(CUDADevice *device); + + virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory &kernel_globals, + device_memory &kernel_data_, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs); + + virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, + const DeviceRequestedFeatures &); + virtual int2 split_kernel_local_size(); + 
virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task); }; /* Utility to push/pop CUDA context. */ class CUDAContextScope { -public: - CUDAContextScope(CUDADevice *device); - ~CUDAContextScope(); + public: + CUDAContextScope(CUDADevice *device); + ~CUDAContextScope(); -private: - CUDADevice *device; + private: + CUDADevice *device; }; -class CUDADevice : public Device -{ -public: - DedicatedTaskPool task_pool; - CUdevice cuDevice; - CUcontext cuContext; - CUmodule cuModule, cuFilterModule; - size_t device_texture_headroom; - size_t device_working_headroom; - bool move_texture_to_host; - size_t map_host_used; - size_t map_host_limit; - int can_map_host; - int cuDevId; - int cuDevArchitecture; - bool first_error; - CUDASplitKernel *split_kernel; - - struct CUDAMem { - CUDAMem() - : texobject(0), array(0), map_host_pointer(0), free_map_host(false) {} - - CUtexObject texobject; - CUarray array; - void *map_host_pointer; - bool free_map_host; - }; - typedef map<device_memory*, CUDAMem> CUDAMemMap; - CUDAMemMap cuda_mem_map; - - struct PixelMem { - GLuint cuPBO; - CUgraphicsResource cuPBOresource; - GLuint cuTexId; - int w, h; - }; - map<device_ptr, PixelMem> pixel_mem_map; - - /* Bindless Textures */ - device_vector<TextureInfo> texture_info; - bool need_texture_info; - - CUdeviceptr cuda_device_ptr(device_ptr mem) - { - return (CUdeviceptr)mem; - } - - static bool have_precompiled_kernels() - { - string cubins_path = path_get("lib"); - return path_exists(cubins_path); - } - - virtual bool show_samples() const - { - /* The CUDADevice only processes one tile at a time, so showing samples is fine. 
*/ - return true; - } - - virtual BVHLayoutMask get_bvh_layout_mask() const { - return BVH_LAYOUT_BVH2; - } - -/*#ifdef NDEBUG +class CUDADevice : public Device { + public: + DedicatedTaskPool task_pool; + CUdevice cuDevice; + CUcontext cuContext; + CUmodule cuModule, cuFilterModule; + size_t device_texture_headroom; + size_t device_working_headroom; + bool move_texture_to_host; + size_t map_host_used; + size_t map_host_limit; + int can_map_host; + int cuDevId; + int cuDevArchitecture; + bool first_error; + CUDASplitKernel *split_kernel; + + struct CUDAMem { + CUDAMem() : texobject(0), array(0), map_host_pointer(0), free_map_host(false) + { + } + + CUtexObject texobject; + CUarray array; + void *map_host_pointer; + bool free_map_host; + }; + typedef map<device_memory *, CUDAMem> CUDAMemMap; + CUDAMemMap cuda_mem_map; + + struct PixelMem { + GLuint cuPBO; + CUgraphicsResource cuPBOresource; + GLuint cuTexId; + int w, h; + }; + map<device_ptr, PixelMem> pixel_mem_map; + + /* Bindless Textures */ + device_vector<TextureInfo> texture_info; + bool need_texture_info; + + CUdeviceptr cuda_device_ptr(device_ptr mem) + { + return (CUdeviceptr)mem; + } + + static bool have_precompiled_kernels() + { + string cubins_path = path_get("lib"); + return path_exists(cubins_path); + } + + virtual bool show_samples() const + { + /* The CUDADevice only processes one tile at a time, so showing samples is fine. 
*/ + return true; + } + + virtual BVHLayoutMask get_bvh_layout_mask() const + { + return BVH_LAYOUT_BVH2; + } + + /*#ifdef NDEBUG #define cuda_abort() #else #define cuda_abort() abort() #endif*/ - void cuda_error_documentation() - { - if(first_error) { - fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); - fprintf(stderr, "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n"); - first_error = false; - } - } + void cuda_error_documentation() + { + if (first_error) { + fprintf(stderr, + "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); + fprintf(stderr, + "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n"); + first_error = false; + } + } #define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ - \ - if(result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ - if(error_msg == "") \ - error_msg = message; \ - fprintf(stderr, "%s\n", message.c_str()); \ - /*cuda_abort();*/ \ - cuda_error_documentation(); \ - } \ - } (void) 0 - - bool cuda_error_(CUresult result, const string& stmt) - { - if(result == CUDA_SUCCESS) - return false; - - string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result)); - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - cuda_error_documentation(); - return true; - } + { \ + CUresult result = stmt; \ +\ + if (result != CUDA_SUCCESS) { \ + string message = string_printf( \ + "CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ + if (error_msg == "") \ + error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + /*cuda_abort();*/ \ + cuda_error_documentation(); \ + } \ + } \ + (void)0 + + bool cuda_error_(CUresult result, const string &stmt) + { + if (result == CUDA_SUCCESS) + return false; + + string message = string_printf("CUDA 
error at %s: %s", stmt.c_str(), cuewErrorString(result)); + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); + cuda_error_documentation(); + return true; + } #define cuda_error(stmt) cuda_error_(stmt, #stmt) - void cuda_error_message(const string& message) - { - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - cuda_error_documentation(); - } - - CUDADevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_) - : Device(info, stats, profiler, background_), - texture_info(this, "__texture_info", MEM_TEXTURE) - { - first_error = true; - background = background_; - - cuDevId = info.num; - cuDevice = 0; - cuContext = 0; - - cuModule = 0; - cuFilterModule = 0; - - split_kernel = NULL; - - need_texture_info = false; - - device_texture_headroom = 0; - device_working_headroom = 0; - move_texture_to_host = false; - map_host_limit = 0; - map_host_used = 0; - can_map_host = 0; - - /* Intialize CUDA. */ - if(cuda_error(cuInit(0))) - return; - - /* Setup device and context. */ - if(cuda_error(cuDeviceGet(&cuDevice, cuDevId))) - return; - - /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. - * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, - * so we can predict which memory to map to host. */ - cuda_assert(cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); - - unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; - if(can_map_host) { - ctx_flags |= CU_CTX_MAP_HOST; - init_host_memory(); - } - - /* Create context. 
*/ - CUresult result; - - if(background) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - } - else { - result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice); - - if(result != CUDA_SUCCESS) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - background = true; - } - } - - if(cuda_error_(result, "cuCtxCreate")) - return; - - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - cuDevArchitecture = major*100 + minor*10; - - /* Pop context set by cuCtxCreate. */ - cuCtxPopCurrent(NULL); - } - - ~CUDADevice() - { - task_pool.stop(); - - delete split_kernel; - - texture_info.free(); - - cuda_assert(cuCtxDestroy(cuContext)); - } - - bool support_device(const DeviceRequestedFeatures& /*requested_features*/) - { - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* We only support sm_30 and above */ - if(major < 3) { - cuda_error_message(string_printf("CUDA device supported only with compute capability 3.0 or up, found %d.%d.", major, minor)); - return false; - } - - return true; - } - - bool use_adaptive_compilation() - { - return DebugFlags().cuda.adaptive_compile; - } - - bool use_split_kernel() - { - return DebugFlags().cuda.split_kernel; - } - - /* Common NVCC flags which stays the same regardless of shading model, - * kernel sources md5 and only depends on compiler or compilation settings. 
- */ - string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures& requested_features, - bool filter=false, bool split=false) - { - const int machine = system_cpu_bits(); - const string source_path = path_get("source"); - const string include_path = source_path; - string cflags = string_printf("-m%d " - "--ptxas-options=\"-v\" " - "--use_fast_math " - "-DNVCC " - "-I\"%s\"", - machine, - include_path.c_str()); - if(!filter && use_adaptive_compilation()) { - cflags += " " + requested_features.get_build_options(); - } - const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); - if(extra_cflags) { - cflags += string(" ") + string(extra_cflags); - } + void cuda_error_message(const string &message) + { + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); + cuda_error_documentation(); + } + + CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) + : Device(info, stats, profiler, background_), + texture_info(this, "__texture_info", MEM_TEXTURE) + { + first_error = true; + background = background_; + + cuDevId = info.num; + cuDevice = 0; + cuContext = 0; + + cuModule = 0; + cuFilterModule = 0; + + split_kernel = NULL; + + need_texture_info = false; + + device_texture_headroom = 0; + device_working_headroom = 0; + move_texture_to_host = false; + map_host_limit = 0; + map_host_used = 0; + can_map_host = 0; + + /* Intialize CUDA. */ + if (cuda_error(cuInit(0))) + return; + + /* Setup device and context. */ + if (cuda_error(cuDeviceGet(&cuDevice, cuDevId))) + return; + + /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. + * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, + * so we can predict which memory to map to host. 
*/ + cuda_assert( + cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); + + unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; + if (can_map_host) { + ctx_flags |= CU_CTX_MAP_HOST; + init_host_memory(); + } + + /* Create context. */ + CUresult result; + + if (background) { + result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); + } + else { + result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice); + + if (result != CUDA_SUCCESS) { + result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); + background = true; + } + } + + if (cuda_error_(result, "cuCtxCreate")) + return; + + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + cuDevArchitecture = major * 100 + minor * 10; + + /* Pop context set by cuCtxCreate. */ + cuCtxPopCurrent(NULL); + } + + ~CUDADevice() + { + task_pool.stop(); + + delete split_kernel; + + texture_info.free(); + + cuda_assert(cuCtxDestroy(cuContext)); + } + + bool support_device(const DeviceRequestedFeatures & /*requested_features*/) + { + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* We only support sm_30 and above */ + if (major < 3) { + cuda_error_message(string_printf( + "CUDA device supported only with compute capability 3.0 or up, found %d.%d.", + major, + minor)); + return false; + } + + return true; + } + + bool use_adaptive_compilation() + { + return DebugFlags().cuda.adaptive_compile; + } + + bool use_split_kernel() + { + return DebugFlags().cuda.split_kernel; + } + + /* Common NVCC flags which stays the same regardless of shading model, + * kernel sources md5 and only depends on compiler or compilation settings. 
+ */ + string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features, + bool filter = false, + bool split = false) + { + const int machine = system_cpu_bits(); + const string source_path = path_get("source"); + const string include_path = source_path; + string cflags = string_printf( + "-m%d " + "--ptxas-options=\"-v\" " + "--use_fast_math " + "-DNVCC " + "-I\"%s\"", + machine, + include_path.c_str()); + if (!filter && use_adaptive_compilation()) { + cflags += " " + requested_features.get_build_options(); + } + const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); + if (extra_cflags) { + cflags += string(" ") + string(extra_cflags); + } #ifdef WITH_CYCLES_DEBUG - cflags += " -D__KERNEL_DEBUG__"; + cflags += " -D__KERNEL_DEBUG__"; #endif - if(split) { - cflags += " -D__SPLIT__"; - } - - return cflags; - } - - bool compile_check_compiler() { - const char *nvcc = cuewCompilerPath(); - if(nvcc == NULL) { - cuda_error_message("CUDA nvcc compiler not found. 
" - "Install CUDA toolkit in default location."); - return false; - } - const int cuda_version = cuewCompilerVersion(); - VLOG(1) << "Found nvcc " << nvcc - << ", CUDA version " << cuda_version - << "."; - const int major = cuda_version / 10, minor = cuda_version % 10; - if(cuda_version == 0) { - cuda_error_message("CUDA nvcc compiler version could not be parsed."); - return false; - } - if(cuda_version < 80) { - printf("Unsupported CUDA version %d.%d detected, " - "you need CUDA 8.0 or newer.\n", - major, minor); - return false; - } - else if(cuda_version != 101) { - printf("CUDA version %d.%d detected, build may succeed but only " - "CUDA 10.1 is officially supported.\n", - major, minor); - } - return true; - } - - string compile_kernel(const DeviceRequestedFeatures& requested_features, - bool filter=false, bool split=false) - { - const char *name, *source; - if(filter) { - name = "filter"; - source = "filter.cu"; - } - else if(split) { - name = "kernel_split"; - source = "kernel_split.cu"; - } - else { - name = "kernel"; - source = "kernel.cu"; - } - /* Compute cubin name. */ - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* Attempt to use kernel provided with Blender. */ - if(!use_adaptive_compilation()) { - const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", - name, major, minor)); - VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; - if(path_exists(cubin)) { - VLOG(1) << "Using precompiled kernel."; - return cubin; - } - } - - const string common_cflags = - compile_kernel_get_common_cflags(requested_features, filter, split); - - /* Try to use locally compiled kernel. 
*/ - const string source_path = path_get("source"); - const string kernel_md5 = path_files_md5_hash(source_path); - - /* We include cflags into md5 so changing cuda toolkit or changing other - * compiler command line arguments makes sure cubin gets re-built. - */ - const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); - - const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin", - name, major, minor, - cubin_md5.c_str()); - const string cubin = path_cache_get(path_join("kernels", cubin_file)); - VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; - if(path_exists(cubin)) { - VLOG(1) << "Using locally compiled kernel."; - return cubin; - } + if (split) { + cflags += " -D__SPLIT__"; + } + + return cflags; + } + + bool compile_check_compiler() + { + const char *nvcc = cuewCompilerPath(); + if (nvcc == NULL) { + cuda_error_message( + "CUDA nvcc compiler not found. " + "Install CUDA toolkit in default location."); + return false; + } + const int cuda_version = cuewCompilerVersion(); + VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << cuda_version << "."; + const int major = cuda_version / 10, minor = cuda_version % 10; + if (cuda_version == 0) { + cuda_error_message("CUDA nvcc compiler version could not be parsed."); + return false; + } + if (cuda_version < 80) { + printf( + "Unsupported CUDA version %d.%d detected, " + "you need CUDA 8.0 or newer.\n", + major, + minor); + return false; + } + else if (cuda_version != 101) { + printf( + "CUDA version %d.%d detected, build may succeed but only " + "CUDA 10.1 is officially supported.\n", + major, + minor); + } + return true; + } + + string compile_kernel(const DeviceRequestedFeatures &requested_features, + bool filter = false, + bool split = false) + { + const char *name, *source; + if (filter) { + name = "filter"; + source = "filter.cu"; + } + else if (split) { + name = "kernel_split"; + source = "kernel_split.cu"; + } + else { + name = "kernel"; + source = "kernel.cu"; + } 
+ /* Compute cubin name. */ + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* Attempt to use kernel provided with Blender. */ + if (!use_adaptive_compilation()) { + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using precompiled kernel."; + return cubin; + } + } + + const string common_cflags = compile_kernel_get_common_cflags( + requested_features, filter, split); + + /* Try to use locally compiled kernel. */ + const string source_path = path_get("source"); + const string kernel_md5 = path_files_md5_hash(source_path); + + /* We include cflags into md5 so changing cuda toolkit or changing other + * compiler command line arguments makes sure cubin gets re-built. + */ + const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); + + const string cubin_file = string_printf( + "cycles_%s_sm%d%d_%s.cubin", name, major, minor, cubin_md5.c_str()); + const string cubin = path_cache_get(path_join("kernels", cubin_file)); + VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using locally compiled kernel."; + return cubin; + } #ifdef _WIN32 - if(have_precompiled_kernels()) { - if(major < 3) { - cuda_error_message(string_printf( - "CUDA device requires compute capability 3.0 or up, " - "found %d.%d. Your GPU is not supported.", - major, minor)); - } - else { - cuda_error_message(string_printf( - "CUDA binary kernel for this graphics card compute " - "capability (%d.%d) not found.", - major, minor)); - } - return ""; - } + if (have_precompiled_kernels()) { + if (major < 3) { + cuda_error_message( + string_printf("CUDA device requires compute capability 3.0 or up, " + "found %d.%d. 
Your GPU is not supported.", + major, + minor)); + } + else { + cuda_error_message( + string_printf("CUDA binary kernel for this graphics card compute " + "capability (%d.%d) not found.", + major, + minor)); + } + return ""; + } #endif - /* Compile. */ - if(!compile_check_compiler()) { - return ""; - } - const char *nvcc = cuewCompilerPath(); - const string kernel = path_join( - path_join(source_path, "kernel"), - path_join("kernels", - path_join("cuda", source))); - double starttime = time_dt(); - printf("Compiling CUDA kernel ...\n"); - - path_create_directories(cubin); - - string command = string_printf("\"%s\" " - "-arch=sm_%d%d " - "--cubin \"%s\" " - "-o \"%s\" " - "%s ", - nvcc, - major, minor, - kernel.c_str(), - cubin.c_str(), - common_cflags.c_str()); - - printf("%s\n", command.c_str()); - - if(system(command.c_str()) == -1) { - cuda_error_message("Failed to execute compilation command, " - "see console for details."); - return ""; - } - - /* Verify if compilation succeeded */ - if(!path_exists(cubin)) { - cuda_error_message("CUDA kernel compilation failed, " - "see console for details."); - return ""; - } - - printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); - - return cubin; - } - - bool load_kernels(const DeviceRequestedFeatures& requested_features) - { - /* TODO(sergey): Support kernels re-load for CUDA devices. - * - * Currently re-loading kernel will invalidate memory pointers, - * causing problems in cuCtxSynchronize. 
- */ - if(cuFilterModule && cuModule) { - VLOG(1) << "Skipping kernel reload, not currently supported."; - return true; - } - - /* check if cuda init succeeded */ - if(cuContext == 0) - return false; - - /* check if GPU is supported */ - if(!support_device(requested_features)) - return false; - - /* get kernel */ - string cubin = compile_kernel(requested_features, false, use_split_kernel()); - if(cubin == "") - return false; - - string filter_cubin = compile_kernel(requested_features, true, false); - if(filter_cubin == "") - return false; - - /* open module */ - CUDAContextScope scope(this); - - string cubin_data; - CUresult result; - - if(path_read_text(cubin, cubin_data)) - result = cuModuleLoadData(&cuModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if(cuda_error_(result, "cuModuleLoad")) - cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); - - if(path_read_text(filter_cubin, cubin_data)) - result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if(cuda_error_(result, "cuModuleLoad")) - cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); - - if(result == CUDA_SUCCESS) { - reserve_local_memory(requested_features); - } - - return (result == CUDA_SUCCESS); - } - - void reserve_local_memory(const DeviceRequestedFeatures& requested_features) - { - if(use_split_kernel()) { - /* Split kernel mostly uses global memory and adaptive compilation, - * difficult to predict how much is needed currently. */ - return; - } - - /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory - * needed for kernel launches, so that we can reliably figure out when - * to allocate scene data in mapped host memory. */ - CUDAContextScope scope(this); - - size_t total = 0, free_before = 0, free_after = 0; - cuMemGetInfo(&free_before, &total); - - /* Get kernel function. 
*/ - CUfunction cuPathTrace; - - if(requested_features.use_integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); - } - - cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); - - int min_blocks, num_threads_per_block; - cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); - - /* Launch kernel, using just 1 block appears sufficient to reserve - * memory for all multiprocessors. It would be good to do this in - * parallel for the multi GPU case still to make it faster. */ - CUdeviceptr d_work_tiles = 0; - uint total_work_size = 0; - - void *args[] = {&d_work_tiles, - &total_work_size}; - - cuda_assert(cuLaunchKernel(cuPathTrace, - 1, 1, 1, - num_threads_per_block, 1, 1, - 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - cuMemGetInfo(&free_after, &total); - VLOG(1) << "Local memory reserved " - << string_human_readable_number(free_before - free_after) << " bytes. (" - << string_human_readable_size(free_before - free_after) << ")"; + /* Compile. 
*/ + if (!compile_check_compiler()) { + return ""; + } + const char *nvcc = cuewCompilerPath(); + const string kernel = path_join(path_join(source_path, "kernel"), + path_join("kernels", path_join("cuda", source))); + double starttime = time_dt(); + printf("Compiling CUDA kernel ...\n"); + + path_create_directories(cubin); + + string command = string_printf( + "\"%s\" " + "-arch=sm_%d%d " + "--cubin \"%s\" " + "-o \"%s\" " + "%s ", + nvcc, + major, + minor, + kernel.c_str(), + cubin.c_str(), + common_cflags.c_str()); + + printf("%s\n", command.c_str()); + + if (system(command.c_str()) == -1) { + cuda_error_message( + "Failed to execute compilation command, " + "see console for details."); + return ""; + } + + /* Verify if compilation succeeded */ + if (!path_exists(cubin)) { + cuda_error_message( + "CUDA kernel compilation failed, " + "see console for details."); + return ""; + } + + printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); + + return cubin; + } + + bool load_kernels(const DeviceRequestedFeatures &requested_features) + { + /* TODO(sergey): Support kernels re-load for CUDA devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in cuCtxSynchronize. 
+ */ + if (cuFilterModule && cuModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + + /* check if cuda init succeeded */ + if (cuContext == 0) + return false; + + /* check if GPU is supported */ + if (!support_device(requested_features)) + return false; + + /* get kernel */ + string cubin = compile_kernel(requested_features, false, use_split_kernel()); + if (cubin == "") + return false; + + string filter_cubin = compile_kernel(requested_features, true, false); + if (filter_cubin == "") + return false; + + /* open module */ + CUDAContextScope scope(this); + + string cubin_data; + CUresult result; + + if (path_read_text(cubin, cubin_data)) + result = cuModuleLoadData(&cuModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if (cuda_error_(result, "cuModuleLoad")) + cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); + + if (path_read_text(filter_cubin, cubin_data)) + result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if (cuda_error_(result, "cuModuleLoad")) + cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); + + if (result == CUDA_SUCCESS) { + reserve_local_memory(requested_features); + } + + return (result == CUDA_SUCCESS); + } + + void reserve_local_memory(const DeviceRequestedFeatures &requested_features) + { + if (use_split_kernel()) { + /* Split kernel mostly uses global memory and adaptive compilation, + * difficult to predict how much is needed currently. */ + return; + } + + /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory + * needed for kernel launches, so that we can reliably figure out when + * to allocate scene data in mapped host memory. */ + CUDAContextScope scope(this); + + size_t total = 0, free_before = 0, free_after = 0; + cuMemGetInfo(&free_before, &total); + + /* Get kernel function. 
*/ + CUfunction cuPathTrace; + + if (requested_features.use_integrator_branched) { + cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); + } + else { + cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); + } + + cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); + + int min_blocks, num_threads_per_block; + cuda_assert(cuOccupancyMaxPotentialBlockSize( + &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); + + /* Launch kernel, using just 1 block appears sufficient to reserve + * memory for all multiprocessors. It would be good to do this in + * parallel for the multi GPU case still to make it faster. */ + CUdeviceptr d_work_tiles = 0; + uint total_work_size = 0; + + void *args[] = {&d_work_tiles, &total_work_size}; + + cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + + cuda_assert(cuCtxSynchronize()); + + cuMemGetInfo(&free_after, &total); + VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) + << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; #if 0 - /* For testing mapped host memory, fill up device memory. */ - const size_t keep_mb = 1024; - - while(free_after > keep_mb * 1024 * 1024LL) { - CUdeviceptr tmp; - cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); - cuMemGetInfo(&free_after, &total); - } + /* For testing mapped host memory, fill up device memory. */ + const size_t keep_mb = 1024; + + while(free_after > keep_mb * 1024 * 1024LL) { + CUdeviceptr tmp; + cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); + cuMemGetInfo(&free_after, &total); + } #endif - } - - void init_host_memory() - { - /* Limit amount of host mapped memory, because allocating too much can - * cause system instability. Leave at least half or 4 GB of system - * memory free, whichever is smaller. 
*/ - size_t default_limit = 4 * 1024 * 1024 * 1024LL; - size_t system_ram = system_physical_ram(); - - if(system_ram > 0) { - if(system_ram / 2 > default_limit) { - map_host_limit = system_ram - default_limit; - } - else { - map_host_limit = system_ram / 2; - } - } - else { - VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; - map_host_limit = 0; - } - - /* Amount of device memory to keep is free after texture memory - * and working memory allocations respectively. We set the working - * memory limit headroom lower so that some space is left after all - * texture memory allocations. */ - device_working_headroom = 32 * 1024 * 1024LL; // 32MB - device_texture_headroom = 128 * 1024 * 1024LL; // 128MB - - VLOG(1) << "Mapped host memory limit set to " - << string_human_readable_number(map_host_limit) << " bytes. (" - << string_human_readable_size(map_host_limit) << ")"; - } - - void load_texture_info() - { - if(need_texture_info) { - texture_info.copy_to_device(); - need_texture_info = false; - } - } - - void move_textures_to_host(size_t size, bool for_texture) - { - /* Signal to reallocate textures in host memory only. */ - move_texture_to_host = true; - - while(size > 0) { - /* Find suitable memory allocation to move. */ - device_memory *max_mem = NULL; - size_t max_size = 0; - bool max_is_image = false; - - foreach(CUDAMemMap::value_type& pair, cuda_mem_map) { - device_memory& mem = *pair.first; - CUDAMem *cmem = &pair.second; - - bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - /* Can't move this type of memory. */ - if(!is_texture || cmem->array) { - continue; - } - - /* Already in host memory. */ - if(cmem->map_host_pointer) { - continue; - } - - /* For other textures, only move image textures. */ - if(for_texture && !is_image) { - continue; - } - - /* Try to move largest allocation, prefer moving images. 
*/ - if(is_image > max_is_image || - (is_image == max_is_image && mem.device_size > max_size)) { - max_is_image = is_image; - max_size = mem.device_size; - max_mem = &mem; - } - } - - /* Move to host memory. This part is mutex protected since - * multiple CUDA devices could be moving the memory. The - * first one will do it, and the rest will adopt the pointer. */ - if(max_mem) { - VLOG(1) << "Move memory from device to host: " << max_mem->name; - - static thread_mutex move_mutex; - thread_scoped_lock lock(move_mutex); - - /* Preserve the original device pointer, in case of multi device - * we can't change it because the pointer mapping would break. */ - device_ptr prev_pointer = max_mem->device_pointer; - size_t prev_size = max_mem->device_size; - - tex_free(*max_mem); - tex_alloc(*max_mem); - size = (max_size >= size)? 0: size - max_size; - - max_mem->device_pointer = prev_pointer; - max_mem->device_size = prev_size; - } - else { - break; - } - } - - /* Update texture info array with new pointers. */ - load_texture_info(); - - move_texture_to_host = false; - } - - CUDAMem *generic_alloc(device_memory& mem, size_t pitch_padding = 0) - { - CUDAContextScope scope(this); - - CUdeviceptr device_pointer = 0; - size_t size = mem.memory_size() + pitch_padding; - - CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; - const char *status = ""; - - /* First try allocating in device memory, respecting headroom. We make - * an exception for texture info. It is small and frequently accessed, - * so treat it as working memory. - * - * If there is not enough room for working memory, we will try to move - * textures to host memory, assuming the performance impact would have - * been worse for working memory. */ - bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - size_t headroom = (is_texture)? 
device_texture_headroom: - device_working_headroom; - - size_t total = 0, free = 0; - cuMemGetInfo(&free, &total); - - /* Move textures to host memory if needed. */ - if(!move_texture_to_host && !is_image && (size + headroom) >= free) { - move_textures_to_host(size + headroom - free, is_texture); - cuMemGetInfo(&free, &total); - } - - /* Allocate in device memory. */ - if(!move_texture_to_host && (size + headroom) < free) { - mem_alloc_result = cuMemAlloc(&device_pointer, size); - if(mem_alloc_result == CUDA_SUCCESS) { - status = " in device memory"; - } - } - - /* Fall back to mapped host memory if needed and possible. */ - void *map_host_pointer = 0; - bool free_map_host = false; - - if(mem_alloc_result != CUDA_SUCCESS && can_map_host && - map_host_used + size < map_host_limit) { - if(mem.shared_pointer) { - /* Another device already allocated host memory. */ - mem_alloc_result = CUDA_SUCCESS; - map_host_pointer = mem.shared_pointer; - } - else { - /* Allocate host memory ourselves. */ - mem_alloc_result = cuMemHostAlloc(&map_host_pointer, size, - CU_MEMHOSTALLOC_DEVICEMAP | - CU_MEMHOSTALLOC_WRITECOMBINED); - mem.shared_pointer = map_host_pointer; - free_map_host = true; - } - - if(mem_alloc_result == CUDA_SUCCESS) { - cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0)); - map_host_used += size; - status = " in host memory"; - - /* Replace host pointer with our host allocation. Only works if - * CUDA memory layout is the same and has no pitch padding. Also - * does not work if we move textures to host during a render, - * since other devices might be using the memory. 
*/ - if(!move_texture_to_host && pitch_padding == 0 && - mem.host_pointer && mem.host_pointer != mem.shared_pointer) { - memcpy(mem.shared_pointer, mem.host_pointer, size); - mem.host_free(); - mem.host_pointer = mem.shared_pointer; - } - } - else { - status = " failed, out of host memory"; - } - } - else if(mem_alloc_result != CUDA_SUCCESS) { - status = " failed, out of device and host memory"; - } - - if(mem_alloc_result != CUDA_SUCCESS) { - cuda_assert(mem_alloc_result); - } - - if(mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")" - << status; - } - - mem.device_pointer = (device_ptr)device_pointer; - mem.device_size = size; - stats.mem_alloc(size); - - if(!mem.device_pointer) { - return NULL; - } - - /* Insert into map of allocations. */ - CUDAMem *cmem = &cuda_mem_map[&mem]; - cmem->map_host_pointer = map_host_pointer; - cmem->free_map_host = free_map_host; - return cmem; - } - - void generic_copy_to(device_memory& mem) - { - if(mem.host_pointer && mem.device_pointer) { - CUDAContextScope scope(this); - - if(mem.host_pointer != mem.shared_pointer) { - cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), - mem.host_pointer, - mem.memory_size())); - } - } - } - - void generic_free(device_memory& mem) - { - if(mem.device_pointer) { - CUDAContextScope scope(this); - const CUDAMem& cmem = cuda_mem_map[&mem]; - - if(cmem.map_host_pointer) { - /* Free host memory. */ - if(cmem.free_map_host) { - cuMemFreeHost(cmem.map_host_pointer); - if(mem.host_pointer == mem.shared_pointer) { - mem.host_pointer = 0; - } - mem.shared_pointer = 0; - } - - map_host_used -= mem.device_size; - } - else { - /* Free device memory. 
*/ - cuMemFree(mem.device_pointer); - } - - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - } - - void mem_alloc(device_memory& mem) - { - if(mem.type == MEM_PIXELS && !background) { - pixels_alloc(mem); - } - else if(mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - } - else { - generic_alloc(mem); - } - } - - void mem_copy_to(device_memory& mem) - { - if(mem.type == MEM_PIXELS) { - assert(!"mem_copy_to not supported for pixels."); - } - else if(mem.type == MEM_TEXTURE) { - tex_free(mem); - tex_alloc(mem); - } - else { - if(!mem.device_pointer) { - generic_alloc(mem); - } - - generic_copy_to(mem); - } - } - - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) - { - if(mem.type == MEM_PIXELS && !background) { - pixels_copy_from(mem, y, w, h); - } - else if(mem.type == MEM_TEXTURE) { - assert(!"mem_copy_from not supported for textures."); - } - else { - CUDAContextScope scope(this); - size_t offset = elem*y*w; - size_t size = elem*w*h; - - if(mem.host_pointer && mem.device_pointer) { - cuda_assert(cuMemcpyDtoH((uchar*)mem.host_pointer + offset, - (CUdeviceptr)(mem.device_pointer + offset), size)); - } - else if(mem.host_pointer) { - memset((char*)mem.host_pointer + offset, 0, size); - } - } - } - - void mem_zero(device_memory& mem) - { - if(!mem.device_pointer) { - mem_alloc(mem); - } - - if(mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } - - if(mem.device_pointer && - (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) { - CUDAContextScope scope(this); - cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size())); - } - } - - void mem_free(device_memory& mem) - { - if(mem.type == MEM_PIXELS && !background) { - pixels_free(mem); - } - else if(mem.type == MEM_TEXTURE) { - tex_free(mem); - } - else { - generic_free(mem); - } - } - - virtual device_ptr 
mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/) - { - return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); - } - - void const_copy_to(const char *name, void *host, size_t size) - { - CUDAContextScope scope(this); - CUdeviceptr mem; - size_t bytes; - - cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); - //assert(bytes == size); - cuda_assert(cuMemcpyHtoD(mem, host, size)); - } - - void tex_alloc(device_memory& mem) - { - CUDAContextScope scope(this); - - /* General variables for both architectures */ - string bind_name = mem.name; - size_t dsize = datatype_size(mem.data_type); - size_t size = mem.memory_size(); - - CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; - switch(mem.extension) { - case EXTENSION_REPEAT: - address_mode = CU_TR_ADDRESS_MODE_WRAP; - break; - case EXTENSION_EXTEND: - address_mode = CU_TR_ADDRESS_MODE_CLAMP; - break; - case EXTENSION_CLIP: - address_mode = CU_TR_ADDRESS_MODE_BORDER; - break; - default: - assert(0); - break; - } - - CUfilter_mode filter_mode; - if(mem.interpolation == INTERPOLATION_CLOSEST) { - filter_mode = CU_TR_FILTER_MODE_POINT; - } - else { - filter_mode = CU_TR_FILTER_MODE_LINEAR; - } - - /* Data Storage */ - if(mem.interpolation == INTERPOLATION_NONE) { - generic_alloc(mem); - generic_copy_to(mem); - - CUdeviceptr cumem; - size_t cubytes; - - cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); - - if(cubytes == 8) { - /* 64 bit device pointer */ - uint64_t ptr = mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); - } - else { - /* 32 bit device pointer */ - uint32_t ptr = (uint32_t)mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); - } - return; - } - - /* Image Texture Storage */ - CUarray_format_enum format; - switch(mem.data_type) { - case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; - case TYPE_UINT16: format = CU_AD_FORMAT_UNSIGNED_INT16; break; - case 
TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; - case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; - case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; - case TYPE_HALF: format = CU_AD_FORMAT_HALF; break; - default: assert(0); return; - } - - CUDAMem *cmem = NULL; - CUarray array_3d = NULL; - size_t src_pitch = mem.data_width * dsize * mem.data_elements; - size_t dst_pitch = src_pitch; - - if(mem.data_depth > 1) { - /* 3D texture using array, there is no API for linear memory. */ - CUDA_ARRAY3D_DESCRIPTOR desc; - - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Depth = mem.data_depth; - desc.Format = format; - desc.NumChannels = mem.data_elements; - desc.Flags = 0; - - VLOG(1) << "Array 3D allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - cuda_assert(cuArray3DCreate(&array_3d, &desc)); - - if(!array_3d) { - return; - } - - CUDA_MEMCPY3D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = array_3d; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - param.Depth = mem.data_depth; - - cuda_assert(cuMemcpy3D(¶m)); - - mem.device_pointer = (device_ptr)array_3d; - mem.device_size = size; - stats.mem_alloc(size); - - cmem = &cuda_mem_map[&mem]; - cmem->texobject = 0; - cmem->array = array_3d; - } - else if(mem.data_height > 0) { - /* 2D texture, using pitch aligned linear memory. 
*/ - int alignment = 0; - cuda_assert(cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); - dst_pitch = align_up(src_pitch, alignment); - size_t dst_size = dst_pitch * mem.data_height; - - cmem = generic_alloc(mem, dst_size - mem.memory_size()); - if(!cmem) { - return; - } - - CUDA_MEMCPY2D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_DEVICE; - param.dstDevice = mem.device_pointer; - param.dstPitch = dst_pitch; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - - cuda_assert(cuMemcpy2DUnaligned(¶m)); - } - else { - /* 1D texture, using linear memory. */ - cmem = generic_alloc(mem); - if(!cmem) { - return; - } - - cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); - } - - /* Kepler+, bindless textures. */ - int flat_slot = 0; - if(string_startswith(mem.name, "__tex_image")) { - int pos = string(mem.name).rfind("_"); - flat_slot = atoi(mem.name + pos + 1); - } - else { - assert(0); - } - - CUDA_RESOURCE_DESC resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - - if(array_3d) { - resDesc.resType = CU_RESOURCE_TYPE_ARRAY; - resDesc.res.array.hArray = array_3d; - resDesc.flags = 0; - } - else if(mem.data_height > 0) { - resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; - resDesc.res.pitch2D.devPtr = mem.device_pointer; - resDesc.res.pitch2D.format = format; - resDesc.res.pitch2D.numChannels = mem.data_elements; - resDesc.res.pitch2D.height = mem.data_height; - resDesc.res.pitch2D.width = mem.data_width; - resDesc.res.pitch2D.pitchInBytes = dst_pitch; - } - else { - resDesc.resType = CU_RESOURCE_TYPE_LINEAR; - resDesc.res.linear.devPtr = mem.device_pointer; - resDesc.res.linear.format = format; - resDesc.res.linear.numChannels = mem.data_elements; - resDesc.res.linear.sizeInBytes = mem.device_size; - } - - CUDA_TEXTURE_DESC texDesc; - memset(&texDesc, 0, 
sizeof(texDesc)); - texDesc.addressMode[0] = address_mode; - texDesc.addressMode[1] = address_mode; - texDesc.addressMode[2] = address_mode; - texDesc.filterMode = filter_mode; - texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; - - cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); - - /* Resize once */ - if(flat_slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount - * of re-allocations. */ - texture_info.resize(flat_slot + 128); - } - - /* Set Mapping and tag that we need to (re-)upload to device */ - TextureInfo& info = texture_info[flat_slot]; - info.data = (uint64_t)cmem->texobject; - info.cl_buffer = 0; - info.interpolation = mem.interpolation; - info.extension = mem.extension; - info.width = mem.data_width; - info.height = mem.data_height; - info.depth = mem.data_depth; - need_texture_info = true; - } - - void tex_free(device_memory& mem) - { - if(mem.device_pointer) { - CUDAContextScope scope(this); - const CUDAMem& cmem = cuda_mem_map[&mem]; - - if(cmem.texobject) { - /* Free bindless texture. */ - cuTexObjectDestroy(cmem.texobject); - } - - if(cmem.array) { - /* Free array. */ - cuArrayDestroy(cmem.array); - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - else { - generic_free(mem); - } - } - } - -#define CUDA_GET_BLOCKSIZE(func, w, h) \ - int threads_per_block; \ - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int threads = (int)sqrt((float)threads_per_block); \ - int xblocks = ((w) + threads - 1)/threads; \ - int yblocks = ((h) + threads - 1)/threads; - -#define CUDA_LAUNCH_KERNEL(func, args) \ - cuda_assert(cuLaunchKernel(func, \ - xblocks, yblocks, 1, \ - threads, threads, 1, \ - 0, 0, args, 0)); + } + + void init_host_memory() + { + /* Limit amount of host mapped memory, because allocating too much can + * cause system instability. 
Leave at least half or 4 GB of system + * memory free, whichever is smaller. */ + size_t default_limit = 4 * 1024 * 1024 * 1024LL; + size_t system_ram = system_physical_ram(); + + if (system_ram > 0) { + if (system_ram / 2 > default_limit) { + map_host_limit = system_ram - default_limit; + } + else { + map_host_limit = system_ram / 2; + } + } + else { + VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + map_host_limit = 0; + } + + /* Amount of device memory to keep is free after texture memory + * and working memory allocations respectively. We set the working + * memory limit headroom lower so that some space is left after all + * texture memory allocations. */ + device_working_headroom = 32 * 1024 * 1024LL; // 32MB + device_texture_headroom = 128 * 1024 * 1024LL; // 128MB + + VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) + << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; + } + + void load_texture_info() + { + if (need_texture_info) { + texture_info.copy_to_device(); + need_texture_info = false; + } + } + + void move_textures_to_host(size_t size, bool for_texture) + { + /* Signal to reallocate textures in host memory only. */ + move_texture_to_host = true; + + while (size > 0) { + /* Find suitable memory allocation to move. */ + device_memory *max_mem = NULL; + size_t max_size = 0; + bool max_is_image = false; + + foreach (CUDAMemMap::value_type &pair, cuda_mem_map) { + device_memory &mem = *pair.first; + CUDAMem *cmem = &pair.second; + + bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + /* Can't move this type of memory. */ + if (!is_texture || cmem->array) { + continue; + } + + /* Already in host memory. */ + if (cmem->map_host_pointer) { + continue; + } + + /* For other textures, only move image textures. 
*/ + if (for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + + /* Move to host memory. This part is mutex protected since + * multiple CUDA devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. */ + if (max_mem) { + VLOG(1) << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + /* Preserve the original device pointer, in case of multi device + * we can't change it because the pointer mapping would break. */ + device_ptr prev_pointer = max_mem->device_pointer; + size_t prev_size = max_mem->device_size; + + tex_free(*max_mem); + tex_alloc(*max_mem); + size = (max_size >= size) ? 0 : size - max_size; + + max_mem->device_pointer = prev_pointer; + max_mem->device_size = prev_size; + } + else { + break; + } + } + + /* Update texture info array with new pointers. */ + load_texture_info(); + + move_texture_to_host = false; + } + + CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0) + { + CUDAContextScope scope(this); + + CUdeviceptr device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; + + CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; + const char *status = ""; + + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. 
*/ + bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; + + size_t total = 0, free = 0; + cuMemGetInfo(&free, &total); + + /* Move textures to host memory if needed. */ + if (!move_texture_to_host && !is_image && (size + headroom) >= free) { + move_textures_to_host(size + headroom - free, is_texture); + cuMemGetInfo(&free, &total); + } + + /* Allocate in device memory. */ + if (!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = cuMemAlloc(&device_pointer, size); + if (mem_alloc_result == CUDA_SUCCESS) { + status = " in device memory"; + } + } + + /* Fall back to mapped host memory if needed and possible. */ + void *map_host_pointer = 0; + bool free_map_host = false; + + if (mem_alloc_result != CUDA_SUCCESS && can_map_host && + map_host_used + size < map_host_limit) { + if (mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = CUDA_SUCCESS; + map_host_pointer = mem.shared_pointer; + } + else { + /* Allocate host memory ourselves. */ + mem_alloc_result = cuMemHostAlloc( + &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); + mem.shared_pointer = map_host_pointer; + free_map_host = true; + } + + if (mem_alloc_result == CUDA_SUCCESS) { + cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0)); + map_host_used += size; + status = " in host memory"; + + /* Replace host pointer with our host allocation. Only works if + * CUDA memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. 
*/ + if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && + mem.host_pointer != mem.shared_pointer) { + memcpy(mem.shared_pointer, mem.host_pointer, size); + mem.host_free(); + mem.host_pointer = mem.shared_pointer; + } + } + else { + status = " failed, out of host memory"; + } + } + else if (mem_alloc_result != CUDA_SUCCESS) { + status = " failed, out of device and host memory"; + } + + if (mem_alloc_result != CUDA_SUCCESS) { + cuda_assert(mem_alloc_result); + } + + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")" << status; + } + + mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; + stats.mem_alloc(size); + + if (!mem.device_pointer) { + return NULL; + } + + /* Insert into map of allocations. */ + CUDAMem *cmem = &cuda_mem_map[&mem]; + cmem->map_host_pointer = map_host_pointer; + cmem->free_map_host = free_map_host; + return cmem; + } + + void generic_copy_to(device_memory &mem) + { + if (mem.host_pointer && mem.device_pointer) { + CUDAContextScope scope(this); + + if (mem.host_pointer != mem.shared_pointer) { + cuda_assert(cuMemcpyHtoD( + cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size())); + } + } + } + + void generic_free(device_memory &mem) + { + if (mem.device_pointer) { + CUDAContextScope scope(this); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + if (cmem.map_host_pointer) { + /* Free host memory. */ + if (cmem.free_map_host) { + cuMemFreeHost(cmem.map_host_pointer); + if (mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + mem.shared_pointer = 0; + } + + map_host_used -= mem.device_size; + } + else { + /* Free device memory. 
*/ + cuMemFree(mem.device_pointer); + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + } + + void mem_alloc(device_memory &mem) + { + if (mem.type == MEM_PIXELS && !background) { + pixels_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else { + generic_alloc(mem); + } + } + + void mem_copy_to(device_memory &mem) + { + if (mem.type == MEM_PIXELS) { + assert(!"mem_copy_to not supported for pixels."); + } + else if (mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else { + if (!mem.device_pointer) { + generic_alloc(mem); + } + + generic_copy_to(mem); + } + } + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) + { + if (mem.type == MEM_PIXELS && !background) { + pixels_copy_from(mem, y, w, h); + } + else if (mem.type == MEM_TEXTURE) { + assert(!"mem_copy_from not supported for textures."); + } + else { + CUDAContextScope scope(this); + size_t offset = elem * y * w; + size_t size = elem * w * h; + + if (mem.host_pointer && mem.device_pointer) { + cuda_assert(cuMemcpyDtoH( + (uchar *)mem.host_pointer + offset, (CUdeviceptr)(mem.device_pointer + offset), size)); + } + else if (mem.host_pointer) { + memset((char *)mem.host_pointer + offset, 0, size); + } + } + } + + void mem_zero(device_memory &mem) + { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } + + if (mem.device_pointer && (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) { + CUDAContextScope scope(this); + cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size())); + } + } + + void mem_free(device_memory &mem) + { + if (mem.type == MEM_PIXELS && !background) { + pixels_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else { + generic_free(mem); + } + } + + virtual 
device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) + { + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); + } + + void const_copy_to(const char *name, void *host, size_t size) + { + CUDAContextScope scope(this); + CUdeviceptr mem; + size_t bytes; + + cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); + //assert(bytes == size); + cuda_assert(cuMemcpyHtoD(mem, host, size)); + } + + void tex_alloc(device_memory &mem) + { + CUDAContextScope scope(this); + + /* General variables for both architectures */ + string bind_name = mem.name; + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; + switch (mem.extension) { + case EXTENSION_REPEAT: + address_mode = CU_TR_ADDRESS_MODE_WRAP; + break; + case EXTENSION_EXTEND: + address_mode = CU_TR_ADDRESS_MODE_CLAMP; + break; + case EXTENSION_CLIP: + address_mode = CU_TR_ADDRESS_MODE_BORDER; + break; + default: + assert(0); + break; + } + + CUfilter_mode filter_mode; + if (mem.interpolation == INTERPOLATION_CLOSEST) { + filter_mode = CU_TR_FILTER_MODE_POINT; + } + else { + filter_mode = CU_TR_FILTER_MODE_LINEAR; + } + + /* Data Storage */ + if (mem.interpolation == INTERPOLATION_NONE) { + generic_alloc(mem); + generic_copy_to(mem); + + CUdeviceptr cumem; + size_t cubytes; + + cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); + + if (cubytes == 8) { + /* 64 bit device pointer */ + uint64_t ptr = mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes)); + } + else { + /* 32 bit device pointer */ + uint32_t ptr = (uint32_t)mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes)); + } + return; + } + + /* Image Texture Storage */ + CUarray_format_enum format; + switch (mem.data_type) { + case TYPE_UCHAR: + format = CU_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = CU_AD_FORMAT_UNSIGNED_INT16; 
+ break; + case TYPE_UINT: + format = CU_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = CU_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = CU_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = CU_AD_FORMAT_HALF; + break; + default: + assert(0); + return; + } + + CUDAMem *cmem = NULL; + CUarray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; + + if (mem.data_depth > 1) { + /* 3D texture using array, there is no API for linear memory. */ + CUDA_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + VLOG(1) << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + cuda_assert(cuArray3DCreate(&array_3d, &desc)); + + if (!array_3d) { + return; + } + + CUDA_MEMCPY3D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = array_3d; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + cuda_assert(cuMemcpy3D(¶m)); + + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; + stats.mem_alloc(size); + + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + /* 2D texture, using pitch aligned linear memory. 
*/ + int alignment = 0; + cuda_assert( + cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); + dst_pitch = align_up(src_pitch, alignment); + size_t dst_size = dst_pitch * mem.data_height; + + cmem = generic_alloc(mem, dst_size - mem.memory_size()); + if (!cmem) { + return; + } + + CUDA_MEMCPY2D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_DEVICE; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + cuda_assert(cuMemcpy2DUnaligned(¶m)); + } + else { + /* 1D texture, using linear memory. */ + cmem = generic_alloc(mem); + if (!cmem) { + return; + } + + cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); + } + + /* Kepler+, bindless textures. */ + int flat_slot = 0; + if (string_startswith(mem.name, "__tex_image")) { + int pos = string(mem.name).rfind("_"); + flat_slot = atoi(mem.name + pos + 1); + } + else { + assert(0); + } + + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + if (array_3d) { + resDesc.resType = CU_RESOURCE_TYPE_ARRAY; + resDesc.res.array.hArray = array_3d; + resDesc.flags = 0; + } + else if (mem.data_height > 0) { + resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = CU_RESOURCE_TYPE_LINEAR; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; + } + + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 
0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + + cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); + + /* Resize once */ + if (flat_slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(flat_slot + 128); + } + + /* Set Mapping and tag that we need to (re-)upload to device */ + TextureInfo &info = texture_info[flat_slot]; + info.data = (uint64_t)cmem->texobject; + info.cl_buffer = 0; + info.interpolation = mem.interpolation; + info.extension = mem.extension; + info.width = mem.data_width; + info.height = mem.data_height; + info.depth = mem.data_depth; + need_texture_info = true; + } + + void tex_free(device_memory &mem) + { + if (mem.device_pointer) { + CUDAContextScope scope(this); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + if (cmem.texobject) { + /* Free bindless texture. */ + cuTexObjectDestroy(cmem.texobject); + } + + if (cmem.array) { + /* Free array. */ + cuArrayDestroy(cmem.array); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + else { + generic_free(mem); + } + } + } + +#define CUDA_GET_BLOCKSIZE(func, w, h) \ + int threads_per_block; \ + cuda_assert( \ + cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int threads = (int)sqrt((float)threads_per_block); \ + int xblocks = ((w) + threads - 1) / threads; \ + int yblocks = ((h) + threads - 1) / threads; + +#define CUDA_LAUNCH_KERNEL(func, args) \ + cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); /* Similar as above, but for 1-dimensional blocks. 
*/ -#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ - int threads_per_block; \ - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int xblocks = ((w) + threads_per_block - 1)/threads_per_block; \ - int yblocks = h; - -#define CUDA_LAUNCH_KERNEL_1D(func, args) \ - cuda_assert(cuLaunchKernel(func, \ - xblocks, yblocks, 1, \ - threads_per_block, 1, 1, \ - 0, 0, args, 0)); - - bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, - DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2*r+1)*(2*r+1); - int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; - int frame_offset = 0; - - if(have_error()) - return false; - - CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); - CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; - CUdeviceptr weightAccum = difference + 2*sizeof(float)*pass_stride*num_shifts; - CUdeviceptr scale_ptr = 0; - - cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*pass_stride)); - cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*pass_stride)); - - { - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; - cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, 
"kernel_cuda_filter_nlm_update_output")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts); - - void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); - } - - { - CUfunction cuNLMNormalize; - cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); - cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); - void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; - CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); - CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); - cuda_assert(cuCtxSynchronize()); - } - - return !have_error(); - } - - bool denoising_construct_transform(DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterConstructTransform; - cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); - 
cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); - CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, - task->storage.w, - task->storage.h); - - void *args[] = {&task->buffer.mem.device_pointer, - &task->tile_info_mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->filter_area, - &task->rect, - &task->radius, - &task->pca_threshold, - &task->buffer.pass_stride, - &task->buffer.frame_stride, - &task->buffer.use_time}; - CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - int r = task->radius; - int f = 4; - float a = 1.0f; - float k_2 = task->nlm_k_2; - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2*r+1)*(2*r+1); - - if(have_error()) - return false; - - CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); - CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; - - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; - cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); - - 
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, - task->reconstruction_state.source_w * task->reconstruction_state.source_h, - num_shifts); - - void *calc_difference_args[] = {&color_ptr, - &color_variance_ptr, - &scale_ptr, - &difference, - &w, &h, - &stride, &pass_stride, - &r, &pass_stride, - &frame_offset, - &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *construct_gramian_args[] = {&t, - &blurDifference, - &task->buffer.mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->reconstruction_state.filter_window, - &w, &h, &stride, - &pass_stride, &r, - &f, - &frame_offset, - &task->buffer.use_time}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_solve(device_ptr output_ptr, - DenoisingTask *task) - { - CUfunction cuFinalize; - cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); - cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); - void *finalize_args[] = {&output_ptr, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - 
&task->storage.XtWY.device_pointer, - &task->filter_area, - &task->reconstruction_state.buffer_params.x, - &task->render_buffer.samples}; - CUDA_GET_BLOCKSIZE(cuFinalize, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, - device_ptr mean_ptr, device_ptr variance_ptr, - int r, int4 rect, DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterCombineHalves; - cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); - cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterCombineHalves, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - void *args[] = {&mean_ptr, - &variance_ptr, - &a_ptr, - &b_ptr, - &rect, - &r}; - CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, - device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDivideShadow; - cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterDivideShadow, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &a_ptr, - &b_ptr, - &sample_variance_ptr, - &sv_variance_ptr, - &buffer_variance_ptr, - &task->rect, - &task->render_buffer.pass_stride, - &task->render_buffer.offset}; - 
CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterGetFeature; - cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterGetFeature, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &mean_offset, - &variance_offset, - &mean_ptr, - &variance_ptr, - &scale, - &task->rect, - &task->render_buffer.pass_stride, - &task->render_buffer.offset}; - CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterWriteFeature; - cuda_assert(cuModuleGetFunction(&cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, - task->filter_area.z, - task->filter_area.w); - - void *args[] = {&task->render_buffer.samples, - &task->reconstruction_state.buffer_params, - &task->filter_area, - &from_ptr, - &buffer_ptr, - &out_offset, - &task->rect}; - CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) - { - 
if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDetectOutliers; - cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - void *args[] = {&image_ptr, - &variance_ptr, - &depth_ptr, - &output_ptr, - &task->rect, - &task->buffer.pass_stride}; - - CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - void denoise(RenderTile &rtile, DenoisingTask& denoising) - { - denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind(&CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - 
denoising.run_denoising(&rtile); - } - - void path_trace(DeviceTask& task, RenderTile& rtile, device_vector<WorkTile>& work_tiles) - { - scoped_timer timer(&rtile.buffers->render_time); - - if(have_error()) - return; - - CUDAContextScope scope(this); - CUfunction cuPathTrace; - - /* Get kernel function. */ - if(task.integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); - } - - if(have_error()) { - return; - } - - cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); - - /* Allocate work tile. */ - work_tiles.alloc(1); - - WorkTile *wtile = work_tiles.data(); - wtile->x = rtile.x; - wtile->y = rtile.y; - wtile->w = rtile.w; - wtile->h = rtile.h; - wtile->offset = rtile.offset; - wtile->stride = rtile.stride; - wtile->buffer = (float*)cuda_device_ptr(rtile.buffer); - - /* Prepare work size. More step samples render faster, but for now we - * remain conservative for GPUs connected to a display to avoid driver - * timeouts and display freezing. */ - int min_blocks, num_threads_per_block; - cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); - if(!info.display_device) { - min_blocks *= 8; - } - - uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); - - /* Render all samples. */ - int start_sample = rtile.start_sample; - int end_sample = rtile.start_sample + rtile.num_samples; - - for(int sample = start_sample; sample < end_sample; sample += step_samples) { - /* Setup and copy work tile to device. 
*/ - wtile->start_sample = sample; - wtile->num_samples = min(step_samples, end_sample - sample); - work_tiles.copy_to_device(); - - CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); - uint total_work_size = wtile->w * wtile->h * wtile->num_samples; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - - /* Launch kernel. */ - void *args[] = {&d_work_tiles, - &total_work_size}; - - cuda_assert(cuLaunchKernel(cuPathTrace, - num_blocks, 1, 1, - num_threads_per_block, 1, 1, - 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - /* Update progress. */ - rtile.sample = sample + wtile->num_samples; - task.update_progress(&rtile, rtile.w*rtile.h*wtile->num_samples); - - if(task.get_cancel()) { - if(task.need_finish_queue == false) - break; - } - } - } - - void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) - { - if(have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuFilmConvert; - CUdeviceptr d_rgba = map_pixels((rgba_byte)? 
rgba_byte: rgba_half); - CUdeviceptr d_buffer = cuda_device_ptr(buffer); - - /* get kernel function */ - if(rgba_half) { - cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float")); - } - else { - cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte")); - } - - - float sample_scale = 1.0f/(task.sample + 1); - - /* pass in parameters */ - void *args[] = {&d_rgba, - &d_buffer, - &sample_scale, - &task.x, - &task.y, - &task.w, - &task.h, - &task.offset, - &task.stride}; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert)); - - int xthreads = (int)sqrt(threads_per_block); - int ythreads = (int)sqrt(threads_per_block); - int xblocks = (task.w + xthreads - 1)/xthreads; - int yblocks = (task.h + ythreads - 1)/ythreads; - - cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(cuFilmConvert, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, args, 0)); - - unmap_pixels((rgba_byte)? 
rgba_byte: rgba_half); - - cuda_assert(cuCtxSynchronize()); - } - - void shader(DeviceTask& task) - { - if(have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuShader; - CUdeviceptr d_input = cuda_device_ptr(task.shader_input); - CUdeviceptr d_output = cuda_device_ptr(task.shader_output); - - /* get kernel function */ - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake")); - } - else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); - } - - /* do tasks in smaller chunks, so we can cancel it */ - const int shader_chunk_size = 65536; - const int start = task.shader_x; - const int end = task.shader_x + task.shader_w; - int offset = task.offset; - - bool canceled = false; - for(int sample = 0; sample < task.num_samples && !canceled; sample++) { - for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) { - int shader_w = min(shader_chunk_size, end - shader_x); - - /* pass in parameters */ - void *args[8]; - int arg = 0; - args[arg++] = &d_input; - args[arg++] = &d_output; - args[arg++] = &task.shader_eval_type; - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - args[arg++] = &task.shader_filter; - } - args[arg++] = &shader_x; - args[arg++] = &shader_w; - args[arg++] = &offset; - args[arg++] = &sample; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); - - int xblocks = (shader_w + threads_per_block - 1)/threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuLaunchKernel(cuShader, - xblocks , 1, 1, /* blocks */ - threads_per_block, 1, 1, /* threads */ - 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - if(task.get_cancel()) 
{ - canceled = true; - break; - } - } - - task.update_progress(NULL); - } - } - - CUdeviceptr map_pixels(device_ptr mem) - { - if(!background) { - PixelMem pmem = pixel_mem_map[mem]; - CUdeviceptr buffer; - - size_t bytes; - cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0)); - cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource)); - - return buffer; - } - - return cuda_device_ptr(mem); - } - - void unmap_pixels(device_ptr mem) - { - if(!background) { - PixelMem pmem = pixel_mem_map[mem]; - - cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0)); - } - } - - void pixels_alloc(device_memory& mem) - { - PixelMem pmem; - - pmem.w = mem.data_width; - pmem.h = mem.data_height; - - CUDAContextScope scope(this); - - glGenBuffers(1, &pmem.cuPBO); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - if(mem.data_type == TYPE_HALF) - glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW); - else - glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - glActiveTexture(GL_TEXTURE0); - glGenTextures(1, &pmem.cuTexId); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if(mem.data_type == TYPE_HALF) - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); - else - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_2D, 0); - - CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); - - if(result == CUDA_SUCCESS) { - mem.device_pointer = pmem.cuTexId; - pixel_mem_map[mem.device_pointer] = pmem; - - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - - return; - } - else { - /* failed 
to register buffer, fallback to no interop */ - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - background = true; - } - } - - void pixels_copy_from(device_memory& mem, int y, int w, int h) - { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); - size_t offset = sizeof(uchar)*4*y*w; - memcpy((uchar*)mem.host_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h); - glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - } - - void pixels_free(device_memory& mem) - { - if(mem.device_pointer) { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } - - void draw_pixels( - device_memory& mem, int y, - int w, int h, int width, int height, - int dx, int dy, int dw, int dh, bool transparent, - const DeviceDrawParams &draw_params) - { - assert(mem.type == MEM_PIXELS); - - if(!background) { - const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - float *vpointer; - - CUDAContextScope scope(this); - - /* for multi devices, this assumes the inefficient method that we allocate - * all pixels on the device even though we only render to a subset */ - size_t offset = 4*y*w; - - if(mem.data_type == TYPE_HALF) - offset *= sizeof(GLhalf); - else - offset *= sizeof(uint8_t); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if(mem.data_type == TYPE_HALF) { - 
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void*)offset); - } - else { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset); - } - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - if(transparent) { - glEnable(GL_BLEND); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } - - GLint shader_program; - if(use_fallback_shader) { - if(!bind_fallback_display_space_shader(dw, dh)) { - return; - } - shader_program = fallback_shader_program; - } - else { - draw_params.bind_display_space_shader_cb(); - glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); - } - - if(!vertex_buffer) { - glGenBuffers(1, &vertex_buffer); - } - - glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); - /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */ - glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); - - vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - - if(vpointer) { - /* texture coordinate - vertex pair */ - vpointer[0] = 0.0f; - vpointer[1] = 0.0f; - vpointer[2] = dx; - vpointer[3] = dy; - - vpointer[4] = (float)w/(float)pmem.w; - vpointer[5] = 0.0f; - vpointer[6] = (float)width + dx; - vpointer[7] = dy; - - vpointer[8] = (float)w/(float)pmem.w; - vpointer[9] = (float)h/(float)pmem.h; - vpointer[10] = (float)width + dx; - vpointer[11] = (float)height + dy; - - vpointer[12] = 0.0f; - vpointer[13] = (float)h/(float)pmem.h; - vpointer[14] = dx; - vpointer[15] = (float)height + dy; - - glUnmapBuffer(GL_ARRAY_BUFFER); - } - - GLuint vertex_array_object; - GLuint position_attribute, texcoord_attribute; - - glGenVertexArrays(1, &vertex_array_object); - glBindVertexArray(vertex_array_object); - - texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); - position_attribute = glGetAttribLocation(shader_program, "pos"); - - glEnableVertexAttribArray(texcoord_attribute); - glEnableVertexAttribArray(position_attribute); - - 
glVertexAttribPointer(texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); - glVertexAttribPointer(position_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)(sizeof(float) * 2)); - - glDrawArrays(GL_TRIANGLE_FAN, 0, 4); - - if(use_fallback_shader) { - glUseProgram(0); - } - else { - draw_params.unbind_display_space_shader_cb(); - } - - if(transparent) { - glDisable(GL_BLEND); - } - - glBindTexture(GL_TEXTURE_2D, 0); - - return; - } - - Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params); - } - - void thread_run(DeviceTask *task) - { - CUDAContextScope scope(this); - - if(task->type == DeviceTask::RENDER) { - DeviceRequestedFeatures requested_features; - if(use_split_kernel()) { - if(split_kernel == NULL) { - split_kernel = new CUDASplitKernel(this); - split_kernel->load_kernels(requested_features); - } - } - - device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - - /* keep rendering tiles until done */ - RenderTile tile; - DenoisingTask denoising(this, *task); - - while(task->acquire_tile(this, tile)) { - if(tile.task == RenderTile::PATH_TRACE) { - if(use_split_kernel()) { - device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(task, tile, void_buffer, void_buffer); - } - else { - path_trace(*task, tile, work_tiles); - } - } - else if(tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - - denoise(tile, denoising); - - task->update_progress(&tile, tile.w*tile.h); - } - - task->release_tile(tile); - - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } - } - - work_tiles.free(); - } - else if(task->type == DeviceTask::SHADER) { - shader(*task); - - cuda_assert(cuCtxSynchronize()); - } - } - - class CUDADeviceTask : public DeviceTask { - public: - CUDADeviceTask(CUDADevice *device, DeviceTask& task) - : DeviceTask(task) - { - run = 
function_bind(&CUDADevice::thread_run, device, this); - } - }; - - int get_split_task_count(DeviceTask& /*task*/) - { - return 1; - } - - void task_add(DeviceTask& task) - { - CUDAContextScope scope(this); - - /* Load texture info. */ - load_texture_info(); - - /* Synchronize all memory copies before executing task. */ - cuda_assert(cuCtxSynchronize()); - - if(task.type == DeviceTask::FILM_CONVERT) { - /* must be done in main thread due to opengl access */ - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - } - else { - task_pool.push(new CUDADeviceTask(this, task)); - } - } - - void task_wait() - { - task_pool.wait(); - } - - void task_cancel() - { - task_pool.cancel(); - } - - friend class CUDASplitKernelFunction; - friend class CUDASplitKernel; - friend class CUDAContextScope; +#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ + int threads_per_block; \ + cuda_assert( \ + cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \ + int yblocks = h; + +#define CUDA_LAUNCH_KERNEL_1D(func, args) \ + cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0)); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + int channel_offset = task->nlm_state.is_color ? 
task->buffer.pass_stride : 0; + int frame_offset = 0; + + if (have_error()) + return false; + + CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; + CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts; + CUdeviceptr scale_ptr = 0; + + cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride)); + cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride)); + + { + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; + cuda_assert(cuModuleGetFunction( + &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction( + &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction( + &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts); + + void *calc_difference_args[] = {&guide_ptr, + &variance_ptr, + &scale_ptr, + &difference, + &w, + &h, + &stride, + &pass_stride, + &r, + &channel_offset, + &frame_offset, + &a, + &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = { + &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *update_output_args[] = {&blurDifference, + &image_ptr, + &out_ptr, + &weightAccum, + &w, + &h, + &stride, + &pass_stride, + &channel_offset, + &r, + &f}; + + 
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); + } + + { + CUfunction cuNLMNormalize; + cuda_assert(cuModuleGetFunction( + &cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; + CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + cuda_assert(cuCtxSynchronize()); + } + + return !have_error(); + } + + bool denoising_construct_transform(DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterConstructTransform; + cuda_assert(cuModuleGetFunction( + &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); + cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); + CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h); + + void *args[] = {&task->buffer.mem.device_pointer, + &task->tile_info_mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->filter_area, + &task->rect, + &task->radius, + &task->pca_threshold, + &task->buffer.pass_stride, + &task->buffer.frame_stride, + &task->buffer.use_time}; + CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + int r = task->radius; + int f = 4; + float a = 1.0f; + float k_2 = task->nlm_k_2; + + int 
w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + + if (have_error()) + return false; + + CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; + cuda_assert(cuModuleGetFunction( + &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction( + &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction( + &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, + task->reconstruction_state.source_w * + task->reconstruction_state.source_h, + num_shifts); + + void *calc_difference_args[] = {&color_ptr, + &color_variance_ptr, + &scale_ptr, + &difference, + &w, + &h, + &stride, + &pass_stride, + &r, + &pass_stride, + &frame_offset, + &a, + &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = { + &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *construct_gramian_args[] = {&t, + &blurDifference, + 
&task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->reconstruction_state.filter_window, + &w, + &h, + &stride, + &pass_stride, + &r, + &f, + &frame_offset, + &task->buffer.use_time}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_solve(device_ptr output_ptr, DenoisingTask *task) + { + CUfunction cuFinalize; + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + void *finalize_args[] = {&output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_GET_BLOCKSIZE( + cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h); + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterCombineHalves; + cuda_assert(cuModuleGetFunction( + &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); + cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterCombineHalves, task->rect.z - task->rect.x, 
task->rect.w - task->rect.y); + + void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r}; + CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterDivideShadow; + cuda_assert(cuModuleGetFunction( + &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tile_info_mem.device_pointer, + &a_ptr, + &b_ptr, + &sample_variance_ptr, + &sv_variance_ptr, + &buffer_variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.offset}; + CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterGetFeature; + cuda_assert(cuModuleGetFunction( + &cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tile_info_mem.device_pointer, + &mean_offset, + &variance_offset, + &mean_ptr, + &variance_ptr, + &scale, + &task->rect, + &task->render_buffer.pass_stride, + 
&task->render_buffer.offset}; + CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterWriteFeature; + cuda_assert(cuModuleGetFunction( + &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w); + + void *args[] = {&task->render_buffer.samples, + &task->reconstruction_state.buffer_params, + &task->filter_area, + &from_ptr, + &buffer_ptr, + &out_offset, + &task->rect}; + CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterDetectOutliers; + cuda_assert(cuModuleGetFunction( + &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&image_ptr, + &variance_ptr, + &depth_ptr, + &output_ptr, + &task->rect, + &task->buffer.pass_stride}; + + CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + void denoise(RenderTile &rtile, DenoisingTask &denoising) + { + denoising.functions.construct_transform = function_bind( + &CUDADevice::denoising_construct_transform, this, &denoising); + denoising.functions.accumulate = 
function_bind( + &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); + denoising.functions.divide_shadow = function_bind( + &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind( + &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind( + &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind( + &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind( + &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); + denoising.functions.detect_outliers = function_bind( + &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + denoising.buffer.gpu_temporary_mem = true; + + denoising.run_denoising(&rtile); + } + + void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles) + { + scoped_timer timer(&rtile.buffers->render_time); + + if (have_error()) + return; + + CUDAContextScope scope(this); + CUfunction cuPathTrace; + + /* Get kernel function. */ + if (task.integrator_branched) { + cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); + } + else { + cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); + } + + if (have_error()) { + return; + } + + cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); + + /* Allocate work tile. 
*/ + work_tiles.alloc(1); + + WorkTile *wtile = work_tiles.data(); + wtile->x = rtile.x; + wtile->y = rtile.y; + wtile->w = rtile.w; + wtile->h = rtile.h; + wtile->offset = rtile.offset; + wtile->stride = rtile.stride; + wtile->buffer = (float *)cuda_device_ptr(rtile.buffer); + + /* Prepare work size. More step samples render faster, but for now we + * remain conservative for GPUs connected to a display to avoid driver + * timeouts and display freezing. */ + int min_blocks, num_threads_per_block; + cuda_assert(cuOccupancyMaxPotentialBlockSize( + &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); + if (!info.display_device) { + min_blocks *= 8; + } + + uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); + + /* Render all samples. */ + int start_sample = rtile.start_sample; + int end_sample = rtile.start_sample + rtile.num_samples; + + for (int sample = start_sample; sample < end_sample; sample += step_samples) { + /* Setup and copy work tile to device. */ + wtile->start_sample = sample; + wtile->num_samples = min(step_samples, end_sample - sample); + work_tiles.copy_to_device(); + + CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); + uint total_work_size = wtile->w * wtile->h * wtile->num_samples; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + + /* Launch kernel. */ + void *args[] = {&d_work_tiles, &total_work_size}; + + cuda_assert(cuLaunchKernel( + cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + + cuda_assert(cuCtxSynchronize()); + + /* Update progress. 
*/ + rtile.sample = sample + wtile->num_samples; + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + } + + void film_convert(DeviceTask &task, + device_ptr buffer, + device_ptr rgba_byte, + device_ptr rgba_half) + { + if (have_error()) + return; + + CUDAContextScope scope(this); + + CUfunction cuFilmConvert; + CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half); + CUdeviceptr d_buffer = cuda_device_ptr(buffer); + + /* get kernel function */ + if (rgba_half) { + cuda_assert( + cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float")); + } + else { + cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte")); + } + + float sample_scale = 1.0f / (task.sample + 1); + + /* pass in parameters */ + void *args[] = {&d_rgba, + &d_buffer, + &sample_scale, + &task.x, + &task.y, + &task.w, + &task.h, + &task.offset, + &task.stride}; + + /* launch kernel */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute( + &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert)); + + int xthreads = (int)sqrt(threads_per_block); + int ythreads = (int)sqrt(threads_per_block); + int xblocks = (task.w + xthreads - 1) / xthreads; + int yblocks = (task.h + ythreads - 1) / ythreads; + + cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1)); + + cuda_assert(cuLaunchKernel(cuFilmConvert, + xblocks, + yblocks, + 1, /* blocks */ + xthreads, + ythreads, + 1, /* threads */ + 0, + 0, + args, + 0)); + + unmap_pixels((rgba_byte) ? 
rgba_byte : rgba_half); + + cuda_assert(cuCtxSynchronize()); + } + + void shader(DeviceTask &task) + { + if (have_error()) + return; + + CUDAContextScope scope(this); + + CUfunction cuShader; + CUdeviceptr d_input = cuda_device_ptr(task.shader_input); + CUdeviceptr d_output = cuda_device_ptr(task.shader_output); + + /* get kernel function */ + if (task.shader_eval_type >= SHADER_EVAL_BAKE) { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake")); + } + else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); + } + else { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); + } + + /* do tasks in smaller chunks, so we can cancel it */ + const int shader_chunk_size = 65536; + const int start = task.shader_x; + const int end = task.shader_x + task.shader_w; + int offset = task.offset; + + bool canceled = false; + for (int sample = 0; sample < task.num_samples && !canceled; sample++) { + for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) { + int shader_w = min(shader_chunk_size, end - shader_x); + + /* pass in parameters */ + void *args[8]; + int arg = 0; + args[arg++] = &d_input; + args[arg++] = &d_output; + args[arg++] = &task.shader_eval_type; + if (task.shader_eval_type >= SHADER_EVAL_BAKE) { + args[arg++] = &task.shader_filter; + } + args[arg++] = &shader_x; + args[arg++] = &shader_w; + args[arg++] = &offset; + args[arg++] = &sample; + + /* launch kernel */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute( + &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); + + int xblocks = (shader_w + threads_per_block - 1) / threads_per_block; + + cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuLaunchKernel(cuShader, + xblocks, + 1, + 1, /* blocks */ + threads_per_block, + 1, + 1, /* threads */ + 0, + 0, + args, + 0)); + + cuda_assert(cuCtxSynchronize()); 
+ + if (task.get_cancel()) { + canceled = true; + break; + } + } + + task.update_progress(NULL); + } + } + + CUdeviceptr map_pixels(device_ptr mem) + { + if (!background) { + PixelMem pmem = pixel_mem_map[mem]; + CUdeviceptr buffer; + + size_t bytes; + cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0)); + cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource)); + + return buffer; + } + + return cuda_device_ptr(mem); + } + + void unmap_pixels(device_ptr mem) + { + if (!background) { + PixelMem pmem = pixel_mem_map[mem]; + + cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0)); + } + } + + void pixels_alloc(device_memory &mem) + { + PixelMem pmem; + + pmem.w = mem.data_width; + pmem.h = mem.data_height; + + CUDAContextScope scope(this); + + glGenBuffers(1, &pmem.cuPBO); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); + if (mem.data_type == TYPE_HALF) + glBufferData( + GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW); + else + glBufferData( + GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + glActiveTexture(GL_TEXTURE0); + glGenTextures(1, &pmem.cuTexId); + glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); + if (mem.data_type == TYPE_HALF) + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); + else + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glBindTexture(GL_TEXTURE_2D, 0); + + CUresult result = cuGraphicsGLRegisterBuffer( + &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); + + if (result == CUDA_SUCCESS) { + mem.device_pointer = pmem.cuTexId; + pixel_mem_map[mem.device_pointer] = pmem; + + mem.device_size = mem.memory_size(); + 
stats.mem_alloc(mem.device_size); + + return; + } + else { + /* failed to register buffer, fallback to no interop */ + glDeleteBuffers(1, &pmem.cuPBO); + glDeleteTextures(1, &pmem.cuTexId); + + background = true; + } + } + + void pixels_copy_from(device_memory &mem, int y, int w, int h) + { + PixelMem pmem = pixel_mem_map[mem.device_pointer]; + + CUDAContextScope scope(this); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); + uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); + size_t offset = sizeof(uchar) * 4 * y * w; + memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h); + glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + } + + void pixels_free(device_memory &mem) + { + if (mem.device_pointer) { + PixelMem pmem = pixel_mem_map[mem.device_pointer]; + + CUDAContextScope scope(this); + + cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); + glDeleteBuffers(1, &pmem.cuPBO); + glDeleteTextures(1, &pmem.cuTexId); + + pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); + mem.device_pointer = 0; + + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + } + + void draw_pixels(device_memory &mem, + int y, + int w, + int h, + int width, + int height, + int dx, + int dy, + int dw, + int dh, + bool transparent, + const DeviceDrawParams &draw_params) + { + assert(mem.type == MEM_PIXELS); + + if (!background) { + const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); + PixelMem pmem = pixel_mem_map[mem.device_pointer]; + float *vpointer; + + CUDAContextScope scope(this); + + /* for multi devices, this assumes the inefficient method that we allocate + * all pixels on the device even though we only render to a subset */ + size_t offset = 4 * y * w; + + if (mem.data_type == TYPE_HALF) + offset *= sizeof(GLhalf); + else + offset *= sizeof(uint8_t); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); + 
glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); + if (mem.data_type == TYPE_HALF) { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset); + } + else { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset); + } + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + if (transparent) { + glEnable(GL_BLEND); + glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); + } + + GLint shader_program; + if (use_fallback_shader) { + if (!bind_fallback_display_space_shader(dw, dh)) { + return; + } + shader_program = fallback_shader_program; + } + else { + draw_params.bind_display_space_shader_cb(); + glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); + } + + if (!vertex_buffer) { + glGenBuffers(1, &vertex_buffer); + } + + glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); + /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */ + glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); + + vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); + + if (vpointer) { + /* texture coordinate - vertex pair */ + vpointer[0] = 0.0f; + vpointer[1] = 0.0f; + vpointer[2] = dx; + vpointer[3] = dy; + + vpointer[4] = (float)w / (float)pmem.w; + vpointer[5] = 0.0f; + vpointer[6] = (float)width + dx; + vpointer[7] = dy; + + vpointer[8] = (float)w / (float)pmem.w; + vpointer[9] = (float)h / (float)pmem.h; + vpointer[10] = (float)width + dx; + vpointer[11] = (float)height + dy; + + vpointer[12] = 0.0f; + vpointer[13] = (float)h / (float)pmem.h; + vpointer[14] = dx; + vpointer[15] = (float)height + dy; + + glUnmapBuffer(GL_ARRAY_BUFFER); + } + + GLuint vertex_array_object; + GLuint position_attribute, texcoord_attribute; + + glGenVertexArrays(1, &vertex_array_object); + glBindVertexArray(vertex_array_object); + + texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); + position_attribute = glGetAttribLocation(shader_program, "pos"); + + 
glEnableVertexAttribArray(texcoord_attribute); + glEnableVertexAttribArray(position_attribute); + + glVertexAttribPointer( + texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); + glVertexAttribPointer(position_attribute, + 2, + GL_FLOAT, + GL_FALSE, + 4 * sizeof(float), + (const GLvoid *)(sizeof(float) * 2)); + + glDrawArrays(GL_TRIANGLE_FAN, 0, 4); + + if (use_fallback_shader) { + glUseProgram(0); + } + else { + draw_params.unbind_display_space_shader_cb(); + } + + if (transparent) { + glDisable(GL_BLEND); + } + + glBindTexture(GL_TEXTURE_2D, 0); + + return; + } + + Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params); + } + + void thread_run(DeviceTask *task) + { + CUDAContextScope scope(this); + + if (task->type == DeviceTask::RENDER) { + DeviceRequestedFeatures requested_features; + if (use_split_kernel()) { + if (split_kernel == NULL) { + split_kernel = new CUDASplitKernel(this); + split_kernel->load_kernels(requested_features); + } + } + + device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); + + /* keep rendering tiles until done */ + RenderTile tile; + DenoisingTask denoising(this, *task); + + while (task->acquire_tile(this, tile)) { + if (tile.task == RenderTile::PATH_TRACE) { + if (use_split_kernel()) { + device_only_memory<uchar> void_buffer(this, "void_buffer"); + split_kernel->path_trace(task, tile, void_buffer, void_buffer); + } + else { + path_trace(*task, tile, work_tiles); + } + } + else if (tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + + denoise(tile, denoising); + + task->update_progress(&tile, tile.w * tile.h); + } + + task->release_tile(tile); + + if (task->get_cancel()) { + if (task->need_finish_queue == false) + break; + } + } + + work_tiles.free(); + } + else if (task->type == DeviceTask::SHADER) { + shader(*task); + + cuda_assert(cuCtxSynchronize()); + } + } + + class CUDADeviceTask : public DeviceTask { + 
public: + CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task) + { + run = function_bind(&CUDADevice::thread_run, device, this); + } + }; + + int get_split_task_count(DeviceTask & /*task*/) + { + return 1; + } + + void task_add(DeviceTask &task) + { + CUDAContextScope scope(this); + + /* Load texture info. */ + load_texture_info(); + + /* Synchronize all memory copies before executing task. */ + cuda_assert(cuCtxSynchronize()); + + if (task.type == DeviceTask::FILM_CONVERT) { + /* must be done in main thread due to opengl access */ + film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); + } + else { + task_pool.push(new CUDADeviceTask(this, task)); + } + } + + void task_wait() + { + task_pool.wait(); + } + + void task_cancel() + { + task_pool.cancel(); + } + + friend class CUDASplitKernelFunction; + friend class CUDASplitKernel; + friend class CUDAContextScope; }; /* redefine the cuda_assert macro so it can be used outside of the CUDADevice class @@ -2207,496 +2305,501 @@ public: */ #undef cuda_assert #define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ - \ - if(result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ - if(device->error_msg == "") \ - device->error_msg = message; \ - fprintf(stderr, "%s\n", message.c_str()); \ - /*cuda_abort();*/ \ - device->cuda_error_documentation(); \ - } \ - } (void) 0 - + { \ + CUresult result = stmt; \ +\ + if (result != CUDA_SUCCESS) { \ + string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + if (device->error_msg == "") \ + device->error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + /*cuda_abort();*/ \ + device->cuda_error_documentation(); \ + } \ + } \ + (void)0 /* CUDA context scope. 
*/ -CUDAContextScope::CUDAContextScope(CUDADevice *device) -: device(device) +CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device) { - cuda_assert(cuCtxPushCurrent(device->cuContext)); + cuda_assert(cuCtxPushCurrent(device->cuContext)); } CUDAContextScope::~CUDAContextScope() { - cuda_assert(cuCtxPopCurrent(NULL)); + cuda_assert(cuCtxPopCurrent(NULL)); } /* split kernel */ -class CUDASplitKernelFunction : public SplitKernelFunction{ - CUDADevice* device; - CUfunction func; -public: - CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {} - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/) - { - return enqueue(dim, NULL); - } - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, void *args[]) - { - if(device->have_error()) - return false; - - CUDAContextScope scope(device); - - /* we ignore dim.local_size for now, as this is faster */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); - - int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(func, - xblocks, 1, 1, /* blocks */ - threads_per_block, 1, 1, /* threads */ - 0, 0, args, 0)); - - return !device->have_error(); - } +class CUDASplitKernelFunction : public SplitKernelFunction { + CUDADevice *device; + CUfunction func; + + public: + CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) + { + } + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/) + { + return enqueue(dim, NULL); + } + + /* enqueue the kernel, returns false if there is an error 
*/ + bool enqueue(const KernelDimensions &dim, void *args[]) + { + if (device->have_error()) + return false; + + CUDAContextScope scope(device); + + /* we ignore dim.local_size for now, as this is faster */ + int threads_per_block; + cuda_assert( + cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); + + int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) / + threads_per_block; + + cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); + + cuda_assert(cuLaunchKernel(func, + xblocks, + 1, + 1, /* blocks */ + threads_per_block, + 1, + 1, /* threads */ + 0, + 0, + args, + 0)); + + return !device->have_error(); + } }; CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device) { } -uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads) +uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/, + device_memory & /*data*/, + size_t num_threads) { - CUDAContextScope scope(device); + CUDAContextScope scope(device); - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); + device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); + size_buffer.alloc(1); + size_buffer.zero_to_device(); - uint threads = num_threads; - CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); + uint threads = num_threads; + CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); - struct args_t { - uint* num_threads; - CUdeviceptr* size; - }; + struct args_t { + uint *num_threads; + CUdeviceptr *size; + }; - args_t args = { - &threads, - &d_size - }; + args_t args = {&threads, &d_size}; - CUfunction state_buffer_size; - cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); + CUfunction state_buffer_size; + cuda_assert( + 
cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); - cuda_assert(cuLaunchKernel(state_buffer_size, - 1, 1, 1, - 1, 1, 1, - 0, 0, (void**)&args, 0)); + cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0)); - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); + size_buffer.copy_from_device(0, 1, 1); + size_t size = size_buffer[0]; + size_buffer.free(); - return size; + return size; } -bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& /*kernel_globals*/, - device_memory& /*kernel_data*/, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs) +bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory & /*kernel_globals*/, + device_memory & /*kernel_data*/, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs) { - CUDAContextScope scope(device); - - CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer); - CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer); - CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer); - CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer); - CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer); - - CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer); - - int end_sample = rtile.start_sample + rtile.num_samples; - int queue_size = dim.global_size[0] * dim.global_size[1]; - - struct args_t { - CUdeviceptr* split_data_buffer; - int* num_elements; - CUdeviceptr* ray_state; - int* start_sample; - int* 
end_sample; - int* sx; - int* sy; - int* sw; - int* sh; - int* offset; - int* stride; - CUdeviceptr* queue_index; - int* queuesize; - CUdeviceptr* use_queues_flag; - CUdeviceptr* work_pool_wgs; - int* num_samples; - CUdeviceptr* buffer; - }; - - args_t args = { - &d_split_data, - &num_global_elements, - &d_ray_state, - &rtile.start_sample, - &end_sample, - &rtile.x, - &rtile.y, - &rtile.w, - &rtile.h, - &rtile.offset, - &rtile.stride, - &d_queue_index, - &queue_size, - &d_use_queues_flag, - &d_work_pool_wgs, - &rtile.num_samples, - &d_buffer - }; - - CUfunction data_init; - cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); - if(device->have_error()) { - return false; - } - - CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args); - - return !device->have_error(); + CUDAContextScope scope(device); + + CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer); + CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer); + CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer); + CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer); + CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer); + + CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer); + + int end_sample = rtile.start_sample + rtile.num_samples; + int queue_size = dim.global_size[0] * dim.global_size[1]; + + struct args_t { + CUdeviceptr *split_data_buffer; + int *num_elements; + CUdeviceptr *ray_state; + int *start_sample; + int *end_sample; + int *sx; + int *sy; + int *sw; + int *sh; + int *offset; + int *stride; + CUdeviceptr *queue_index; + int *queuesize; + CUdeviceptr *use_queues_flag; + CUdeviceptr *work_pool_wgs; + int *num_samples; + CUdeviceptr *buffer; + }; + + args_t args = {&d_split_data, + &num_global_elements, + &d_ray_state, + &rtile.start_sample, + &end_sample, + &rtile.x, + &rtile.y, + 
&rtile.w, + &rtile.h, + &rtile.offset, + &rtile.stride, + &d_queue_index, + &queue_size, + &d_use_queues_flag, + &d_work_pool_wgs, + &rtile.num_samples, + &d_buffer}; + + CUfunction data_init; + cuda_assert( + cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); + if (device->have_error()) { + return false; + } + + CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args); + + return !device->have_error(); } -SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures&) +SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name, + const DeviceRequestedFeatures &) { - CUDAContextScope scope(device); - CUfunction func; - - cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); - if(device->have_error()) { - device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); - return NULL; - } - - return new CUDASplitKernelFunction(device, func); + CUDAContextScope scope(device); + CUfunction func; + + cuda_assert( + cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); + if (device->have_error()) { + device->cuda_error_message( + string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); + return NULL; + } + + return new CUDASplitKernelFunction(device, func); } int2 CUDASplitKernel::split_kernel_local_size() { - return make_int2(32, 1); + return make_int2(32, 1); } -int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/) +int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg, + device_memory &data, + DeviceTask * /*task*/) { - CUDAContextScope scope(device); - size_t free; - size_t total; + CUDAContextScope scope(device); + size_t free; + size_t total; - cuda_assert(cuMemGetInfo(&free, &total)); + 
cuda_assert(cuMemGetInfo(&free, &total)); - VLOG(1) << "Maximum device allocation size: " - << string_human_readable_number(free) << " bytes. (" - << string_human_readable_size(free) << ")."; + VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free) + << " bytes. (" << string_human_readable_size(free) << ")."; - size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); - size_t side = round_down((int)sqrt(num_elements), 32); - int2 global_size = make_int2(side, round_down(num_elements / side, 16)); - VLOG(1) << "Global size: " << global_size << "."; - return global_size; + size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); + size_t side = round_down((int)sqrt(num_elements), 32); + int2 global_size = make_int2(side, round_down(num_elements / side, 16)); + VLOG(1) << "Global size: " << global_size << "."; + return global_size; } bool device_cuda_init() { #ifdef WITH_CUDA_DYNLOAD - static bool initialized = false; - static bool result = false; - - if(initialized) - return result; - - initialized = true; - int cuew_result = cuewInit(CUEW_INIT_CUDA); - if(cuew_result == CUEW_SUCCESS) { - VLOG(1) << "CUEW initialization succeeded"; - if(CUDADevice::have_precompiled_kernels()) { - VLOG(1) << "Found precompiled kernels"; - result = true; - } -#ifndef _WIN32 - else if(cuewCompilerPath() != NULL) { - VLOG(1) << "Found CUDA compiler " << cuewCompilerPath(); - result = true; - } - else { - VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found," - << " unable to use CUDA"; - } -#endif - } - else { - VLOG(1) << "CUEW initialization failed: " - << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED) - ? 
"Error setting up atexit() handler" - : "Error opening the library"); - } - - return result; + static bool initialized = false; + static bool result = false; + + if (initialized) + return result; + + initialized = true; + int cuew_result = cuewInit(CUEW_INIT_CUDA); + if (cuew_result == CUEW_SUCCESS) { + VLOG(1) << "CUEW initialization succeeded"; + if (CUDADevice::have_precompiled_kernels()) { + VLOG(1) << "Found precompiled kernels"; + result = true; + } +# ifndef _WIN32 + else if (cuewCompilerPath() != NULL) { + VLOG(1) << "Found CUDA compiler " << cuewCompilerPath(); + result = true; + } + else { + VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found," + << " unable to use CUDA"; + } +# endif + } + else { + VLOG(1) << "CUEW initialization failed: " + << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" : + "Error opening the library"); + } + + return result; #else /* WITH_CUDA_DYNLOAD */ - return true; -#endif /* WITH_CUDA_DYNLOAD */ + return true; +#endif /* WITH_CUDA_DYNLOAD */ } -Device *device_cuda_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background) +Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) { - return new CUDADevice(info, stats, profiler, background); + return new CUDADevice(info, stats, profiler, background); } static CUresult device_cuda_safe_init() { #ifdef _WIN32 - __try { - return cuInit(0); - } - __except(EXCEPTION_EXECUTE_HANDLER) { - /* Ignore crashes inside the CUDA driver and hope we can - * survive even with corrupted CUDA installs. */ - fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n"); - } - - return CUDA_ERROR_NO_DEVICE; + __try { + return cuInit(0); + } + __except (EXCEPTION_EXECUTE_HANDLER) { + /* Ignore crashes inside the CUDA driver and hope we can + * survive even with corrupted CUDA installs. 
*/ + fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n"); + } + + return CUDA_ERROR_NO_DEVICE; #else - return cuInit(0); + return cuInit(0); #endif } -void device_cuda_info(vector<DeviceInfo>& devices) +void device_cuda_info(vector<DeviceInfo> &devices) { - CUresult result = device_cuda_safe_init(); - if(result != CUDA_SUCCESS) { - if(result != CUDA_ERROR_NO_DEVICE) - fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result)); - return; - } - - int count = 0; - result = cuDeviceGetCount(&count); - if(result != CUDA_SUCCESS) { - fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result)); - return; - } - - vector<DeviceInfo> display_devices; - - for(int num = 0; num < count; num++) { - char name[256]; - - result = cuDeviceGetName(name, 256, num); - if(result != CUDA_SUCCESS) { - fprintf(stderr, "CUDA cuDeviceGetName: %s\n", cuewErrorString(result)); - continue; - } - - int major; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num); - if(major < 3) { - VLOG(1) << "Ignoring device \"" << name - << "\", this graphics card is no longer supported."; - continue; - } - - DeviceInfo info; - - info.type = DEVICE_CUDA; - info.description = string(name); - info.num = num; - - info.has_half_images = (major >= 3); - info.has_volume_decoupled = false; - - int pci_location[3] = {0, 0, 0}; - cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num); - cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num); - cuDeviceGetAttribute(&pci_location[2], CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, num); - info.id = string_printf("CUDA_%s_%04x:%02x:%02x", - name, - (unsigned int)pci_location[0], - (unsigned int)pci_location[1], - (unsigned int)pci_location[2]); - - /* If device has a kernel timeout and no compute preemption, we assume - * it is connected to a display and will freeze the display while doing - * computations. 
*/ - int timeout_attr = 0, preempt_attr = 0; - cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num); - cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num); - - if(timeout_attr && !preempt_attr) { - VLOG(1) << "Device is recognized as display."; - info.description += " (Display)"; - info.display_device = true; - display_devices.push_back(info); - } - else { - devices.push_back(info); - } - VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\"."; - } - - if(!display_devices.empty()) - devices.insert(devices.end(), display_devices.begin(), display_devices.end()); + CUresult result = device_cuda_safe_init(); + if (result != CUDA_SUCCESS) { + if (result != CUDA_ERROR_NO_DEVICE) + fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result)); + return; + } + + int count = 0; + result = cuDeviceGetCount(&count); + if (result != CUDA_SUCCESS) { + fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result)); + return; + } + + vector<DeviceInfo> display_devices; + + for (int num = 0; num < count; num++) { + char name[256]; + + result = cuDeviceGetName(name, 256, num); + if (result != CUDA_SUCCESS) { + fprintf(stderr, "CUDA cuDeviceGetName: %s\n", cuewErrorString(result)); + continue; + } + + int major; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num); + if (major < 3) { + VLOG(1) << "Ignoring device \"" << name << "\", this graphics card is no longer supported."; + continue; + } + + DeviceInfo info; + + info.type = DEVICE_CUDA; + info.description = string(name); + info.num = num; + + info.has_half_images = (major >= 3); + info.has_volume_decoupled = false; + + int pci_location[3] = {0, 0, 0}; + cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num); + cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num); + cuDeviceGetAttribute(&pci_location[2], CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, num); + info.id = 
string_printf("CUDA_%s_%04x:%02x:%02x", + name, + (unsigned int)pci_location[0], + (unsigned int)pci_location[1], + (unsigned int)pci_location[2]); + + /* If device has a kernel timeout and no compute preemption, we assume + * it is connected to a display and will freeze the display while doing + * computations. */ + int timeout_attr = 0, preempt_attr = 0; + cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num); + cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num); + + if (timeout_attr && !preempt_attr) { + VLOG(1) << "Device is recognized as display."; + info.description += " (Display)"; + info.display_device = true; + display_devices.push_back(info); + } + else { + devices.push_back(info); + } + VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\"."; + } + + if (!display_devices.empty()) + devices.insert(devices.end(), display_devices.begin(), display_devices.end()); } string device_cuda_capabilities() { - CUresult result = device_cuda_safe_init(); - if(result != CUDA_SUCCESS) { - if(result != CUDA_ERROR_NO_DEVICE) { - return string("Error initializing CUDA: ") + cuewErrorString(result); - } - return "No CUDA device found\n"; - } - - int count; - result = cuDeviceGetCount(&count); - if(result != CUDA_SUCCESS) { - return string("Error getting devices: ") + cuewErrorString(result); - } - - string capabilities = ""; - for(int num = 0; num < count; num++) { - char name[256]; - if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS) { - continue; - } - capabilities += string("\t") + name + "\n"; - int value; + CUresult result = device_cuda_safe_init(); + if (result != CUDA_SUCCESS) { + if (result != CUDA_ERROR_NO_DEVICE) { + return string("Error initializing CUDA: ") + cuewErrorString(result); + } + return "No CUDA device found\n"; + } + + int count; + result = cuDeviceGetCount(&count); + if (result != CUDA_SUCCESS) { + return string("Error getting devices: ") + 
cuewErrorString(result); + } + + string capabilities = ""; + for (int num = 0; num < count; num++) { + char name[256]; + if (cuDeviceGetName(name, 256, num) != CUDA_SUCCESS) { + continue; + } + capabilities += string("\t") + name + "\n"; + int value; #define GET_ATTR(attr) \ - { \ - if(cuDeviceGetAttribute(&value, \ - CU_DEVICE_ATTRIBUTE_##attr, \ - num) == CUDA_SUCCESS) \ - { \ - capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", \ - value); \ - } \ - } (void) 0 - /* TODO(sergey): Strip all attributes which are not useful for us - * or does not depend on the driver. - */ - GET_ATTR(MAX_THREADS_PER_BLOCK); - GET_ATTR(MAX_BLOCK_DIM_X); - GET_ATTR(MAX_BLOCK_DIM_Y); - GET_ATTR(MAX_BLOCK_DIM_Z); - GET_ATTR(MAX_GRID_DIM_X); - GET_ATTR(MAX_GRID_DIM_Y); - GET_ATTR(MAX_GRID_DIM_Z); - GET_ATTR(MAX_SHARED_MEMORY_PER_BLOCK); - GET_ATTR(SHARED_MEMORY_PER_BLOCK); - GET_ATTR(TOTAL_CONSTANT_MEMORY); - GET_ATTR(WARP_SIZE); - GET_ATTR(MAX_PITCH); - GET_ATTR(MAX_REGISTERS_PER_BLOCK); - GET_ATTR(REGISTERS_PER_BLOCK); - GET_ATTR(CLOCK_RATE); - GET_ATTR(TEXTURE_ALIGNMENT); - GET_ATTR(GPU_OVERLAP); - GET_ATTR(MULTIPROCESSOR_COUNT); - GET_ATTR(KERNEL_EXEC_TIMEOUT); - GET_ATTR(INTEGRATED); - GET_ATTR(CAN_MAP_HOST_MEMORY); - GET_ATTR(COMPUTE_MODE); - GET_ATTR(MAXIMUM_TEXTURE1D_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH); - GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES); - GET_ATTR(SURFACE_ALIGNMENT); - GET_ATTR(CONCURRENT_KERNELS); - GET_ATTR(ECC_ENABLED); - GET_ATTR(TCC_DRIVER); - GET_ATTR(MEMORY_CLOCK_RATE); - GET_ATTR(GLOBAL_MEMORY_BUS_WIDTH); - GET_ATTR(L2_CACHE_SIZE); - 
GET_ATTR(MAX_THREADS_PER_MULTIPROCESSOR); - GET_ATTR(ASYNC_ENGINE_COUNT); - GET_ATTR(UNIFIED_ADDRESSING); - GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_LAYERS); - GET_ATTR(CAN_TEX2D_GATHER); - GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE); - GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE); - GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE); - GET_ATTR(TEXTURE_PITCH_ALIGNMENT); - GET_ATTR(MAXIMUM_TEXTURECUBEMAP_WIDTH); - GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_SURFACE1D_WIDTH); - GET_ATTR(MAXIMUM_SURFACE2D_WIDTH); - GET_ATTR(MAXIMUM_SURFACE2D_HEIGHT); - GET_ATTR(MAXIMUM_SURFACE3D_WIDTH); - GET_ATTR(MAXIMUM_SURFACE3D_HEIGHT); - GET_ATTR(MAXIMUM_SURFACE3D_DEPTH); - GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_HEIGHT); - GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_SURFACECUBEMAP_WIDTH); - GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_TEXTURE1D_LINEAR_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_PITCH); - GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT); - GET_ATTR(COMPUTE_CAPABILITY_MAJOR); - GET_ATTR(COMPUTE_CAPABILITY_MINOR); - GET_ATTR(MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH); - GET_ATTR(STREAM_PRIORITIES_SUPPORTED); - GET_ATTR(GLOBAL_L1_CACHE_SUPPORTED); - GET_ATTR(LOCAL_L1_CACHE_SUPPORTED); - GET_ATTR(MAX_SHARED_MEMORY_PER_MULTIPROCESSOR); - GET_ATTR(MAX_REGISTERS_PER_MULTIPROCESSOR); - GET_ATTR(MANAGED_MEMORY); - GET_ATTR(MULTI_GPU_BOARD); - GET_ATTR(MULTI_GPU_BOARD_GROUP_ID); + { \ + if (cuDeviceGetAttribute(&value, 
CU_DEVICE_ATTRIBUTE_##attr, num) == CUDA_SUCCESS) { \ + capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", value); \ + } \ + } \ + (void)0 + /* TODO(sergey): Strip all attributes which are not useful for us + * or does not depend on the driver. + */ + GET_ATTR(MAX_THREADS_PER_BLOCK); + GET_ATTR(MAX_BLOCK_DIM_X); + GET_ATTR(MAX_BLOCK_DIM_Y); + GET_ATTR(MAX_BLOCK_DIM_Z); + GET_ATTR(MAX_GRID_DIM_X); + GET_ATTR(MAX_GRID_DIM_Y); + GET_ATTR(MAX_GRID_DIM_Z); + GET_ATTR(MAX_SHARED_MEMORY_PER_BLOCK); + GET_ATTR(SHARED_MEMORY_PER_BLOCK); + GET_ATTR(TOTAL_CONSTANT_MEMORY); + GET_ATTR(WARP_SIZE); + GET_ATTR(MAX_PITCH); + GET_ATTR(MAX_REGISTERS_PER_BLOCK); + GET_ATTR(REGISTERS_PER_BLOCK); + GET_ATTR(CLOCK_RATE); + GET_ATTR(TEXTURE_ALIGNMENT); + GET_ATTR(GPU_OVERLAP); + GET_ATTR(MULTIPROCESSOR_COUNT); + GET_ATTR(KERNEL_EXEC_TIMEOUT); + GET_ATTR(INTEGRATED); + GET_ATTR(CAN_MAP_HOST_MEMORY); + GET_ATTR(COMPUTE_MODE); + GET_ATTR(MAXIMUM_TEXTURE1D_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH); + GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES); + GET_ATTR(SURFACE_ALIGNMENT); + GET_ATTR(CONCURRENT_KERNELS); + GET_ATTR(ECC_ENABLED); + GET_ATTR(TCC_DRIVER); + GET_ATTR(MEMORY_CLOCK_RATE); + GET_ATTR(GLOBAL_MEMORY_BUS_WIDTH); + GET_ATTR(L2_CACHE_SIZE); + GET_ATTR(MAX_THREADS_PER_MULTIPROCESSOR); + GET_ATTR(ASYNC_ENGINE_COUNT); + GET_ATTR(UNIFIED_ADDRESSING); + GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_LAYERS); + GET_ATTR(CAN_TEX2D_GATHER); + GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_HEIGHT); + 
GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE); + GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE); + GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE); + GET_ATTR(TEXTURE_PITCH_ALIGNMENT); + GET_ATTR(MAXIMUM_TEXTURECUBEMAP_WIDTH); + GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_SURFACE1D_WIDTH); + GET_ATTR(MAXIMUM_SURFACE2D_WIDTH); + GET_ATTR(MAXIMUM_SURFACE2D_HEIGHT); + GET_ATTR(MAXIMUM_SURFACE3D_WIDTH); + GET_ATTR(MAXIMUM_SURFACE3D_HEIGHT); + GET_ATTR(MAXIMUM_SURFACE3D_DEPTH); + GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_HEIGHT); + GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_SURFACECUBEMAP_WIDTH); + GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_TEXTURE1D_LINEAR_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_PITCH); + GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT); + GET_ATTR(COMPUTE_CAPABILITY_MAJOR); + GET_ATTR(COMPUTE_CAPABILITY_MINOR); + GET_ATTR(MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH); + GET_ATTR(STREAM_PRIORITIES_SUPPORTED); + GET_ATTR(GLOBAL_L1_CACHE_SUPPORTED); + GET_ATTR(LOCAL_L1_CACHE_SUPPORTED); + GET_ATTR(MAX_SHARED_MEMORY_PER_MULTIPROCESSOR); + GET_ATTR(MAX_REGISTERS_PER_MULTIPROCESSOR); + GET_ATTR(MANAGED_MEMORY); + GET_ATTR(MULTI_GPU_BOARD); + GET_ATTR(MULTI_GPU_BOARD_GROUP_ID); #undef GET_ATTR - capabilities += "\n"; - } + capabilities += "\n"; + } - return capabilities; + return capabilities; } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 1bb144ef85a..05a7fb8ae4d 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ 
-21,314 +21,329 @@ CCL_NAMESPACE_BEGIN DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) -: tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE), - profiler(NULL), - storage(device), - buffer(device), - device(device) + : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE), + profiler(NULL), + storage(device), + buffer(device), + device(device) { - radius = task.denoising.radius; - nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength)); - if(task.denoising.relative_pca) { - pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength)); - } - else { - pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength)); - } - - render_buffer.frame_stride = task.frame_stride; - render_buffer.pass_stride = task.pass_stride; - render_buffer.offset = task.pass_denoising_data; - - target_buffer.pass_stride = task.target_pass_stride; - target_buffer.denoising_clean_offset = task.pass_denoising_clean; - target_buffer.offset = 0; - - functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device); - functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device); - - tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int)); - tile_info->from_render = task.denoising_from_render? 
1 : 0; - - tile_info->frames[0] = 0; - tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES); - for(int i = 1; i < tile_info->num_frames; i++) { - tile_info->frames[i] = task.denoising_frames[i-1]; - } - - write_passes = task.denoising_write_passes; - do_filter = task.denoising_do_filter; + radius = task.denoising.radius; + nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength)); + if (task.denoising.relative_pca) { + pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength)); + } + else { + pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength)); + } + + render_buffer.frame_stride = task.frame_stride; + render_buffer.pass_stride = task.pass_stride; + render_buffer.offset = task.pass_denoising_data; + + target_buffer.pass_stride = task.target_pass_stride; + target_buffer.denoising_clean_offset = task.pass_denoising_clean; + target_buffer.offset = 0; + + functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device); + functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device); + + tile_info = (TileInfo *)tile_info_mem.alloc(sizeof(TileInfo) / sizeof(int)); + tile_info->from_render = task.denoising_from_render ? 
1 : 0; + + tile_info->frames[0] = 0; + tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES); + for (int i = 1; i < tile_info->num_frames; i++) { + tile_info->frames[i] = task.denoising_frames[i - 1]; + } + + write_passes = task.denoising_write_passes; + do_filter = task.denoising_do_filter; } DenoisingTask::~DenoisingTask() { - storage.XtWX.free(); - storage.XtWY.free(); - storage.transform.free(); - storage.rank.free(); - buffer.mem.free(); - buffer.temporary_mem.free(); - tile_info_mem.free(); + storage.XtWX.free(); + storage.XtWY.free(); + storage.transform.free(); + storage.rank.free(); + buffer.mem.free(); + buffer.temporary_mem.free(); + tile_info_mem.free(); } void DenoisingTask::set_render_buffer(RenderTile *rtiles) { - for(int i = 0; i < 9; i++) { - tile_info->offsets[i] = rtiles[i].offset; - tile_info->strides[i] = rtiles[i].stride; - tile_info->buffers[i] = rtiles[i].buffer; - } - tile_info->x[0] = rtiles[3].x; - tile_info->x[1] = rtiles[4].x; - tile_info->x[2] = rtiles[5].x; - tile_info->x[3] = rtiles[5].x + rtiles[5].w; - tile_info->y[0] = rtiles[1].y; - tile_info->y[1] = rtiles[4].y; - tile_info->y[2] = rtiles[7].y; - tile_info->y[3] = rtiles[7].y + rtiles[7].h; - - target_buffer.offset = rtiles[9].offset; - target_buffer.stride = rtiles[9].stride; - target_buffer.ptr = rtiles[9].buffer; - - if(write_passes && rtiles[9].buffers) { - target_buffer.denoising_output_offset = rtiles[9].buffers->params.get_denoising_prefiltered_offset(); - } - else { - target_buffer.denoising_output_offset = 0; - } - - tile_info_mem.copy_to_device(); + for (int i = 0; i < 9; i++) { + tile_info->offsets[i] = rtiles[i].offset; + tile_info->strides[i] = rtiles[i].stride; + tile_info->buffers[i] = rtiles[i].buffer; + } + tile_info->x[0] = rtiles[3].x; + tile_info->x[1] = rtiles[4].x; + tile_info->x[2] = rtiles[5].x; + tile_info->x[3] = rtiles[5].x + rtiles[5].w; + tile_info->y[0] = rtiles[1].y; + tile_info->y[1] = rtiles[4].y; + tile_info->y[2] = 
rtiles[7].y; + tile_info->y[3] = rtiles[7].y + rtiles[7].h; + + target_buffer.offset = rtiles[9].offset; + target_buffer.stride = rtiles[9].stride; + target_buffer.ptr = rtiles[9].buffer; + + if (write_passes && rtiles[9].buffers) { + target_buffer.denoising_output_offset = + rtiles[9].buffers->params.get_denoising_prefiltered_offset(); + } + else { + target_buffer.denoising_output_offset = 0; + } + + tile_info_mem.copy_to_device(); } void DenoisingTask::setup_denoising_buffer() { - /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */ - rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w); - rect = rect_expand(rect, radius); - rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); - - buffer.use_intensity = write_passes || (tile_info->num_frames > 1); - buffer.passes = buffer.use_intensity? 15 : 14; - buffer.width = rect.z - rect.x; - buffer.stride = align_up(buffer.width, 4); - buffer.h = rect.w - rect.y; - int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); - buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); - buffer.frame_stride = buffer.pass_stride * buffer.passes; - /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ - int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats); - buffer.mem.alloc_to_device(mem_size, false); - buffer.use_time = (tile_info->num_frames > 1); - - /* CPUs process shifts sequentially while GPUs process them in parallel. */ - int num_layers; - if(buffer.gpu_temporary_mem) { - /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. 
*/ - int max_radius = max(radius, 6); - int num_shifts = (2*max_radius + 1) * (2*max_radius + 1); - num_layers = 2*num_shifts + 1; - } - else { - num_layers = 3; - } - /* Allocate two layers per shift as well as one for the weight accumulation. */ - buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride); + /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */ + rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w); + rect = rect_expand(rect, radius); + rect = rect_clip(rect, + make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); + + buffer.use_intensity = write_passes || (tile_info->num_frames > 1); + buffer.passes = buffer.use_intensity ? 15 : 14; + buffer.width = rect.z - rect.x; + buffer.stride = align_up(buffer.width, 4); + buffer.h = rect.w - rect.y; + int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); + buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); + buffer.frame_stride = buffer.pass_stride * buffer.passes; + /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ + int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats); + buffer.mem.alloc_to_device(mem_size, false); + buffer.use_time = (tile_info->num_frames > 1); + + /* CPUs process shifts sequentially while GPUs process them in parallel. */ + int num_layers; + if (buffer.gpu_temporary_mem) { + /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */ + int max_radius = max(radius, 6); + int num_shifts = (2 * max_radius + 1) * (2 * max_radius + 1); + num_layers = 2 * num_shifts + 1; + } + else { + num_layers = 3; + } + /* Allocate two layers per shift as well as one for the weight accumulation. 
*/ + buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride); } void DenoisingTask::prefilter_shadowing() { - device_ptr null_ptr = (device_ptr) 0; - - device_sub_ptr unfiltered_a (buffer.mem, 0, buffer.pass_stride); - device_sub_ptr unfiltered_b (buffer.mem, 1*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr sample_var (buffer.mem, 2*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr sample_var_var (buffer.mem, 3*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr buffer_var (buffer.mem, 5*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr filtered_var (buffer.mem, 6*buffer.pass_stride, buffer.pass_stride); - - /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */ - functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); - - /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */ - nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false); - functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); - - /* Reuse memory, the previous data isn't needed anymore. */ - device_ptr filtered_a = *buffer_var, - filtered_b = *sample_var; - /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */ - nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false); - functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); - functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); - - device_ptr residual_var = *sample_var_var; - /* Estimate the residual variance between the two filtered halves. */ - functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); - - device_ptr final_a = *unfiltered_a, - final_b = *unfiltered_b; - /* Use the residual variance for a second filter pass. 
*/ - nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false); - functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); - functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); - - /* Combine the two double-filtered halves to a final shadow feature. */ - device_sub_ptr shadow_pass(buffer.mem, 4*buffer.pass_stride, buffer.pass_stride); - functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); + device_ptr null_ptr = (device_ptr)0; + + device_sub_ptr unfiltered_a(buffer.mem, 0, buffer.pass_stride); + device_sub_ptr unfiltered_b(buffer.mem, 1 * buffer.pass_stride, buffer.pass_stride); + device_sub_ptr sample_var(buffer.mem, 2 * buffer.pass_stride, buffer.pass_stride); + device_sub_ptr sample_var_var(buffer.mem, 3 * buffer.pass_stride, buffer.pass_stride); + device_sub_ptr buffer_var(buffer.mem, 5 * buffer.pass_stride, buffer.pass_stride); + device_sub_ptr filtered_var(buffer.mem, 6 * buffer.pass_stride, buffer.pass_stride); + + /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */ + functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); + + /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */ + nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false); + functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); + + /* Reuse memory, the previous data isn't needed anymore. */ + device_ptr filtered_a = *buffer_var, filtered_b = *sample_var; + /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. 
*/ + nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false); + functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); + functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); + + device_ptr residual_var = *sample_var_var; + /* Estimate the residual variance between the two filtered halves. */ + functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); + + device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b; + /* Use the residual variance for a second filter pass. */ + nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false); + functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); + functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); + + /* Combine the two double-filtered halves to a final shadow feature. */ + device_sub_ptr shadow_pass(buffer.mem, 4 * buffer.pass_stride, buffer.pass_stride); + functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); } void DenoisingTask::prefilter_features() { - device_sub_ptr unfiltered (buffer.mem, 8*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr variance (buffer.mem, 9*buffer.pass_stride, buffer.pass_stride); - - int mean_from[] = { 0, 1, 2, 12, 6, 7, 8 }; - int variance_from[] = { 3, 4, 5, 13, 9, 10, 11}; - int pass_to[] = { 1, 2, 3, 0, 5, 6, 7}; - for(int pass = 0; pass < 7; pass++) { - device_sub_ptr feature_pass(buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride); - /* Get the unfiltered pass and its variance from the RenderBuffers. */ - functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance, 1.0f / render_buffer.samples); - /* Smooth the pass and store the result in the denoising buffers. 
*/ - nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false); - functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); - } + device_sub_ptr unfiltered(buffer.mem, 8 * buffer.pass_stride, buffer.pass_stride); + device_sub_ptr variance(buffer.mem, 9 * buffer.pass_stride, buffer.pass_stride); + + int mean_from[] = {0, 1, 2, 12, 6, 7, 8}; + int variance_from[] = {3, 4, 5, 13, 9, 10, 11}; + int pass_to[] = {1, 2, 3, 0, 5, 6, 7}; + for (int pass = 0; pass < 7; pass++) { + device_sub_ptr feature_pass( + buffer.mem, pass_to[pass] * buffer.pass_stride, buffer.pass_stride); + /* Get the unfiltered pass and its variance from the RenderBuffers. */ + functions.get_feature(mean_from[pass], + variance_from[pass], + *unfiltered, + *variance, + 1.0f / render_buffer.samples); + /* Smooth the pass and store the result in the denoising buffers. */ + nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false); + functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); + } } void DenoisingTask::prefilter_color() { - int mean_from[] = {20, 21, 22}; - int variance_from[] = {23, 24, 25}; - int mean_to[] = { 8, 9, 10}; - int variance_to[] = {11, 12, 13}; - int num_color_passes = 3; - - device_only_memory<float> temporary_color(device, "denoising temporary color"); - temporary_color.alloc_to_device(3*buffer.pass_stride, false); - - for(int pass = 0; pass < num_color_passes; pass++) { - device_sub_ptr color_pass(temporary_color, pass*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride); - functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass, 1.0f / render_buffer.samples); - } - - device_sub_ptr depth_pass (buffer.mem, 0, buffer.pass_stride); - device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride); - device_sub_ptr output_pass (buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride); - 
functions.detect_outliers(temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); - - if(buffer.use_intensity) { - device_sub_ptr intensity_pass(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride); - nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2*4.0f, true); - functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass); - } + int mean_from[] = {20, 21, 22}; + int variance_from[] = {23, 24, 25}; + int mean_to[] = {8, 9, 10}; + int variance_to[] = {11, 12, 13}; + int num_color_passes = 3; + + device_only_memory<float> temporary_color(device, "denoising temporary color"); + temporary_color.alloc_to_device(3 * buffer.pass_stride, false); + + for (int pass = 0; pass < num_color_passes; pass++) { + device_sub_ptr color_pass(temporary_color, pass * buffer.pass_stride, buffer.pass_stride); + device_sub_ptr color_var_pass( + buffer.mem, variance_to[pass] * buffer.pass_stride, buffer.pass_stride); + functions.get_feature(mean_from[pass], + variance_from[pass], + *color_pass, + *color_var_pass, + 1.0f / render_buffer.samples); + } + + device_sub_ptr depth_pass(buffer.mem, 0, buffer.pass_stride); + device_sub_ptr color_var_pass( + buffer.mem, variance_to[0] * buffer.pass_stride, 3 * buffer.pass_stride); + device_sub_ptr output_pass(buffer.mem, mean_to[0] * buffer.pass_stride, 3 * buffer.pass_stride); + functions.detect_outliers( + temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); + + if (buffer.use_intensity) { + device_sub_ptr intensity_pass(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride); + nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2 * 4.0f, true); + functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass); + } } void DenoisingTask::load_buffer() { - device_ptr null_ptr = (device_ptr) 0; - - int original_offset = render_buffer.offset; - - int num_passes = buffer.use_intensity? 
15 : 14; - for(int i = 0; i < tile_info->num_frames; i++) { - for(int pass = 0; pass < num_passes; pass++) { - device_sub_ptr to_pass(buffer.mem, i*buffer.frame_stride + pass*buffer.pass_stride, buffer.pass_stride); - bool is_variance = (pass >= 11) && (pass <= 13); - functions.get_feature(pass, -1, *to_pass, null_ptr, is_variance? (1.0f / render_buffer.samples) : 1.0f); - } - render_buffer.offset += render_buffer.frame_stride; - } - - render_buffer.offset = original_offset; + device_ptr null_ptr = (device_ptr)0; + + int original_offset = render_buffer.offset; + + int num_passes = buffer.use_intensity ? 15 : 14; + for (int i = 0; i < tile_info->num_frames; i++) { + for (int pass = 0; pass < num_passes; pass++) { + device_sub_ptr to_pass( + buffer.mem, i * buffer.frame_stride + pass * buffer.pass_stride, buffer.pass_stride); + bool is_variance = (pass >= 11) && (pass <= 13); + functions.get_feature( + pass, -1, *to_pass, null_ptr, is_variance ? (1.0f / render_buffer.samples) : 1.0f); + } + render_buffer.offset += render_buffer.frame_stride; + } + + render_buffer.offset = original_offset; } void DenoisingTask::write_buffer() { - reconstruction_state.buffer_params = make_int4(target_buffer.offset, - target_buffer.stride, - target_buffer.pass_stride, - target_buffer.denoising_clean_offset); - int num_passes = buffer.use_intensity? 15 : 14; - for(int pass = 0; pass < num_passes; pass++) { - device_sub_ptr from_pass(buffer.mem, pass*buffer.pass_stride, buffer.pass_stride); - int out_offset = pass + target_buffer.denoising_output_offset; - functions.write_feature(out_offset, *from_pass, target_buffer.ptr); - } + reconstruction_state.buffer_params = make_int4(target_buffer.offset, + target_buffer.stride, + target_buffer.pass_stride, + target_buffer.denoising_clean_offset); + int num_passes = buffer.use_intensity ? 
15 : 14; + for (int pass = 0; pass < num_passes; pass++) { + device_sub_ptr from_pass(buffer.mem, pass * buffer.pass_stride, buffer.pass_stride); + int out_offset = pass + target_buffer.denoising_output_offset; + functions.write_feature(out_offset, *from_pass, target_buffer.ptr); + } } void DenoisingTask::construct_transform() { - storage.w = filter_area.z; - storage.h = filter_area.w; + storage.w = filter_area.z; + storage.h = filter_area.w; - storage.transform.alloc_to_device(storage.w*storage.h*TRANSFORM_SIZE, false); - storage.rank.alloc_to_device(storage.w*storage.h, false); + storage.transform.alloc_to_device(storage.w * storage.h * TRANSFORM_SIZE, false); + storage.rank.alloc_to_device(storage.w * storage.h, false); - functions.construct_transform(); + functions.construct_transform(); } void DenoisingTask::reconstruct() { - storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false); - storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false); - storage.XtWX.zero_to_device(); - storage.XtWY.zero_to_device(); - - reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); - int tile_coordinate_offset = filter_area.y*target_buffer.stride + filter_area.x; - reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset, - target_buffer.stride, - target_buffer.pass_stride, - target_buffer.denoising_clean_offset); - reconstruction_state.source_w = rect.z-rect.x; - reconstruction_state.source_h = rect.w-rect.y; - - device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride); - device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride); - for(int f = 0; f < tile_info->num_frames; f++) { - device_ptr scale_ptr = 0; - device_sub_ptr *scale_sub_ptr = NULL; - if(tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) { - scale_sub_ptr = new device_sub_ptr(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride); - 
scale_ptr = **scale_sub_ptr; - } - - functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f); - delete scale_sub_ptr; - } - functions.solve(target_buffer.ptr); + storage.XtWX.alloc_to_device(storage.w * storage.h * XTWX_SIZE, false); + storage.XtWY.alloc_to_device(storage.w * storage.h * XTWY_SIZE, false); + storage.XtWX.zero_to_device(); + storage.XtWY.zero_to_device(); + + reconstruction_state.filter_window = rect_from_shape( + filter_area.x - rect.x, filter_area.y - rect.y, storage.w, storage.h); + int tile_coordinate_offset = filter_area.y * target_buffer.stride + filter_area.x; + reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset, + target_buffer.stride, + target_buffer.pass_stride, + target_buffer.denoising_clean_offset); + reconstruction_state.source_w = rect.z - rect.x; + reconstruction_state.source_h = rect.w - rect.y; + + device_sub_ptr color_ptr(buffer.mem, 8 * buffer.pass_stride, 3 * buffer.pass_stride); + device_sub_ptr color_var_ptr(buffer.mem, 11 * buffer.pass_stride, 3 * buffer.pass_stride); + for (int f = 0; f < tile_info->num_frames; f++) { + device_ptr scale_ptr = 0; + device_sub_ptr *scale_sub_ptr = NULL; + if (tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) { + scale_sub_ptr = new device_sub_ptr(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride); + scale_ptr = **scale_sub_ptr; + } + + functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f); + delete scale_sub_ptr; + } + functions.solve(target_buffer.ptr); } void DenoisingTask::run_denoising(RenderTile *tile) { - RenderTile rtiles[10]; - rtiles[4] = *tile; - functions.map_neighbor_tiles(rtiles); - set_render_buffer(rtiles); - - setup_denoising_buffer(); - - if(tile_info->from_render) { - prefilter_shadowing(); - prefilter_features(); - prefilter_color(); - } - else { - load_buffer(); - } - - if(do_filter) { - construct_transform(); - reconstruct(); - } - - if(write_passes) { - write_buffer(); - } - - 
functions.unmap_neighbor_tiles(rtiles); + RenderTile rtiles[10]; + rtiles[4] = *tile; + functions.map_neighbor_tiles(rtiles); + set_render_buffer(rtiles); + + setup_denoising_buffer(); + + if (tile_info->from_render) { + prefilter_shadowing(); + prefilter_features(); + prefilter_color(); + } + else { + load_buffer(); + } + + if (do_filter) { + construct_transform(); + reconstruct(); + } + + if (write_passes) { + write_buffer(); + } + + functions.unmap_neighbor_tiles(rtiles); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index 5869aa05390..bd1d0193dbd 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -28,165 +28,169 @@ CCL_NAMESPACE_BEGIN class DenoisingTask { -public: - /* Parameters of the denoising algorithm. */ - int radius; - float nlm_k_2; - float pca_threshold; - - /* Parameters of the RenderBuffers. */ - struct RenderBuffers { - int offset; - int pass_stride; - int frame_stride; - int samples; - } render_buffer; - - /* Pointer and parameters of the target buffer. */ - struct TargetBuffer { - int offset; - int stride; - int pass_stride; - int denoising_clean_offset; - int denoising_output_offset; - device_ptr ptr; - } target_buffer; - - TileInfo *tile_info; - device_vector<int> tile_info_mem; - - ProfilingState *profiler; - - int4 rect; - int4 filter_area; - - bool write_passes; - bool do_filter; - - struct DeviceFunctions { - function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */ - device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ - device_ptr variance_ptr, /* Contains the variance of the guide image. */ - device_ptr out_ptr /* The filtered output is written into this image. 
*/ - )> non_local_means; - function<bool(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame - )> accumulate; - function<bool(device_ptr output_ptr)> solve; - function<bool()> construct_transform; - - function<bool(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect - )> combine_halves; - function<bool(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr - )> divide_shadow; - function<bool(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale - )> get_feature; - function<bool(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr - )> detect_outliers; - function<bool(int out_offset, - device_ptr frop_ptr, - device_ptr buffer_ptr - )> write_feature; - function<void(RenderTile *rtiles)> map_neighbor_tiles; - function<void(RenderTile *rtiles)> unmap_neighbor_tiles; - } functions; - - /* Stores state of the current Reconstruction operation, - * which is accessed by the device in order to perform the operation. */ - struct ReconstructionState { - int4 filter_window; - int4 buffer_params; - - int source_w; - int source_h; - } reconstruction_state; - - /* Stores state of the current NLM operation, - * which is accessed by the device in order to perform the operation. */ - struct NLMState { - int r; /* Search radius of the filter. */ - int f; /* Patch size of the filter. */ - float a; /* Variance compensation factor in the MSE estimation. */ - float k_2; /* Squared value of the k parameter of the filter. 
*/ - bool is_color; - - void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) { r = r_; f = f_; a = a_, k_2 = k_2_; is_color = is_color_; } - } nlm_state; - - struct Storage { - device_only_memory<float> transform; - device_only_memory<int> rank; - device_only_memory<float> XtWX; - device_only_memory<float3> XtWY; - int w; - int h; - - Storage(Device *device) - : transform(device, "denoising transform"), - rank(device, "denoising rank"), - XtWX(device, "denoising XtWX"), - XtWY(device, "denoising XtWY") - {} - } storage; - - DenoisingTask(Device *device, const DeviceTask &task); - ~DenoisingTask(); - - void run_denoising(RenderTile *tile); - - struct DenoiseBuffers { - int pass_stride; - int passes; - int stride; - int h; - int width; - int frame_stride; - device_only_memory<float> mem; - device_only_memory<float> temporary_mem; - bool use_time; - bool use_intensity; - - bool gpu_temporary_mem; - - DenoiseBuffers(Device *device) - : mem(device, "denoising pixel buffer"), - temporary_mem(device, "denoising temporary mem") - {} - } buffer; - -protected: - Device *device; - - void set_render_buffer(RenderTile *rtiles); - void setup_denoising_buffer(); - void prefilter_shadowing(); - void prefilter_features(); - void prefilter_color(); - void construct_transform(); - void reconstruct(); - - void load_buffer(); - void write_buffer(); + public: + /* Parameters of the denoising algorithm. */ + int radius; + float nlm_k_2; + float pca_threshold; + + /* Parameters of the RenderBuffers. */ + struct RenderBuffers { + int offset; + int pass_stride; + int frame_stride; + int samples; + } render_buffer; + + /* Pointer and parameters of the target buffer. 
*/ + struct TargetBuffer { + int offset; + int stride; + int pass_stride; + int denoising_clean_offset; + int denoising_output_offset; + device_ptr ptr; + } target_buffer; + + TileInfo *tile_info; + device_vector<int> tile_info_mem; + + ProfilingState *profiler; + + int4 rect; + int4 filter_area; + + bool write_passes; + bool do_filter; + + struct DeviceFunctions { + function<bool( + device_ptr image_ptr, /* Contains the values that are smoothed. */ + device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ + device_ptr variance_ptr, /* Contains the variance of the guide image. */ + device_ptr out_ptr /* The filtered output is written into this image. */ + )> + non_local_means; + function<bool( + device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)> + accumulate; + function<bool(device_ptr output_ptr)> solve; + function<bool()> construct_transform; + + function<bool(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect)> + combine_halves; + function<bool(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr)> + divide_shadow; + function<bool(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale)> + get_feature; + function<bool(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr)> + detect_outliers; + function<bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature; + function<void(RenderTile *rtiles)> map_neighbor_tiles; + function<void(RenderTile *rtiles)> unmap_neighbor_tiles; + } functions; + + /* Stores state of the current Reconstruction operation, + * which is accessed by the device in order to perform the operation. 
*/ + struct ReconstructionState { + int4 filter_window; + int4 buffer_params; + + int source_w; + int source_h; + } reconstruction_state; + + /* Stores state of the current NLM operation, + * which is accessed by the device in order to perform the operation. */ + struct NLMState { + int r; /* Search radius of the filter. */ + int f; /* Patch size of the filter. */ + float a; /* Variance compensation factor in the MSE estimation. */ + float k_2; /* Squared value of the k parameter of the filter. */ + bool is_color; + + void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) + { + r = r_; + f = f_; + a = a_, k_2 = k_2_; + is_color = is_color_; + } + } nlm_state; + + struct Storage { + device_only_memory<float> transform; + device_only_memory<int> rank; + device_only_memory<float> XtWX; + device_only_memory<float3> XtWY; + int w; + int h; + + Storage(Device *device) + : transform(device, "denoising transform"), + rank(device, "denoising rank"), + XtWX(device, "denoising XtWX"), + XtWY(device, "denoising XtWY") + { + } + } storage; + + DenoisingTask(Device *device, const DeviceTask &task); + ~DenoisingTask(); + + void run_denoising(RenderTile *tile); + + struct DenoiseBuffers { + int pass_stride; + int passes; + int stride; + int h; + int width; + int frame_stride; + device_only_memory<float> mem; + device_only_memory<float> temporary_mem; + bool use_time; + bool use_intensity; + + bool gpu_temporary_mem; + + DenoiseBuffers(Device *device) + : mem(device, "denoising pixel buffer"), temporary_mem(device, "denoising temporary mem") + { + } + } buffer; + + protected: + Device *device; + + void set_render_buffer(RenderTile *rtiles); + void setup_denoising_buffer(); + void prefilter_shadowing(); + void prefilter_features(); + void prefilter_color(); + void construct_transform(); + void reconstruct(); + + void load_buffer(); + void write_buffer(); }; CCL_NAMESPACE_END -#endif /* __DEVICE_DENOISING_H__ */ +#endif /* __DEVICE_DENOISING_H__ */ diff --git 
a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h index 94df1e009eb..c393a3f9cda 100644 --- a/intern/cycles/device/device_intern.h +++ b/intern/cycles/device/device_intern.h @@ -21,19 +21,22 @@ CCL_NAMESPACE_BEGIN class Device; -Device *device_cpu_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background); +Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); bool device_opencl_init(); -Device *device_opencl_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background); -bool device_opencl_compile_kernel(const vector<string>& parameters); +Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); +bool device_opencl_compile_kernel(const vector<string> &parameters); bool device_cuda_init(); -Device *device_cuda_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background); -Device *device_network_create(DeviceInfo& info, Stats &stats, Profiler &profiler, const char *address); -Device *device_multi_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background); - -void device_cpu_info(vector<DeviceInfo>& devices); -void device_opencl_info(vector<DeviceInfo>& devices); -void device_cuda_info(vector<DeviceInfo>& devices); -void device_network_info(vector<DeviceInfo>& devices); +Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); +Device *device_network_create(DeviceInfo &info, + Stats &stats, + Profiler &profiler, + const char *address); +Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); + +void device_cpu_info(vector<DeviceInfo> &devices); +void device_opencl_info(vector<DeviceInfo> &devices); +void device_cuda_info(vector<DeviceInfo> &devices); +void device_network_info(vector<DeviceInfo> &devices); string device_cpu_capabilities(); string device_opencl_capabilities(); @@ -41,4 +44,4 @@ string
device_cuda_capabilities(); CCL_NAMESPACE_END -#endif /* __DEVICE_INTERN_H__ */ +#endif /* __DEVICE_INTERN_H__ */ diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp index a8d29896553..859535307f4 100644 --- a/intern/cycles/device/device_memory.cpp +++ b/intern/cycles/device/device_memory.cpp @@ -22,21 +22,21 @@ CCL_NAMESPACE_BEGIN /* Device Memory */ device_memory::device_memory(Device *device, const char *name, MemoryType type) -: data_type(device_type_traits<uchar>::data_type), - data_elements(device_type_traits<uchar>::num_elements), - data_size(0), - device_size(0), - data_width(0), - data_height(0), - data_depth(0), - type(type), - name(name), - interpolation(INTERPOLATION_NONE), - extension(EXTENSION_REPEAT), - device(device), - device_pointer(0), - host_pointer(0), - shared_pointer(0) + : data_type(device_type_traits<uchar>::data_type), + data_elements(device_type_traits<uchar>::num_elements), + data_size(0), + device_size(0), + data_width(0), + data_height(0), + data_depth(0), + type(type), + name(name), + interpolation(INTERPOLATION_NONE), + extension(EXTENSION_REPEAT), + device(device), + device_pointer(0), + host_pointer(0), + shared_pointer(0) { } @@ -46,95 +46,94 @@ device_memory::~device_memory() void *device_memory::host_alloc(size_t size) { - if(!size) { - return 0; - } + if (!size) { + return 0; + } - void *ptr = util_aligned_malloc(size, MIN_ALIGNMENT_CPU_DATA_TYPES); + void *ptr = util_aligned_malloc(size, MIN_ALIGNMENT_CPU_DATA_TYPES); - if(ptr) { - util_guarded_mem_alloc(size); - } - else { - throw std::bad_alloc(); - } + if (ptr) { + util_guarded_mem_alloc(size); + } + else { + throw std::bad_alloc(); + } - return ptr; + return ptr; } void device_memory::host_free() { - if(host_pointer) { - util_guarded_mem_free(memory_size()); - util_aligned_free((void*)host_pointer); - host_pointer = 0; - } + if (host_pointer) { + util_guarded_mem_free(memory_size()); + util_aligned_free((void *)host_pointer); + 
host_pointer = 0; + } } void device_memory::device_alloc() { - assert(!device_pointer && type != MEM_TEXTURE); - device->mem_alloc(*this); + assert(!device_pointer && type != MEM_TEXTURE); + device->mem_alloc(*this); } void device_memory::device_free() { - if(device_pointer) { - device->mem_free(*this); - } + if (device_pointer) { + device->mem_free(*this); + } } void device_memory::device_copy_to() { - if(host_pointer) { - device->mem_copy_to(*this); - } + if (host_pointer) { + device->mem_copy_to(*this); + } } void device_memory::device_copy_from(int y, int w, int h, int elem) { - assert(type != MEM_TEXTURE && type != MEM_READ_ONLY); - device->mem_copy_from(*this, y, w, h, elem); + assert(type != MEM_TEXTURE && type != MEM_READ_ONLY); + device->mem_copy_from(*this, y, w, h, elem); } void device_memory::device_zero() { - if(data_size) { - device->mem_zero(*this); - } + if (data_size) { + device->mem_zero(*this); + } } void device_memory::swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr) { - original_device = device; - original_device_size = device_size; - original_device_ptr = device_pointer; + original_device = device; + original_device_size = device_size; + original_device_ptr = device_pointer; - device = new_device; - device_size = new_device_size; - device_pointer = new_device_ptr; + device = new_device; + device_size = new_device_size; + device_pointer = new_device_ptr; } void device_memory::restore_device() { - device = original_device; - device_size = original_device_size; - device_pointer = original_device_ptr; + device = original_device; + device_size = original_device_size; + device_pointer = original_device_ptr; } /* Device Sub Ptr */ -device_sub_ptr::device_sub_ptr(device_memory& mem, int offset, int size) -: device(mem.device) +device_sub_ptr::device_sub_ptr(device_memory &mem, int offset, int size) : device(mem.device) { - ptr = device->mem_alloc_sub_ptr(mem, offset, size); + ptr = device->mem_alloc_sub_ptr(mem, 
offset, size); } device_sub_ptr::~device_sub_ptr() { - device->mem_free_sub_ptr(ptr); + device->mem_free_sub_ptr(ptr); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index e43834bdc8d..f50184efba7 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -31,152 +31,155 @@ CCL_NAMESPACE_BEGIN class Device; -enum MemoryType { - MEM_READ_ONLY, - MEM_READ_WRITE, - MEM_DEVICE_ONLY, - MEM_TEXTURE, - MEM_PIXELS -}; +enum MemoryType { MEM_READ_ONLY, MEM_READ_WRITE, MEM_DEVICE_ONLY, MEM_TEXTURE, MEM_PIXELS }; /* Supported Data Types */ enum DataType { - TYPE_UNKNOWN, - TYPE_UCHAR, - TYPE_UINT16, - TYPE_UINT, - TYPE_INT, - TYPE_FLOAT, - TYPE_HALF, - TYPE_UINT64, + TYPE_UNKNOWN, + TYPE_UCHAR, + TYPE_UINT16, + TYPE_UINT, + TYPE_INT, + TYPE_FLOAT, + TYPE_HALF, + TYPE_UINT64, }; static inline size_t datatype_size(DataType datatype) { - switch(datatype) { - case TYPE_UNKNOWN: return 1; - case TYPE_UCHAR: return sizeof(uchar); - case TYPE_FLOAT: return sizeof(float); - case TYPE_UINT: return sizeof(uint); - case TYPE_UINT16: return sizeof(uint16_t); - case TYPE_INT: return sizeof(int); - case TYPE_HALF: return sizeof(half); - case TYPE_UINT64: return sizeof(uint64_t); - default: return 0; - } + switch (datatype) { + case TYPE_UNKNOWN: + return 1; + case TYPE_UCHAR: + return sizeof(uchar); + case TYPE_FLOAT: + return sizeof(float); + case TYPE_UINT: + return sizeof(uint); + case TYPE_UINT16: + return sizeof(uint16_t); + case TYPE_INT: + return sizeof(int); + case TYPE_HALF: + return sizeof(half); + case TYPE_UINT64: + return sizeof(uint64_t); + default: + return 0; + } } /* Traits for data types */ template<typename T> struct device_type_traits { - static const DataType data_type = TYPE_UNKNOWN; - static const int num_elements = sizeof(T); + static const DataType data_type = TYPE_UNKNOWN; + static const int num_elements = sizeof(T); }; template<> struct device_type_traits<uchar> { - 
static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 1; + static const DataType data_type = TYPE_UCHAR; + static const int num_elements = 1; }; template<> struct device_type_traits<uchar2> { - static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 2; + static const DataType data_type = TYPE_UCHAR; + static const int num_elements = 2; }; template<> struct device_type_traits<uchar3> { - static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 3; + static const DataType data_type = TYPE_UCHAR; + static const int num_elements = 3; }; template<> struct device_type_traits<uchar4> { - static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 4; + static const DataType data_type = TYPE_UCHAR; + static const int num_elements = 4; }; template<> struct device_type_traits<uint> { - static const DataType data_type = TYPE_UINT; - static const int num_elements = 1; + static const DataType data_type = TYPE_UINT; + static const int num_elements = 1; }; template<> struct device_type_traits<uint2> { - static const DataType data_type = TYPE_UINT; - static const int num_elements = 2; + static const DataType data_type = TYPE_UINT; + static const int num_elements = 2; }; template<> struct device_type_traits<uint3> { - static const DataType data_type = TYPE_UINT; - static const int num_elements = 3; + static const DataType data_type = TYPE_UINT; + static const int num_elements = 3; }; template<> struct device_type_traits<uint4> { - static const DataType data_type = TYPE_UINT; - static const int num_elements = 4; + static const DataType data_type = TYPE_UINT; + static const int num_elements = 4; }; template<> struct device_type_traits<int> { - static const DataType data_type = TYPE_INT; - static const int num_elements = 1; + static const DataType data_type = TYPE_INT; + static const int num_elements = 1; }; template<> struct device_type_traits<int2> { - static const DataType data_type = 
TYPE_INT; - static const int num_elements = 2; + static const DataType data_type = TYPE_INT; + static const int num_elements = 2; }; template<> struct device_type_traits<int3> { - static const DataType data_type = TYPE_INT; - static const int num_elements = 3; + static const DataType data_type = TYPE_INT; + static const int num_elements = 3; }; template<> struct device_type_traits<int4> { - static const DataType data_type = TYPE_INT; - static const int num_elements = 4; + static const DataType data_type = TYPE_INT; + static const int num_elements = 4; }; template<> struct device_type_traits<float> { - static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 1; + static const DataType data_type = TYPE_FLOAT; + static const int num_elements = 1; }; template<> struct device_type_traits<float2> { - static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 2; + static const DataType data_type = TYPE_FLOAT; + static const int num_elements = 2; }; template<> struct device_type_traits<float3> { - static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 4; + static const DataType data_type = TYPE_FLOAT; + static const int num_elements = 4; }; template<> struct device_type_traits<float4> { - static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 4; + static const DataType data_type = TYPE_FLOAT; + static const int num_elements = 4; }; template<> struct device_type_traits<half> { - static const DataType data_type = TYPE_HALF; - static const int num_elements = 1; + static const DataType data_type = TYPE_HALF; + static const int num_elements = 1; }; template<> struct device_type_traits<ushort4> { - static const DataType data_type = TYPE_UINT16; - static const int num_elements = 4; + static const DataType data_type = TYPE_UINT16; + static const int num_elements = 4; }; template<> struct device_type_traits<uint16_t> { - static const DataType data_type = TYPE_UINT16; - static const int 
num_elements = 1; + static const DataType data_type = TYPE_UINT16; + static const int num_elements = 1; }; template<> struct device_type_traits<half4> { - static const DataType data_type = TYPE_HALF; - static const int num_elements = 4; + static const DataType data_type = TYPE_HALF; + static const int num_elements = 4; }; template<> struct device_type_traits<uint64_t> { - static const DataType data_type = TYPE_UINT64; - static const int num_elements = 1; + static const DataType data_type = TYPE_UINT64; + static const int num_elements = 1; }; /* Device Memory @@ -184,64 +187,67 @@ template<> struct device_type_traits<uint64_t> { * Base class for all device memory. This should not be allocated directly, * instead the appropriate subclass can be used. */ -class device_memory -{ -public: - size_t memory_size() { return data_size*data_elements*datatype_size(data_type); } - size_t memory_elements_size(int elements) { - return elements*data_elements*datatype_size(data_type); - } - - /* Data information. */ - DataType data_type; - int data_elements; - size_t data_size; - size_t device_size; - size_t data_width; - size_t data_height; - size_t data_depth; - MemoryType type; - const char *name; - InterpolationType interpolation; - ExtensionType extension; - - /* Pointers. */ - Device *device; - device_ptr device_pointer; - void *host_pointer; - void *shared_pointer; - - virtual ~device_memory(); - - void swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr); - void restore_device(); - -protected: - friend class CUDADevice; - - /* Only create through subclasses. */ - device_memory(Device *device, const char *name, MemoryType type); - - /* No copying allowed. */ - device_memory(const device_memory&); - device_memory& operator = (const device_memory&); - - /* Host allocation on the device. All host_pointer memory should be - * allocated with these functions, for devices that support using - * the same pointer for host and device. 
*/ - void *host_alloc(size_t size); - void host_free(); - - /* Device memory allocation and copying. */ - void device_alloc(); - void device_free(); - void device_copy_to(); - void device_copy_from(int y, int w, int h, int elem); - void device_zero(); - - device_ptr original_device_ptr; - size_t original_device_size; - Device *original_device; +class device_memory { + public: + size_t memory_size() + { + return data_size * data_elements * datatype_size(data_type); + } + size_t memory_elements_size(int elements) + { + return elements * data_elements * datatype_size(data_type); + } + + /* Data information. */ + DataType data_type; + int data_elements; + size_t data_size; + size_t device_size; + size_t data_width; + size_t data_height; + size_t data_depth; + MemoryType type; + const char *name; + InterpolationType interpolation; + ExtensionType extension; + + /* Pointers. */ + Device *device; + device_ptr device_pointer; + void *host_pointer; + void *shared_pointer; + + virtual ~device_memory(); + + void swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr); + void restore_device(); + + protected: + friend class CUDADevice; + + /* Only create through subclasses. */ + device_memory(Device *device, const char *name, MemoryType type); + + /* No copying allowed. */ + device_memory(const device_memory &); + device_memory &operator=(const device_memory &); + + /* Host allocation on the device. All host_pointer memory should be + * allocated with these functions, for devices that support using + * the same pointer for host and device. */ + void *host_alloc(size_t size); + void host_free(); + + /* Device memory allocation and copying. 
*/ + void device_alloc(); + void device_free(); + void device_copy_to(); + void device_copy_from(int y, int w, int h, int elem); + void device_zero(); + + device_ptr original_device_ptr; + size_t original_device_size; + Device *original_device; }; /* Device Only Memory @@ -249,51 +255,49 @@ protected: * Working memory only needed by the device, with no corresponding allocation * on the host. Only used internally in the device implementations. */ -template<typename T> -class device_only_memory : public device_memory -{ -public: - device_only_memory(Device *device, const char *name) - : device_memory(device, name, MEM_DEVICE_ONLY) - { - data_type = device_type_traits<T>::data_type; - data_elements = max(device_type_traits<T>::num_elements, 1); - } - - virtual ~device_only_memory() - { - free(); - } - - void alloc_to_device(size_t num, bool shrink_to_fit = true) - { - size_t new_size = num; - bool reallocate; - - if(shrink_to_fit) { - reallocate = (data_size != new_size); - } - else { - reallocate = (data_size < new_size); - } - - if(reallocate) { - device_free(); - data_size = new_size; - device_alloc(); - } - } - - void free() - { - device_free(); - data_size = 0; - } - - void zero_to_device() - { - device_zero(); - } +template<typename T> class device_only_memory : public device_memory { + public: + device_only_memory(Device *device, const char *name) + : device_memory(device, name, MEM_DEVICE_ONLY) + { + data_type = device_type_traits<T>::data_type; + data_elements = max(device_type_traits<T>::num_elements, 1); + } + + virtual ~device_only_memory() + { + free(); + } + + void alloc_to_device(size_t num, bool shrink_to_fit = true) + { + size_t new_size = num; + bool reallocate; + + if (shrink_to_fit) { + reallocate = (data_size != new_size); + } + else { + reallocate = (data_size < new_size); + } + + if (reallocate) { + device_free(); + data_size = new_size; + device_alloc(); + } + } + + void free() + { + device_free(); + data_size = 0; + } + + void zero_to_device() 
+ { + device_zero(); + } }; /* Device Vector @@ -307,135 +311,134 @@ public: * automatically attached to kernel globals, using the provided name * matching an entry in kernel_textures.h. */ -template<typename T> class device_vector : public device_memory -{ -public: - device_vector(Device *device, const char *name, MemoryType type) - : device_memory(device, name, type) - { - data_type = device_type_traits<T>::data_type; - data_elements = device_type_traits<T>::num_elements; - - assert(data_elements > 0); - } - - virtual ~device_vector() - { - free(); - } - - /* Host memory allocation. */ - T *alloc(size_t width, size_t height = 0, size_t depth = 0) - { - size_t new_size = size(width, height, depth); - - if(new_size != data_size) { - device_free(); - host_free(); - host_pointer = host_alloc(sizeof(T)*new_size); - assert(device_pointer == 0); - } - - data_size = new_size; - data_width = width; - data_height = height; - data_depth = depth; - - return data(); - } - - /* Host memory resize. Only use this if the original data needs to be - * preserved, it is faster to call alloc() if it can be discarded. */ - T *resize(size_t width, size_t height = 0, size_t depth = 0) - { - size_t new_size = size(width, height, depth); - - if(new_size != data_size) { - void *new_ptr = host_alloc(sizeof(T)*new_size); - - if(new_size && data_size) { - size_t min_size = ((new_size < data_size)? new_size: data_size); - memcpy((T*)new_ptr, (T*)host_pointer, sizeof(T)*min_size); - } - - device_free(); - host_free(); - host_pointer = new_ptr; - assert(device_pointer == 0); - } - - data_size = new_size; - data_width = width; - data_height = height; - data_depth = depth; - - return data(); - } - - /* Take over data from an existing array. 
*/ - void steal_data(array<T>& from) - { - device_free(); - host_free(); - - data_size = from.size(); - data_width = 0; - data_height = 0; - data_depth = 0; - host_pointer = from.steal_pointer(); - assert(device_pointer == 0); - } - - /* Free device and host memory. */ - void free() - { - device_free(); - host_free(); - - data_size = 0; - data_width = 0; - data_height = 0; - data_depth = 0; - host_pointer = 0; - assert(device_pointer == 0); - } - - size_t size() - { - return data_size; - } - - T* data() - { - return (T*)host_pointer; - } - - T& operator[](size_t i) - { - assert(i < data_size); - return data()[i]; - } - - void copy_to_device() - { - device_copy_to(); - } - - void copy_from_device(int y, int w, int h) - { - device_copy_from(y, w, h, sizeof(T)); - } - - void zero_to_device() - { - device_zero(); - } - -protected: - size_t size(size_t width, size_t height, size_t depth) - { - return width * ((height == 0)? 1: height) * ((depth == 0)? 1: depth); - } +template<typename T> class device_vector : public device_memory { + public: + device_vector(Device *device, const char *name, MemoryType type) + : device_memory(device, name, type) + { + data_type = device_type_traits<T>::data_type; + data_elements = device_type_traits<T>::num_elements; + + assert(data_elements > 0); + } + + virtual ~device_vector() + { + free(); + } + + /* Host memory allocation. */ + T *alloc(size_t width, size_t height = 0, size_t depth = 0) + { + size_t new_size = size(width, height, depth); + + if (new_size != data_size) { + device_free(); + host_free(); + host_pointer = host_alloc(sizeof(T) * new_size); + assert(device_pointer == 0); + } + + data_size = new_size; + data_width = width; + data_height = height; + data_depth = depth; + + return data(); + } + + /* Host memory resize. Only use this if the original data needs to be + * preserved, it is faster to call alloc() if it can be discarded. 
*/ + T *resize(size_t width, size_t height = 0, size_t depth = 0) + { + size_t new_size = size(width, height, depth); + + if (new_size != data_size) { + void *new_ptr = host_alloc(sizeof(T) * new_size); + + if (new_size && data_size) { + size_t min_size = ((new_size < data_size) ? new_size : data_size); + memcpy((T *)new_ptr, (T *)host_pointer, sizeof(T) * min_size); + } + + device_free(); + host_free(); + host_pointer = new_ptr; + assert(device_pointer == 0); + } + + data_size = new_size; + data_width = width; + data_height = height; + data_depth = depth; + + return data(); + } + + /* Take over data from an existing array. */ + void steal_data(array<T> &from) + { + device_free(); + host_free(); + + data_size = from.size(); + data_width = 0; + data_height = 0; + data_depth = 0; + host_pointer = from.steal_pointer(); + assert(device_pointer == 0); + } + + /* Free device and host memory. */ + void free() + { + device_free(); + host_free(); + + data_size = 0; + data_width = 0; + data_height = 0; + data_depth = 0; + host_pointer = 0; + assert(device_pointer == 0); + } + + size_t size() + { + return data_size; + } + + T *data() + { + return (T *)host_pointer; + } + + T &operator[](size_t i) + { + assert(i < data_size); + return data()[i]; + } + + void copy_to_device() + { + device_copy_to(); + } + + void copy_from_device(int y, int w, int h) + { + device_copy_from(y, w, h, sizeof(T)); + } + + void zero_to_device() + { + device_zero(); + } + + protected: + size_t size(size_t width, size_t height, size_t depth) + { + return width * ((height == 0) ? 1 : height) * ((depth == 0) ? 1 : depth); + } }; /* Pixel Memory @@ -443,28 +446,26 @@ protected: * Device memory to efficiently draw as pixels to the screen in interactive * rendering. Only copying pixels from the device is supported, not copying to. 
*/ -template<typename T> class device_pixels : public device_vector<T> -{ -public: - device_pixels(Device *device, const char *name) - : device_vector<T>(device, name, MEM_PIXELS) - { - } - - void alloc_to_device(size_t width, size_t height, size_t depth = 0) - { - device_vector<T>::alloc(width, height, depth); - - if(!device_memory::device_pointer) { - device_memory::device_alloc(); - } - } - - T *copy_from_device(int y, int w, int h) - { - device_memory::device_copy_from(y, w, h, sizeof(T)); - return device_vector<T>::data(); - } +template<typename T> class device_pixels : public device_vector<T> { + public: + device_pixels(Device *device, const char *name) : device_vector<T>(device, name, MEM_PIXELS) + { + } + + void alloc_to_device(size_t width, size_t height, size_t depth = 0) + { + device_vector<T>::alloc(width, height, depth); + + if (!device_memory::device_pointer) { + device_memory::device_alloc(); + } + } + + T *copy_from_device(int y, int w, int h) + { + device_memory::device_copy_from(y, w, h, sizeof(T)); + return device_vector<T>::data(); + } }; /* Device Sub Memory @@ -476,25 +477,24 @@ public: * Note: some devices require offset and size of the sub_ptr to be properly * aligned to device->mem_address_alingment(). */ -class device_sub_ptr -{ -public: - device_sub_ptr(device_memory& mem, int offset, int size); - ~device_sub_ptr(); +class device_sub_ptr { + public: + device_sub_ptr(device_memory &mem, int offset, int size); + ~device_sub_ptr(); - device_ptr operator*() const - { - return ptr; - } + device_ptr operator*() const + { + return ptr; + } -protected: - /* No copying. */ - device_sub_ptr& operator = (const device_sub_ptr&); + protected: + /* No copying. 
*/ + device_sub_ptr &operator=(const device_sub_ptr &); - Device *device; - device_ptr ptr; + Device *device; + device_ptr ptr; }; CCL_NAMESPACE_END -#endif /* __DEVICE_MEMORY_H__ */ +#endif /* __DEVICE_MEMORY_H__ */ diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index bdb7c87fa57..4a40e106115 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -31,391 +31,406 @@ CCL_NAMESPACE_BEGIN -class MultiDevice : public Device -{ -public: - struct SubDevice { - explicit SubDevice(Device *device_) - : device(device_) {} - - Device *device; - map<device_ptr, device_ptr> ptr_map; - }; - - list<SubDevice> devices; - device_ptr unique_key; - - MultiDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_) - : Device(info, stats, profiler, background_), unique_key(1) - { - foreach(DeviceInfo& subinfo, info.multi_devices) { - Device *device = Device::create(subinfo, sub_stats_, profiler, background); - - /* Always add CPU devices at the back since GPU devices can change - * host memory pointers, which CPU uses as device pointer. */ - if(subinfo.type == DEVICE_CPU) { - devices.push_back(SubDevice(device)); - } - else { - devices.push_front(SubDevice(device)); - } - } +class MultiDevice : public Device { + public: + struct SubDevice { + explicit SubDevice(Device *device_) : device(device_) + { + } + + Device *device; + map<device_ptr, device_ptr> ptr_map; + }; + + list<SubDevice> devices; + device_ptr unique_key; + + MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) + : Device(info, stats, profiler, background_), unique_key(1) + { + foreach (DeviceInfo &subinfo, info.multi_devices) { + Device *device = Device::create(subinfo, sub_stats_, profiler, background); + + /* Always add CPU devices at the back since GPU devices can change + * host memory pointers, which CPU uses as device pointer. 
*/ + if (subinfo.type == DEVICE_CPU) { + devices.push_back(SubDevice(device)); + } + else { + devices.push_front(SubDevice(device)); + } + } #ifdef WITH_NETWORK - /* try to add network devices */ - ServerDiscovery discovery(true); - time_sleep(1.0); + /* try to add network devices */ + ServerDiscovery discovery(true); + time_sleep(1.0); - vector<string> servers = discovery.get_server_list(); + vector<string> servers = discovery.get_server_list(); - foreach(string& server, servers) { - Device *device = device_network_create(info, stats, profiler, server.c_str()); - if(device) - devices.push_back(SubDevice(device)); - } + foreach (string &server, servers) { + Device *device = device_network_create(info, stats, profiler, server.c_str()); + if (device) + devices.push_back(SubDevice(device)); + } #endif - } - - ~MultiDevice() - { - foreach(SubDevice& sub, devices) - delete sub.device; - } - - const string& error_message() - { - foreach(SubDevice& sub, devices) { - if(sub.device->error_message() != "") { - if(error_msg == "") - error_msg = sub.device->error_message(); - break; - } - } - - return error_msg; - } - - virtual bool show_samples() const - { - if(devices.size() > 1) { - return false; - } - return devices.front().device->show_samples(); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const { - BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL; - foreach(const SubDevice& sub_device, devices) { - bvh_layout_mask &= sub_device.device->get_bvh_layout_mask(); - } - return bvh_layout_mask; - } - - bool load_kernels(const DeviceRequestedFeatures& requested_features) - { - foreach(SubDevice& sub, devices) - if(!sub.device->load_kernels(requested_features)) - return false; - - return true; - } - - bool wait_for_availability(const DeviceRequestedFeatures& requested_features) - { - foreach(SubDevice& sub, devices) - if(!sub.device->wait_for_availability(requested_features)) - return false; - - return true; - } - - DeviceKernelStatus get_active_kernel_switch_state() - { - 
DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL; - - foreach(SubDevice& sub, devices) { - DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state(); - switch (subresult) { - case DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL: - result = subresult; - break; - - case DEVICE_KERNEL_FEATURE_KERNEL_INVALID: - case DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE: - return subresult; - - case DEVICE_KERNEL_USING_FEATURE_KERNEL: - case DEVICE_KERNEL_UNKNOWN: - break; - } - } - return result; - } - - void mem_alloc(device_memory& mem) - { - device_ptr key = unique_key++; - - foreach(SubDevice& sub, devices) { - mem.device = sub.device; - mem.device_pointer = 0; - mem.device_size = 0; - - sub.device->mem_alloc(mem); - sub.ptr_map[key] = mem.device_pointer; - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size); - } - - void mem_copy_to(device_memory& mem) - { - device_ptr existing_key = mem.device_pointer; - device_ptr key = (existing_key)? existing_key: unique_key++; - size_t existing_size = mem.device_size; - - foreach(SubDevice& sub, devices) { - mem.device = sub.device; - mem.device_pointer = (existing_key)? sub.ptr_map[existing_key]: 0; - mem.device_size = existing_size; - - sub.device->mem_copy_to(mem); - sub.ptr_map[key] = mem.device_pointer; - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size - existing_size); - } - - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) - { - device_ptr key = mem.device_pointer; - int i = 0, sub_h = h/devices.size(); - - foreach(SubDevice& sub, devices) { - int sy = y + i*sub_h; - int sh = (i == (int)devices.size() - 1)? 
h - sub_h*i: sub_h; - - mem.device = sub.device; - mem.device_pointer = sub.ptr_map[key]; - - sub.device->mem_copy_from(mem, sy, w, sh, elem); - i++; - } - - mem.device = this; - mem.device_pointer = key; - } - - void mem_zero(device_memory& mem) - { - device_ptr existing_key = mem.device_pointer; - device_ptr key = (existing_key)? existing_key: unique_key++; - size_t existing_size = mem.device_size; - - foreach(SubDevice& sub, devices) { - mem.device = sub.device; - mem.device_pointer = (existing_key)? sub.ptr_map[existing_key]: 0; - mem.device_size = existing_size; - - sub.device->mem_zero(mem); - sub.ptr_map[key] = mem.device_pointer; - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size - existing_size); - } - - void mem_free(device_memory& mem) - { - device_ptr key = mem.device_pointer; - size_t existing_size = mem.device_size; - - foreach(SubDevice& sub, devices) { - mem.device = sub.device; - mem.device_pointer = sub.ptr_map[key]; - mem.device_size = existing_size; - - sub.device->mem_free(mem); - sub.ptr_map.erase(sub.ptr_map.find(key)); - } - - mem.device = this; - mem.device_pointer = 0; - mem.device_size = 0; - stats.mem_free(existing_size); - } - - void const_copy_to(const char *name, void *host, size_t size) - { - foreach(SubDevice& sub, devices) - sub.device->const_copy_to(name, host, size); - } - - void draw_pixels( - device_memory& rgba, int y, - int w, int h, int width, int height, - int dx, int dy, int dw, int dh, - bool transparent, const DeviceDrawParams &draw_params) - { - device_ptr key = rgba.device_pointer; - int i = 0, sub_h = h/devices.size(); - int sub_height = height/devices.size(); - - foreach(SubDevice& sub, devices) { - int sy = y + i*sub_h; - int sh = (i == (int)devices.size() - 1)? h - sub_h*i: sub_h; - int sheight = (i == (int)devices.size() - 1)? 
height - sub_height*i: sub_height; - int sdy = dy + i*sub_height; - /* adjust math for w/width */ - - rgba.device_pointer = sub.ptr_map[key]; - sub.device->draw_pixels(rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params); - i++; - } - - rgba.device_pointer = key; - } - - void map_tile(Device *sub_device, RenderTile& tile) - { - foreach(SubDevice& sub, devices) { - if(sub.device == sub_device) { - if(tile.buffer) tile.buffer = sub.ptr_map[tile.buffer]; - } - } - } - - int device_number(Device *sub_device) - { - int i = 0; - - foreach(SubDevice& sub, devices) { - if(sub.device == sub_device) - return i; - i++; - } - - return -1; - } - - void map_neighbor_tiles(Device *sub_device, RenderTile *tiles) - { - for(int i = 0; i < 9; i++) { - if(!tiles[i].buffers) { - continue; - } - - /* If the tile was rendered on another device, copy its memory to - * to the current device now, for the duration of the denoising task. - * Note that this temporarily modifies the RenderBuffers and calls - * the device, so this function is not thread safe. */ - device_vector<float> &mem = tiles[i].buffers->buffer; - if(mem.device != sub_device) { - /* Only copy from device to host once. This is faster, but - * also required for the case where a CPU thread is denoising - * a tile rendered on the GPU. In that case we have to avoid - * overwriting the buffer being denoised by the CPU thread. */ - if(!tiles[i].buffers->map_neighbor_copied) { - tiles[i].buffers->map_neighbor_copied = true; - mem.copy_from_device(0, mem.data_size, 1); - } - - mem.swap_device(sub_device, 0, 0); - - mem.copy_to_device(); - tiles[i].buffer = mem.device_pointer; - tiles[i].device_size = mem.device_size; - - mem.restore_device(); - } - } - } - - void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles) - { - /* Copy denoised result back to the host. 
*/ - device_vector<float> &mem = tiles[9].buffers->buffer; - mem.swap_device(sub_device, tiles[9].device_size, tiles[9].buffer); - mem.copy_from_device(0, mem.data_size, 1); - mem.restore_device(); - /* Copy denoised result to the original device. */ - mem.copy_to_device(); - - for(int i = 0; i < 9; i++) { - if(!tiles[i].buffers) { - continue; - } - - device_vector<float> &mem = tiles[i].buffers->buffer; - if(mem.device != sub_device) { - mem.swap_device(sub_device, tiles[i].device_size, tiles[i].buffer); - sub_device->mem_free(mem); - mem.restore_device(); - } - } - } - - int get_split_task_count(DeviceTask& task) - { - int total_tasks = 0; - list<DeviceTask> tasks; - task.split(tasks, devices.size()); - foreach(SubDevice& sub, devices) { - if(!tasks.empty()) { - DeviceTask subtask = tasks.front(); - tasks.pop_front(); - - total_tasks += sub.device->get_split_task_count(subtask); - } - } - return total_tasks; - } - - void task_add(DeviceTask& task) - { - list<DeviceTask> tasks; - task.split(tasks, devices.size()); - - foreach(SubDevice& sub, devices) { - if(!tasks.empty()) { - DeviceTask subtask = tasks.front(); - tasks.pop_front(); - - if(task.buffer) subtask.buffer = sub.ptr_map[task.buffer]; - if(task.rgba_byte) subtask.rgba_byte = sub.ptr_map[task.rgba_byte]; - if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half]; - if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input]; - if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output]; - - sub.device->task_add(subtask); - } - } - } - - void task_wait() - { - foreach(SubDevice& sub, devices) - sub.device->task_wait(); - } - - void task_cancel() - { - foreach(SubDevice& sub, devices) - sub.device->task_cancel(); - } - -protected: - Stats sub_stats_; + } + + ~MultiDevice() + { + foreach (SubDevice &sub, devices) + delete sub.device; + } + + const string &error_message() + { + foreach (SubDevice &sub, devices) { + if (sub.device->error_message() != "") { + if 
(error_msg == "") + error_msg = sub.device->error_message(); + break; + } + } + + return error_msg; + } + + virtual bool show_samples() const + { + if (devices.size() > 1) { + return false; + } + return devices.front().device->show_samples(); + } + + virtual BVHLayoutMask get_bvh_layout_mask() const + { + BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL; + foreach (const SubDevice &sub_device, devices) { + bvh_layout_mask &= sub_device.device->get_bvh_layout_mask(); + } + return bvh_layout_mask; + } + + bool load_kernels(const DeviceRequestedFeatures &requested_features) + { + foreach (SubDevice &sub, devices) + if (!sub.device->load_kernels(requested_features)) + return false; + + return true; + } + + bool wait_for_availability(const DeviceRequestedFeatures &requested_features) + { + foreach (SubDevice &sub, devices) + if (!sub.device->wait_for_availability(requested_features)) + return false; + + return true; + } + + DeviceKernelStatus get_active_kernel_switch_state() + { + DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL; + + foreach (SubDevice &sub, devices) { + DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state(); + switch (subresult) { + case DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL: + result = subresult; + break; + + case DEVICE_KERNEL_FEATURE_KERNEL_INVALID: + case DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE: + return subresult; + + case DEVICE_KERNEL_USING_FEATURE_KERNEL: + case DEVICE_KERNEL_UNKNOWN: + break; + } + } + return result; + } + + void mem_alloc(device_memory &mem) + { + device_ptr key = unique_key++; + + foreach (SubDevice &sub, devices) { + mem.device = sub.device; + mem.device_pointer = 0; + mem.device_size = 0; + + sub.device->mem_alloc(mem); + sub.ptr_map[key] = mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size); + } + + void mem_copy_to(device_memory &mem) + { + device_ptr existing_key = mem.device_pointer; + device_ptr key = (existing_key) ? 
existing_key : unique_key++; + size_t existing_size = mem.device_size; + + foreach (SubDevice &sub, devices) { + mem.device = sub.device; + mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + sub.device->mem_copy_to(mem); + sub.ptr_map[key] = mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size - existing_size); + } + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) + { + device_ptr key = mem.device_pointer; + int i = 0, sub_h = h / devices.size(); + + foreach (SubDevice &sub, devices) { + int sy = y + i * sub_h; + int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; + + mem.device = sub.device; + mem.device_pointer = sub.ptr_map[key]; + + sub.device->mem_copy_from(mem, sy, w, sh, elem); + i++; + } + + mem.device = this; + mem.device_pointer = key; + } + + void mem_zero(device_memory &mem) + { + device_ptr existing_key = mem.device_pointer; + device_ptr key = (existing_key) ? existing_key : unique_key++; + size_t existing_size = mem.device_size; + + foreach (SubDevice &sub, devices) { + mem.device = sub.device; + mem.device_pointer = (existing_key) ? 
sub.ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + sub.device->mem_zero(mem); + sub.ptr_map[key] = mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size - existing_size); + } + + void mem_free(device_memory &mem) + { + device_ptr key = mem.device_pointer; + size_t existing_size = mem.device_size; + + foreach (SubDevice &sub, devices) { + mem.device = sub.device; + mem.device_pointer = sub.ptr_map[key]; + mem.device_size = existing_size; + + sub.device->mem_free(mem); + sub.ptr_map.erase(sub.ptr_map.find(key)); + } + + mem.device = this; + mem.device_pointer = 0; + mem.device_size = 0; + stats.mem_free(existing_size); + } + + void const_copy_to(const char *name, void *host, size_t size) + { + foreach (SubDevice &sub, devices) + sub.device->const_copy_to(name, host, size); + } + + void draw_pixels(device_memory &rgba, + int y, + int w, + int h, + int width, + int height, + int dx, + int dy, + int dw, + int dh, + bool transparent, + const DeviceDrawParams &draw_params) + { + device_ptr key = rgba.device_pointer; + int i = 0, sub_h = h / devices.size(); + int sub_height = height / devices.size(); + + foreach (SubDevice &sub, devices) { + int sy = y + i * sub_h; + int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; + int sheight = (i == (int)devices.size() - 1) ? 
height - sub_height * i : sub_height; + int sdy = dy + i * sub_height; + /* adjust math for w/width */ + + rgba.device_pointer = sub.ptr_map[key]; + sub.device->draw_pixels( + rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params); + i++; + } + + rgba.device_pointer = key; + } + + void map_tile(Device *sub_device, RenderTile &tile) + { + foreach (SubDevice &sub, devices) { + if (sub.device == sub_device) { + if (tile.buffer) + tile.buffer = sub.ptr_map[tile.buffer]; + } + } + } + + int device_number(Device *sub_device) + { + int i = 0; + + foreach (SubDevice &sub, devices) { + if (sub.device == sub_device) + return i; + i++; + } + + return -1; + } + + void map_neighbor_tiles(Device *sub_device, RenderTile *tiles) + { + for (int i = 0; i < 9; i++) { + if (!tiles[i].buffers) { + continue; + } + + /* If the tile was rendered on another device, copy its memory to + * to the current device now, for the duration of the denoising task. + * Note that this temporarily modifies the RenderBuffers and calls + * the device, so this function is not thread safe. */ + device_vector<float> &mem = tiles[i].buffers->buffer; + if (mem.device != sub_device) { + /* Only copy from device to host once. This is faster, but + * also required for the case where a CPU thread is denoising + * a tile rendered on the GPU. In that case we have to avoid + * overwriting the buffer being denoised by the CPU thread. */ + if (!tiles[i].buffers->map_neighbor_copied) { + tiles[i].buffers->map_neighbor_copied = true; + mem.copy_from_device(0, mem.data_size, 1); + } + + mem.swap_device(sub_device, 0, 0); + + mem.copy_to_device(); + tiles[i].buffer = mem.device_pointer; + tiles[i].device_size = mem.device_size; + + mem.restore_device(); + } + } + } + + void unmap_neighbor_tiles(Device *sub_device, RenderTile *tiles) + { + /* Copy denoised result back to the host. 
*/ + device_vector<float> &mem = tiles[9].buffers->buffer; + mem.swap_device(sub_device, tiles[9].device_size, tiles[9].buffer); + mem.copy_from_device(0, mem.data_size, 1); + mem.restore_device(); + /* Copy denoised result to the original device. */ + mem.copy_to_device(); + + for (int i = 0; i < 9; i++) { + if (!tiles[i].buffers) { + continue; + } + + device_vector<float> &mem = tiles[i].buffers->buffer; + if (mem.device != sub_device) { + mem.swap_device(sub_device, tiles[i].device_size, tiles[i].buffer); + sub_device->mem_free(mem); + mem.restore_device(); + } + } + } + + int get_split_task_count(DeviceTask &task) + { + int total_tasks = 0; + list<DeviceTask> tasks; + task.split(tasks, devices.size()); + foreach (SubDevice &sub, devices) { + if (!tasks.empty()) { + DeviceTask subtask = tasks.front(); + tasks.pop_front(); + + total_tasks += sub.device->get_split_task_count(subtask); + } + } + return total_tasks; + } + + void task_add(DeviceTask &task) + { + list<DeviceTask> tasks; + task.split(tasks, devices.size()); + + foreach (SubDevice &sub, devices) { + if (!tasks.empty()) { + DeviceTask subtask = tasks.front(); + tasks.pop_front(); + + if (task.buffer) + subtask.buffer = sub.ptr_map[task.buffer]; + if (task.rgba_byte) + subtask.rgba_byte = sub.ptr_map[task.rgba_byte]; + if (task.rgba_half) + subtask.rgba_half = sub.ptr_map[task.rgba_half]; + if (task.shader_input) + subtask.shader_input = sub.ptr_map[task.shader_input]; + if (task.shader_output) + subtask.shader_output = sub.ptr_map[task.shader_output]; + + sub.device->task_add(subtask); + } + } + } + + void task_wait() + { + foreach (SubDevice &sub, devices) + sub.device->task_wait(); + } + + void task_cancel() + { + foreach (SubDevice &sub, devices) + sub.device->task_cancel(); + } + + protected: + Stats sub_stats_; }; -Device *device_multi_create(DeviceInfo& info, Stats &stats, Profiler& profiler, bool background) +Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool 
background) { - return new MultiDevice(info, stats, profiler, background); + return new MultiDevice(info, stats, profiler, background); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index 6736480e95a..80334ad8f22 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -33,767 +33,776 @@ typedef map<device_ptr, DataVector> DataMap; typedef vector<RenderTile> TileList; /* search a list of tiles and find the one that matches the passed render tile */ -static TileList::iterator tile_list_find(TileList& tile_list, RenderTile& tile) +static TileList::iterator tile_list_find(TileList &tile_list, RenderTile &tile) { - for(TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it) - if(tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample) - return it; - return tile_list.end(); + for (TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it) + if (tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample) + return it; + return tile_list.end(); } -class NetworkDevice : public Device -{ -public: - boost::asio::io_service io_service; - tcp::socket socket; - device_ptr mem_counter; - DeviceTask the_task; /* todo: handle multiple tasks */ - - thread_mutex rpc_lock; - - virtual bool show_samples() const - { - return false; - } - - NetworkDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, const char *address) - : Device(info, stats, profiler, true), socket(io_service) - { - error_func = NetworkError(); - stringstream portstr; - portstr << SERVER_PORT; - - tcp::resolver resolver(io_service); - tcp::resolver::query query(address, portstr.str()); - tcp::resolver::iterator endpoint_iterator = resolver.resolve(query); - tcp::resolver::iterator end; +class NetworkDevice : public Device { + public: + boost::asio::io_service io_service; + tcp::socket socket; + device_ptr mem_counter; + DeviceTask 
the_task; /* todo: handle multiple tasks */ + + thread_mutex rpc_lock; + + virtual bool show_samples() const + { + return false; + } + + NetworkDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, const char *address) + : Device(info, stats, profiler, true), socket(io_service) + { + error_func = NetworkError(); + stringstream portstr; + portstr << SERVER_PORT; + + tcp::resolver resolver(io_service); + tcp::resolver::query query(address, portstr.str()); + tcp::resolver::iterator endpoint_iterator = resolver.resolve(query); + tcp::resolver::iterator end; + + boost::system::error_code error = boost::asio::error::host_not_found; + while (error && endpoint_iterator != end) { + socket.close(); + socket.connect(*endpoint_iterator++, error); + } - boost::system::error_code error = boost::asio::error::host_not_found; - while(error && endpoint_iterator != end) - { - socket.close(); - socket.connect(*endpoint_iterator++, error); - } - - if(error) - error_func.network_error(error.message()); + if (error) + error_func.network_error(error.message()); - mem_counter = 0; - } + mem_counter = 0; + } - ~NetworkDevice() - { - RPCSend snd(socket, &error_func, "stop"); - snd.write(); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const { - return BVH_LAYOUT_BVH2; - } - - void mem_alloc(device_memory& mem) - { - if(mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. 
(" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - thread_scoped_lock lock(rpc_lock); + ~NetworkDevice() + { + RPCSend snd(socket, &error_func, "stop"); + snd.write(); + } - mem.device_pointer = ++mem_counter; + virtual BVHLayoutMask get_bvh_layout_mask() const + { + return BVH_LAYOUT_BVH2; + } - RPCSend snd(socket, &error_func, "mem_alloc"); - snd.add(mem); - snd.write(); - } + void mem_alloc(device_memory &mem) + { + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } - void mem_copy_to(device_memory& mem) - { - thread_scoped_lock lock(rpc_lock); + thread_scoped_lock lock(rpc_lock); - RPCSend snd(socket, &error_func, "mem_copy_to"); + mem.device_pointer = ++mem_counter; - snd.add(mem); - snd.write(); - snd.write_buffer(mem.host_pointer, mem.memory_size()); - } + RPCSend snd(socket, &error_func, "mem_alloc"); + snd.add(mem); + snd.write(); + } - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) - { - thread_scoped_lock lock(rpc_lock); + void mem_copy_to(device_memory &mem) + { + thread_scoped_lock lock(rpc_lock); - size_t data_size = mem.memory_size(); + RPCSend snd(socket, &error_func, "mem_copy_to"); - RPCSend snd(socket, &error_func, "mem_copy_from"); + snd.add(mem); + snd.write(); + snd.write_buffer(mem.host_pointer, mem.memory_size()); + } - snd.add(mem); - snd.add(y); - snd.add(w); - snd.add(h); - snd.add(elem); - snd.write(); + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) + { + thread_scoped_lock lock(rpc_lock); - RPCReceive rcv(socket, &error_func); - rcv.read_buffer(mem.host_pointer, data_size); - } + size_t data_size = mem.memory_size(); - void mem_zero(device_memory& mem) - { - thread_scoped_lock lock(rpc_lock); + RPCSend snd(socket, &error_func, "mem_copy_from"); - RPCSend snd(socket, &error_func, "mem_zero"); + snd.add(mem); + snd.add(y); + 
snd.add(w); + snd.add(h); + snd.add(elem); + snd.write(); - snd.add(mem); - snd.write(); - } + RPCReceive rcv(socket, &error_func); + rcv.read_buffer(mem.host_pointer, data_size); + } - void mem_free(device_memory& mem) - { - if(mem.device_pointer) { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "mem_free"); - - snd.add(mem); - snd.write(); - - mem.device_pointer = 0; - } - } + void mem_zero(device_memory &mem) + { + thread_scoped_lock lock(rpc_lock); - void const_copy_to(const char *name, void *host, size_t size) - { - thread_scoped_lock lock(rpc_lock); + RPCSend snd(socket, &error_func, "mem_zero"); - RPCSend snd(socket, &error_func, "const_copy_to"); + snd.add(mem); + snd.write(); + } - string name_string(name); - - snd.add(name_string); - snd.add(size); - snd.write(); - snd.write_buffer(host, size); - } - - bool load_kernels(const DeviceRequestedFeatures& requested_features) - { - if(error_func.have_error()) - return false; - - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "load_kernels"); - snd.add(requested_features.experimental); - snd.add(requested_features.max_closure); - snd.add(requested_features.max_nodes_group); - snd.add(requested_features.nodes_features); - snd.write(); - - bool result; - RPCReceive rcv(socket, &error_func); - rcv.read(result); - - return result; - } - - void task_add(DeviceTask& task) - { - thread_scoped_lock lock(rpc_lock); - - the_task = task; - - RPCSend snd(socket, &error_func, "task_add"); - snd.add(task); - snd.write(); - } - - void task_wait() - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "task_wait"); - snd.write(); - - lock.unlock(); - - TileList the_tiles; - - /* todo: run this threaded for connecting to multiple clients */ - for(;;) { - if(error_func.have_error()) - break; - - RenderTile tile; - - lock.lock(); - RPCReceive rcv(socket, &error_func); - - if(rcv.name == "acquire_tile") { - lock.unlock(); - - /* todo: watch out for recursive 
calls! */ - if(the_task.acquire_tile(this, tile)) { /* write return as bool */ - the_tiles.push_back(tile); - - lock.lock(); - RPCSend snd(socket, &error_func, "acquire_tile"); - snd.add(tile); - snd.write(); - lock.unlock(); - } - else { - lock.lock(); - RPCSend snd(socket, &error_func, "acquire_tile_none"); - snd.write(); - lock.unlock(); - } - } - else if(rcv.name == "release_tile") { - rcv.read(tile); - lock.unlock(); - - TileList::iterator it = tile_list_find(the_tiles, tile); - if(it != the_tiles.end()) { - tile.buffers = it->buffers; - the_tiles.erase(it); - } - - assert(tile.buffers != NULL); - - the_task.release_tile(tile); - - lock.lock(); - RPCSend snd(socket, &error_func, "release_tile"); - snd.write(); - lock.unlock(); - } - else if(rcv.name == "task_wait_done") { - lock.unlock(); - break; - } - else - lock.unlock(); - } - } - - void task_cancel() - { - thread_scoped_lock lock(rpc_lock); - RPCSend snd(socket, &error_func, "task_cancel"); - snd.write(); - } - - int get_split_task_count(DeviceTask&) - { - return 1; - } - -private: - NetworkError error_func; + void mem_free(device_memory &mem) + { + if (mem.device_pointer) { + thread_scoped_lock lock(rpc_lock); + + RPCSend snd(socket, &error_func, "mem_free"); + + snd.add(mem); + snd.write(); + + mem.device_pointer = 0; + } + } + + void const_copy_to(const char *name, void *host, size_t size) + { + thread_scoped_lock lock(rpc_lock); + + RPCSend snd(socket, &error_func, "const_copy_to"); + + string name_string(name); + + snd.add(name_string); + snd.add(size); + snd.write(); + snd.write_buffer(host, size); + } + + bool load_kernels(const DeviceRequestedFeatures &requested_features) + { + if (error_func.have_error()) + return false; + + thread_scoped_lock lock(rpc_lock); + + RPCSend snd(socket, &error_func, "load_kernels"); + snd.add(requested_features.experimental); + snd.add(requested_features.max_closure); + snd.add(requested_features.max_nodes_group); + snd.add(requested_features.nodes_features); + 
snd.write(); + + bool result; + RPCReceive rcv(socket, &error_func); + rcv.read(result); + + return result; + } + + void task_add(DeviceTask &task) + { + thread_scoped_lock lock(rpc_lock); + + the_task = task; + + RPCSend snd(socket, &error_func, "task_add"); + snd.add(task); + snd.write(); + } + + void task_wait() + { + thread_scoped_lock lock(rpc_lock); + + RPCSend snd(socket, &error_func, "task_wait"); + snd.write(); + + lock.unlock(); + + TileList the_tiles; + + /* todo: run this threaded for connecting to multiple clients */ + for (;;) { + if (error_func.have_error()) + break; + + RenderTile tile; + + lock.lock(); + RPCReceive rcv(socket, &error_func); + + if (rcv.name == "acquire_tile") { + lock.unlock(); + + /* todo: watch out for recursive calls! */ + if (the_task.acquire_tile(this, tile)) { /* write return as bool */ + the_tiles.push_back(tile); + + lock.lock(); + RPCSend snd(socket, &error_func, "acquire_tile"); + snd.add(tile); + snd.write(); + lock.unlock(); + } + else { + lock.lock(); + RPCSend snd(socket, &error_func, "acquire_tile_none"); + snd.write(); + lock.unlock(); + } + } + else if (rcv.name == "release_tile") { + rcv.read(tile); + lock.unlock(); + + TileList::iterator it = tile_list_find(the_tiles, tile); + if (it != the_tiles.end()) { + tile.buffers = it->buffers; + the_tiles.erase(it); + } + + assert(tile.buffers != NULL); + + the_task.release_tile(tile); + + lock.lock(); + RPCSend snd(socket, &error_func, "release_tile"); + snd.write(); + lock.unlock(); + } + else if (rcv.name == "task_wait_done") { + lock.unlock(); + break; + } + else + lock.unlock(); + } + } + + void task_cancel() + { + thread_scoped_lock lock(rpc_lock); + RPCSend snd(socket, &error_func, "task_cancel"); + snd.write(); + } + + int get_split_task_count(DeviceTask &) + { + return 1; + } + + private: + NetworkError error_func; }; -Device *device_network_create(DeviceInfo& info, Stats &stats, Profiler &profiler, const char *address) +Device *device_network_create(DeviceInfo 
&info, + Stats &stats, + Profiler &profiler, + const char *address) { - return new NetworkDevice(info, stats, profiler, address); + return new NetworkDevice(info, stats, profiler, address); } -void device_network_info(vector<DeviceInfo>& devices) +void device_network_info(vector<DeviceInfo> &devices) { - DeviceInfo info; + DeviceInfo info; - info.type = DEVICE_NETWORK; - info.description = "Network Device"; - info.id = "NETWORK"; - info.num = 0; + info.type = DEVICE_NETWORK; + info.description = "Network Device"; + info.id = "NETWORK"; + info.num = 0; - /* todo: get this info from device */ - info.has_volume_decoupled = false; - info.has_osl = false; + /* todo: get this info from device */ + info.has_volume_decoupled = false; + info.has_osl = false; - devices.push_back(info); + devices.push_back(info); } class DeviceServer { -public: - thread_mutex rpc_lock; - - void network_error(const string &message) { - error_func.network_error(message); - } - - bool have_error() { return error_func.have_error(); } - - DeviceServer(Device *device_, tcp::socket& socket_) - : device(device_), socket(socket_), stop(false), blocked_waiting(false) - { - error_func = NetworkError(); - } - - void listen() - { - /* receive remote function calls */ - for(;;) { - listen_step(); - - if(stop) - break; - } - } - -protected: - void listen_step() - { - thread_scoped_lock lock(rpc_lock); - RPCReceive rcv(socket, &error_func); - - if(rcv.name == "stop") - stop = true; - else - process(rcv, lock); - } - - /* create a memory buffer for a device buffer and insert it into mem_data */ - DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size) - { - /* create a new DataVector and insert it into mem_data */ - pair<DataMap::iterator,bool> data_ins = mem_data.insert( - DataMap::value_type(client_pointer, DataVector())); - - /* make sure it was a unique insertion */ - assert(data_ins.second); - - /* get a reference to the inserted vector */ - DataVector &data_v = 
data_ins.first->second; - - /* size the vector */ - data_v.resize(data_size); - - return data_v; - } - - DataVector &data_vector_find(device_ptr client_pointer) - { - DataMap::iterator i = mem_data.find(client_pointer); - assert(i != mem_data.end()); - return i->second; - } - - /* setup mapping and reverse mapping of client_pointer<->real_pointer */ - void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer) - { - pair<PtrMap::iterator,bool> mapins; - - /* insert mapping from client pointer to our real device pointer */ - mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer)); - assert(mapins.second); - - /* insert reverse mapping from real our device pointer to client pointer */ - mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer)); - assert(mapins.second); - } - - device_ptr device_ptr_from_client_pointer(device_ptr client_pointer) - { - PtrMap::iterator i = ptr_map.find(client_pointer); - assert(i != ptr_map.end()); - return i->second; - } - - device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer) - { - PtrMap::iterator i = ptr_map.find(client_pointer); - assert(i != ptr_map.end()); - - device_ptr result = i->second; - - /* erase the mapping */ - ptr_map.erase(i); - - /* erase the reverse mapping */ - PtrMap::iterator irev = ptr_imap.find(result); - assert(irev != ptr_imap.end()); - ptr_imap.erase(irev); - - /* erase the data vector */ - DataMap::iterator idata = mem_data.find(client_pointer); - assert(idata != mem_data.end()); - mem_data.erase(idata); - - return result; - } - - /* note that the lock must be already acquired upon entry. - * This is necessary because the caller often peeks at - * the header and delegates control to here when it doesn't - * specifically handle the current RPC. 
- * The lock must be unlocked before returning */ - void process(RPCReceive& rcv, thread_scoped_lock &lock) - { - if(rcv.name == "mem_alloc") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - /* Allocate host side data buffer. */ - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (data_size)? (void*)&(data_v[0]): 0; - - /* Perform the allocation on the actual device. */ - device->mem_alloc(mem); - - /* Store a mapping to/from client_pointer and real device pointer. */ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - else if(rcv.name == "mem_copy_to") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - if(client_pointer) { - /* Lookup existing host side data buffer. */ - DataVector &data_v = data_vector_find(client_pointer); - mem.host_pointer = (void*)&data_v[0]; - - /* Translate the client pointer to a real device pointer. */ - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - } - else { - /* Allocate host side data buffer. */ - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (data_size)? (void*)&(data_v[0]): 0; - } - - /* Copy data from network into memory buffer. */ - rcv.read_buffer((uint8_t*)mem.host_pointer, data_size); - - /* Copy the data from the memory buffer to the device buffer. */ - device->mem_copy_to(mem); - - if(!client_pointer) { - /* Store a mapping to/from client_pointer and real device pointer. 
*/ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - } - else if(rcv.name == "mem_copy_from") { - string name; - network_device_memory mem(device); - int y, w, h, elem; - - rcv.read(mem, name); - rcv.read(y); - rcv.read(w); - rcv.read(h); - rcv.read(elem); - - device_ptr client_pointer = mem.device_pointer; - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - - DataVector &data_v = data_vector_find(client_pointer); - - mem.host_pointer = (device_ptr)&(data_v[0]); - - device->mem_copy_from(mem, y, w, h, elem); - - size_t data_size = mem.memory_size(); - - RPCSend snd(socket, &error_func, "mem_copy_from"); - snd.write(); - snd.write_buffer((uint8_t*)mem.host_pointer, data_size); - lock.unlock(); - } - else if(rcv.name == "mem_zero") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - if(client_pointer) { - /* Lookup existing host side data buffer. */ - DataVector &data_v = data_vector_find(client_pointer); - mem.host_pointer = (void*)&data_v[0]; - - /* Translate the client pointer to a real device pointer. */ - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - } - else { - /* Allocate host side data buffer. */ - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (void*)? (device_ptr)&(data_v[0]): 0; - } - - /* Zero memory. */ - device->mem_zero(mem); - - if(!client_pointer) { - /* Store a mapping to/from client_pointer and real device pointer. 
*/ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - } - else if(rcv.name == "mem_free") { - string name; - network_device_memory mem(device); - - rcv.read(mem, name); - lock.unlock(); - - device_ptr client_pointer = mem.device_pointer; - - mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer); - - device->mem_free(mem); - } - else if(rcv.name == "const_copy_to") { - string name_string; - size_t size; - - rcv.read(name_string); - rcv.read(size); - - vector<char> host_vector(size); - rcv.read_buffer(&host_vector[0], size); - lock.unlock(); - - device->const_copy_to(name_string.c_str(), &host_vector[0], size); - } - else if(rcv.name == "load_kernels") { - DeviceRequestedFeatures requested_features; - rcv.read(requested_features.experimental); - rcv.read(requested_features.max_closure); - rcv.read(requested_features.max_nodes_group); - rcv.read(requested_features.nodes_features); - - bool result; - result = device->load_kernels(requested_features); - RPCSend snd(socket, &error_func, "load_kernels"); - snd.add(result); - snd.write(); - lock.unlock(); - } - else if(rcv.name == "task_add") { - DeviceTask task; - - rcv.read(task); - lock.unlock(); - - if(task.buffer) - task.buffer = device_ptr_from_client_pointer(task.buffer); - - if(task.rgba_half) - task.rgba_half = device_ptr_from_client_pointer(task.rgba_half); - - if(task.rgba_byte) - task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte); - - if(task.shader_input) - task.shader_input = device_ptr_from_client_pointer(task.shader_input); - - if(task.shader_output) - task.shader_output = device_ptr_from_client_pointer(task.shader_output); - - task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2); - task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1); - task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample, this); - task.update_tile_sample = 
function_bind(&DeviceServer::task_update_tile_sample, this, _1); - task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this); - - device->task_add(task); - } - else if(rcv.name == "task_wait") { - lock.unlock(); - - blocked_waiting = true; - device->task_wait(); - blocked_waiting = false; - - lock.lock(); - RPCSend snd(socket, &error_func, "task_wait_done"); - snd.write(); - lock.unlock(); - } - else if(rcv.name == "task_cancel") { - lock.unlock(); - device->task_cancel(); - } - else if(rcv.name == "acquire_tile") { - AcquireEntry entry; - entry.name = rcv.name; - rcv.read(entry.tile); - acquire_queue.push_back(entry); - lock.unlock(); - } - else if(rcv.name == "acquire_tile_none") { - AcquireEntry entry; - entry.name = rcv.name; - acquire_queue.push_back(entry); - lock.unlock(); - } - else if(rcv.name == "release_tile") { - AcquireEntry entry; - entry.name = rcv.name; - acquire_queue.push_back(entry); - lock.unlock(); - } - else { - cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n"; - lock.unlock(); - } - } - - bool task_acquire_tile(Device *, RenderTile& tile) - { - thread_scoped_lock acquire_lock(acquire_mutex); - - bool result = false; - - RPCSend snd(socket, &error_func, "acquire_tile"); - snd.write(); - - do { - if(blocked_waiting) - listen_step(); - - /* todo: avoid busy wait loop */ - thread_scoped_lock lock(rpc_lock); - - if(!acquire_queue.empty()) { - AcquireEntry entry = acquire_queue.front(); - acquire_queue.pop_front(); - - if(entry.name == "acquire_tile") { - tile = entry.tile; - - if(tile.buffer) tile.buffer = ptr_map[tile.buffer]; - - result = true; - break; - } - else if(entry.name == "acquire_tile_none") { - break; - } - else { - cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n"; - } - } - } while(acquire_queue.empty() && !stop && !have_error()); - - return result; - } - - void task_update_progress_sample() - { - ; /* skip */ - } - - void task_update_tile_sample(RenderTile&) - { - ; /* skip 
*/ - } - - void task_release_tile(RenderTile& tile) - { - thread_scoped_lock acquire_lock(acquire_mutex); - - if(tile.buffer) tile.buffer = ptr_imap[tile.buffer]; - - { - thread_scoped_lock lock(rpc_lock); - RPCSend snd(socket, &error_func, "release_tile"); - snd.add(tile); - snd.write(); - lock.unlock(); - } - - do { - if(blocked_waiting) - listen_step(); - - /* todo: avoid busy wait loop */ - thread_scoped_lock lock(rpc_lock); - - if(!acquire_queue.empty()) { - AcquireEntry entry = acquire_queue.front(); - acquire_queue.pop_front(); - - if(entry.name == "release_tile") { - lock.unlock(); - break; - } - else { - cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n"; - } - } - } while(acquire_queue.empty() && !stop); - } - - bool task_get_cancel() - { - return false; - } - - /* properties */ - Device *device; - tcp::socket& socket; - - /* mapping of remote to local pointer */ - PtrMap ptr_map; - PtrMap ptr_imap; - DataMap mem_data; - - struct AcquireEntry { - string name; - RenderTile tile; - }; - - thread_mutex acquire_mutex; - list<AcquireEntry> acquire_queue; - - bool stop; - bool blocked_waiting; -private: - NetworkError error_func; - - /* todo: free memory and device (osl) on network error */ - + public: + thread_mutex rpc_lock; + + void network_error(const string &message) + { + error_func.network_error(message); + } + + bool have_error() + { + return error_func.have_error(); + } + + DeviceServer(Device *device_, tcp::socket &socket_) + : device(device_), socket(socket_), stop(false), blocked_waiting(false) + { + error_func = NetworkError(); + } + + void listen() + { + /* receive remote function calls */ + for (;;) { + listen_step(); + + if (stop) + break; + } + } + + protected: + void listen_step() + { + thread_scoped_lock lock(rpc_lock); + RPCReceive rcv(socket, &error_func); + + if (rcv.name == "stop") + stop = true; + else + process(rcv, lock); + } + + /* create a memory buffer for a device buffer and insert it into mem_data */ + 
DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size) + { + /* create a new DataVector and insert it into mem_data */ + pair<DataMap::iterator, bool> data_ins = mem_data.insert( + DataMap::value_type(client_pointer, DataVector())); + + /* make sure it was a unique insertion */ + assert(data_ins.second); + + /* get a reference to the inserted vector */ + DataVector &data_v = data_ins.first->second; + + /* size the vector */ + data_v.resize(data_size); + + return data_v; + } + + DataVector &data_vector_find(device_ptr client_pointer) + { + DataMap::iterator i = mem_data.find(client_pointer); + assert(i != mem_data.end()); + return i->second; + } + + /* setup mapping and reverse mapping of client_pointer<->real_pointer */ + void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer) + { + pair<PtrMap::iterator, bool> mapins; + + /* insert mapping from client pointer to our real device pointer */ + mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer)); + assert(mapins.second); + + /* insert reverse mapping from real our device pointer to client pointer */ + mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer)); + assert(mapins.second); + } + + device_ptr device_ptr_from_client_pointer(device_ptr client_pointer) + { + PtrMap::iterator i = ptr_map.find(client_pointer); + assert(i != ptr_map.end()); + return i->second; + } + + device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer) + { + PtrMap::iterator i = ptr_map.find(client_pointer); + assert(i != ptr_map.end()); + + device_ptr result = i->second; + + /* erase the mapping */ + ptr_map.erase(i); + + /* erase the reverse mapping */ + PtrMap::iterator irev = ptr_imap.find(result); + assert(irev != ptr_imap.end()); + ptr_imap.erase(irev); + + /* erase the data vector */ + DataMap::iterator idata = mem_data.find(client_pointer); + assert(idata != mem_data.end()); + mem_data.erase(idata); + + return result; + } + + /* note 
that the lock must be already acquired upon entry. + * This is necessary because the caller often peeks at + * the header and delegates control to here when it doesn't + * specifically handle the current RPC. + * The lock must be unlocked before returning */ + void process(RPCReceive &rcv, thread_scoped_lock &lock) + { + if (rcv.name == "mem_alloc") { + string name; + network_device_memory mem(device); + rcv.read(mem, name); + lock.unlock(); + + /* Allocate host side data buffer. */ + size_t data_size = mem.memory_size(); + device_ptr client_pointer = mem.device_pointer; + + DataVector &data_v = data_vector_insert(client_pointer, data_size); + mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0; + + /* Perform the allocation on the actual device. */ + device->mem_alloc(mem); + + /* Store a mapping to/from client_pointer and real device pointer. */ + pointer_mapping_insert(client_pointer, mem.device_pointer); + } + else if (rcv.name == "mem_copy_to") { + string name; + network_device_memory mem(device); + rcv.read(mem, name); + lock.unlock(); + + size_t data_size = mem.memory_size(); + device_ptr client_pointer = mem.device_pointer; + + if (client_pointer) { + /* Lookup existing host side data buffer. */ + DataVector &data_v = data_vector_find(client_pointer); + mem.host_pointer = (void *)&data_v[0]; + + /* Translate the client pointer to a real device pointer. */ + mem.device_pointer = device_ptr_from_client_pointer(client_pointer); + } + else { + /* Allocate host side data buffer. */ + DataVector &data_v = data_vector_insert(client_pointer, data_size); + mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0; + } + + /* Copy data from network into memory buffer. */ + rcv.read_buffer((uint8_t *)mem.host_pointer, data_size); + + /* Copy the data from the memory buffer to the device buffer. */ + device->mem_copy_to(mem); + + if (!client_pointer) { + /* Store a mapping to/from client_pointer and real device pointer. 
*/
+        pointer_mapping_insert(client_pointer, mem.device_pointer);
+      }
+    }
+    else if (rcv.name == "mem_copy_from") {
+      string name;
+      network_device_memory mem(device);
+      int y, w, h, elem;
+
+      rcv.read(mem, name);
+      rcv.read(y);
+      rcv.read(w);
+      rcv.read(h);
+      rcv.read(elem);
+
+      device_ptr client_pointer = mem.device_pointer;
+      mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
+
+      DataVector &data_v = data_vector_find(client_pointer);
+
+      mem.host_pointer = (device_ptr) & (data_v[0]);
+
+      device->mem_copy_from(mem, y, w, h, elem);
+
+      size_t data_size = mem.memory_size();
+
+      RPCSend snd(socket, &error_func, "mem_copy_from");
+      snd.write();
+      snd.write_buffer((uint8_t *)mem.host_pointer, data_size);
+      lock.unlock();
+    }
+    else if (rcv.name == "mem_zero") {
+      string name;
+      network_device_memory mem(device);
+      rcv.read(mem, name);
+      lock.unlock();
+
+      size_t data_size = mem.memory_size();
+      device_ptr client_pointer = mem.device_pointer;
+
+      if (client_pointer) {
+        /* Lookup existing host side data buffer. */
+        DataVector &data_v = data_vector_find(client_pointer);
+        mem.host_pointer = (void *)&data_v[0];
+
+        /* Translate the client pointer to a real device pointer. */
+        mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
+      }
+      else {
+        /* Allocate host side data buffer. */
+        DataVector &data_v = data_vector_insert(client_pointer, data_size);
+        mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
+      }
+
+      /* Zero memory. */
+      device->mem_zero(mem);
+
+      if (!client_pointer) {
+        /* Store a mapping to/from client_pointer and real device pointer.
*/ + pointer_mapping_insert(client_pointer, mem.device_pointer); + } + } + else if (rcv.name == "mem_free") { + string name; + network_device_memory mem(device); + + rcv.read(mem, name); + lock.unlock(); + + device_ptr client_pointer = mem.device_pointer; + + mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer); + + device->mem_free(mem); + } + else if (rcv.name == "const_copy_to") { + string name_string; + size_t size; + + rcv.read(name_string); + rcv.read(size); + + vector<char> host_vector(size); + rcv.read_buffer(&host_vector[0], size); + lock.unlock(); + + device->const_copy_to(name_string.c_str(), &host_vector[0], size); + } + else if (rcv.name == "load_kernels") { + DeviceRequestedFeatures requested_features; + rcv.read(requested_features.experimental); + rcv.read(requested_features.max_closure); + rcv.read(requested_features.max_nodes_group); + rcv.read(requested_features.nodes_features); + + bool result; + result = device->load_kernels(requested_features); + RPCSend snd(socket, &error_func, "load_kernels"); + snd.add(result); + snd.write(); + lock.unlock(); + } + else if (rcv.name == "task_add") { + DeviceTask task; + + rcv.read(task); + lock.unlock(); + + if (task.buffer) + task.buffer = device_ptr_from_client_pointer(task.buffer); + + if (task.rgba_half) + task.rgba_half = device_ptr_from_client_pointer(task.rgba_half); + + if (task.rgba_byte) + task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte); + + if (task.shader_input) + task.shader_input = device_ptr_from_client_pointer(task.shader_input); + + if (task.shader_output) + task.shader_output = device_ptr_from_client_pointer(task.shader_output); + + task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2); + task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1); + task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample, + this); + task.update_tile_sample = 
function_bind(&DeviceServer::task_update_tile_sample, this, _1); + task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this); + + device->task_add(task); + } + else if (rcv.name == "task_wait") { + lock.unlock(); + + blocked_waiting = true; + device->task_wait(); + blocked_waiting = false; + + lock.lock(); + RPCSend snd(socket, &error_func, "task_wait_done"); + snd.write(); + lock.unlock(); + } + else if (rcv.name == "task_cancel") { + lock.unlock(); + device->task_cancel(); + } + else if (rcv.name == "acquire_tile") { + AcquireEntry entry; + entry.name = rcv.name; + rcv.read(entry.tile); + acquire_queue.push_back(entry); + lock.unlock(); + } + else if (rcv.name == "acquire_tile_none") { + AcquireEntry entry; + entry.name = rcv.name; + acquire_queue.push_back(entry); + lock.unlock(); + } + else if (rcv.name == "release_tile") { + AcquireEntry entry; + entry.name = rcv.name; + acquire_queue.push_back(entry); + lock.unlock(); + } + else { + cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n"; + lock.unlock(); + } + } + + bool task_acquire_tile(Device *, RenderTile &tile) + { + thread_scoped_lock acquire_lock(acquire_mutex); + + bool result = false; + + RPCSend snd(socket, &error_func, "acquire_tile"); + snd.write(); + + do { + if (blocked_waiting) + listen_step(); + + /* todo: avoid busy wait loop */ + thread_scoped_lock lock(rpc_lock); + + if (!acquire_queue.empty()) { + AcquireEntry entry = acquire_queue.front(); + acquire_queue.pop_front(); + + if (entry.name == "acquire_tile") { + tile = entry.tile; + + if (tile.buffer) + tile.buffer = ptr_map[tile.buffer]; + + result = true; + break; + } + else if (entry.name == "acquire_tile_none") { + break; + } + else { + cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n"; + } + } + } while (acquire_queue.empty() && !stop && !have_error()); + + return result; + } + + void task_update_progress_sample() + { + ; /* skip */ + } + + void task_update_tile_sample(RenderTile &) + 
{ + ; /* skip */ + } + + void task_release_tile(RenderTile &tile) + { + thread_scoped_lock acquire_lock(acquire_mutex); + + if (tile.buffer) + tile.buffer = ptr_imap[tile.buffer]; + + { + thread_scoped_lock lock(rpc_lock); + RPCSend snd(socket, &error_func, "release_tile"); + snd.add(tile); + snd.write(); + lock.unlock(); + } + + do { + if (blocked_waiting) + listen_step(); + + /* todo: avoid busy wait loop */ + thread_scoped_lock lock(rpc_lock); + + if (!acquire_queue.empty()) { + AcquireEntry entry = acquire_queue.front(); + acquire_queue.pop_front(); + + if (entry.name == "release_tile") { + lock.unlock(); + break; + } + else { + cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n"; + } + } + } while (acquire_queue.empty() && !stop); + } + + bool task_get_cancel() + { + return false; + } + + /* properties */ + Device *device; + tcp::socket &socket; + + /* mapping of remote to local pointer */ + PtrMap ptr_map; + PtrMap ptr_imap; + DataMap mem_data; + + struct AcquireEntry { + string name; + RenderTile tile; + }; + + thread_mutex acquire_mutex; + list<AcquireEntry> acquire_queue; + + bool stop; + bool blocked_waiting; + + private: + NetworkError error_func; + + /* todo: free memory and device (osl) on network error */ }; void Device::server_run() { - try { - /* starts thread that responds to discovery requests */ - ServerDiscovery discovery; - - for(;;) { - /* accept connection */ - boost::asio::io_service io_service; - tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT)); - - tcp::socket socket(io_service); - acceptor.accept(socket); - - string remote_address = socket.remote_endpoint().address().to_string(); - printf("Connected to remote client at: %s\n", remote_address.c_str()); - - DeviceServer server(this, socket); - server.listen(); - - printf("Disconnected.\n"); - } - } - catch(exception& e) { - fprintf(stderr, "Network server exception: %s\n", e.what()); - } + try { + /* starts thread that responds to discovery 
requests */ + ServerDiscovery discovery; + + for (;;) { + /* accept connection */ + boost::asio::io_service io_service; + tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT)); + + tcp::socket socket(io_service); + acceptor.accept(socket); + + string remote_address = socket.remote_endpoint().address().to_string(); + printf("Connected to remote client at: %s\n", remote_address.c_str()); + + DeviceServer server(this, socket); + server.listen(); + + printf("Disconnected.\n"); + } + } + catch (exception &e) { + fprintf(stderr, "Network server exception: %s\n", e.what()); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h index 67626ae177f..5b69b815cc6 100644 --- a/intern/cycles/device/device_network.h +++ b/intern/cycles/device/device_network.h @@ -19,35 +19,35 @@ #ifdef WITH_NETWORK -#include <boost/archive/text_iarchive.hpp> -#include <boost/archive/text_oarchive.hpp> -#include <boost/archive/binary_iarchive.hpp> -#include <boost/archive/binary_oarchive.hpp> -#include <boost/array.hpp> -#include <boost/asio.hpp> -#include <boost/bind.hpp> -#include <boost/serialization/vector.hpp> -#include <boost/thread.hpp> - -#include <iostream> -#include <sstream> -#include <deque> - -#include "render/buffers.h" - -#include "util/util_foreach.h" -#include "util/util_list.h" -#include "util/util_map.h" -#include "util/util_param.h" -#include "util/util_string.h" +# include <boost/archive/text_iarchive.hpp> +# include <boost/archive/text_oarchive.hpp> +# include <boost/archive/binary_iarchive.hpp> +# include <boost/archive/binary_oarchive.hpp> +# include <boost/array.hpp> +# include <boost/asio.hpp> +# include <boost/bind.hpp> +# include <boost/serialization/vector.hpp> +# include <boost/thread.hpp> + +# include <iostream> +# include <sstream> +# include <deque> + +# include "render/buffers.h" + +# include "util/util_foreach.h" +# include "util/util_list.h" +# include "util/util_map.h" +# include 
"util/util_param.h" +# include "util/util_string.h" CCL_NAMESPACE_BEGIN -using std::cout; using std::cerr; +using std::cout; +using std::exception; using std::hex; using std::setw; -using std::exception; using boost::asio::ip::tcp; @@ -56,436 +56,435 @@ static const int DISCOVER_PORT = 5121; static const string DISCOVER_REQUEST_MSG = "REQUEST_RENDER_SERVER_IP"; static const string DISCOVER_REPLY_MSG = "REPLY_RENDER_SERVER_IP"; -#if 0 +# if 0 typedef boost::archive::text_oarchive o_archive; typedef boost::archive::text_iarchive i_archive; -#else +# else typedef boost::archive::binary_oarchive o_archive; typedef boost::archive::binary_iarchive i_archive; -#endif +# endif /* Serialization of device memory */ -class network_device_memory : public device_memory -{ -public: - network_device_memory(Device *device) - : device_memory(device, "", MEM_READ_ONLY) - { - } +class network_device_memory : public device_memory { + public: + network_device_memory(Device *device) : device_memory(device, "", MEM_READ_ONLY) + { + } - ~network_device_memory() - { - device_pointer = 0; - }; + ~network_device_memory() + { + device_pointer = 0; + }; - vector<char> local_data; + vector<char> local_data; }; /* Common netowrk error function / object for both DeviceNetwork and DeviceServer*/ class NetworkError { -public: - NetworkError() { - error = ""; - error_count = 0; - } - - ~NetworkError() {} - - void network_error(const string& message) { - error = message; - error_count += 1; - } - - bool have_error() { - return true ? error_count > 0 : false; - } - -private: - string error; - int error_count; + public: + NetworkError() + { + error = ""; + error_count = 0; + } + + ~NetworkError() + { + } + + void network_error(const string &message) + { + error = message; + error_count += 1; + } + + bool have_error() + { + return true ? 
error_count > 0 : false; + } + + private: + string error; + int error_count; }; - /* Remote procedure call Send */ class RPCSend { -public: - RPCSend(tcp::socket& socket_, NetworkError* e, const string& name_ = "") - : name(name_), socket(socket_), archive(archive_stream), sent(false) - { - archive & name_; - error_func = e; - fprintf(stderr, "rpc send %s\n", name.c_str()); - } - - ~RPCSend() - { - } - - void add(const device_memory& mem) - { - archive & mem.data_type & mem.data_elements & mem.data_size; - archive & mem.data_width & mem.data_height & mem.data_depth & mem.device_pointer; - archive & mem.type & string(mem.name); - archive & mem.interpolation & mem.extension; - archive & mem.device_pointer; - } - - template<typename T> void add(const T& data) - { - archive & data; - } - - void add(const DeviceTask& task) - { - int type = (int)task.type; - archive & type & task.x & task.y & task.w & task.h; - archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples; - archive & task.offset & task.stride; - archive & task.shader_input & task.shader_output & task.shader_eval_type; - archive & task.shader_x & task.shader_w; - archive & task.need_finish_queue; - } - - void add(const RenderTile& tile) - { - archive & tile.x & tile.y & tile.w & tile.h; - archive & tile.start_sample & tile.num_samples & tile.sample; - archive & tile.resolution & tile.offset & tile.stride; - archive & tile.buffer; - } - - void write() - { - boost::system::error_code error; - - /* get string from stream */ - string archive_str = archive_stream.str(); - - /* first send fixed size header with size of following data */ - ostringstream header_stream; - header_stream << setw(8) << hex << archive_str.size(); - string header_str = header_stream.str(); - - boost::asio::write(socket, - boost::asio::buffer(header_str), - boost::asio::transfer_all(), error); - - if(error.value()) - error_func->network_error(error.message()); - - /* then send actual data */ - 
boost::asio::write(socket, - boost::asio::buffer(archive_str), - boost::asio::transfer_all(), error); - - if(error.value()) - error_func->network_error(error.message()); - - sent = true; - } - - void write_buffer(void *buffer, size_t size) - { - boost::system::error_code error; - - boost::asio::write(socket, - boost::asio::buffer(buffer, size), - boost::asio::transfer_all(), error); - - if(error.value()) - error_func->network_error(error.message()); - } - -protected: - string name; - tcp::socket& socket; - ostringstream archive_stream; - o_archive archive; - bool sent; - NetworkError *error_func; + public: + RPCSend(tcp::socket &socket_, NetworkError *e, const string &name_ = "") + : name(name_), socket(socket_), archive(archive_stream), sent(false) + { + archive &name_; + error_func = e; + fprintf(stderr, "rpc send %s\n", name.c_str()); + } + + ~RPCSend() + { + } + + void add(const device_memory &mem) + { + archive &mem.data_type &mem.data_elements &mem.data_size; + archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer; + archive &mem.type &string(mem.name); + archive &mem.interpolation &mem.extension; + archive &mem.device_pointer; + } + + template<typename T> void add(const T &data) + { + archive &data; + } + + void add(const DeviceTask &task) + { + int type = (int)task.type; + archive &type &task.x &task.y &task.w &task.h; + archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples; + archive &task.offset &task.stride; + archive &task.shader_input &task.shader_output &task.shader_eval_type; + archive &task.shader_x &task.shader_w; + archive &task.need_finish_queue; + } + + void add(const RenderTile &tile) + { + archive &tile.x &tile.y &tile.w &tile.h; + archive &tile.start_sample &tile.num_samples &tile.sample; + archive &tile.resolution &tile.offset &tile.stride; + archive &tile.buffer; + } + + void write() + { + boost::system::error_code error; + + /* get string from stream */ + string archive_str = 
archive_stream.str(); + + /* first send fixed size header with size of following data */ + ostringstream header_stream; + header_stream << setw(8) << hex << archive_str.size(); + string header_str = header_stream.str(); + + boost::asio::write( + socket, boost::asio::buffer(header_str), boost::asio::transfer_all(), error); + + if (error.value()) + error_func->network_error(error.message()); + + /* then send actual data */ + boost::asio::write( + socket, boost::asio::buffer(archive_str), boost::asio::transfer_all(), error); + + if (error.value()) + error_func->network_error(error.message()); + + sent = true; + } + + void write_buffer(void *buffer, size_t size) + { + boost::system::error_code error; + + boost::asio::write( + socket, boost::asio::buffer(buffer, size), boost::asio::transfer_all(), error); + + if (error.value()) + error_func->network_error(error.message()); + } + + protected: + string name; + tcp::socket &socket; + ostringstream archive_stream; + o_archive archive; + bool sent; + NetworkError *error_func; }; /* Remote procedure call Receive */ class RPCReceive { -public: - RPCReceive(tcp::socket& socket_, NetworkError* e ) - : socket(socket_), archive_stream(NULL), archive(NULL) - { - error_func = e; - /* read head with fixed size */ - vector<char> header(8); - boost::system::error_code error; - size_t len = boost::asio::read(socket, boost::asio::buffer(header), error); - - if(error.value()) { - error_func->network_error(error.message()); - } - - /* verify if we got something */ - if(len == header.size()) { - /* decode header */ - string header_str(&header[0], header.size()); - istringstream header_stream(header_str); - - size_t data_size; - - if((header_stream >> hex >> data_size)) { - - vector<char> data(data_size); - size_t len = boost::asio::read(socket, boost::asio::buffer(data), error); - - if(error.value()) - error_func->network_error(error.message()); - - - if(len == data_size) { - archive_str = (data.size())? 
string(&data[0], data.size()): string(""); - - archive_stream = new istringstream(archive_str); - archive = new i_archive(*archive_stream); - - *archive & name; - fprintf(stderr, "rpc receive %s\n", name.c_str()); - } - else { - error_func->network_error("Network receive error: data size doesn't match header"); - } - } - else { - error_func->network_error("Network receive error: can't decode data size from header"); - } - } - else { - error_func->network_error("Network receive error: invalid header size"); - } - } - - ~RPCReceive() - { - delete archive; - delete archive_stream; - } - - void read(network_device_memory& mem, string& name) - { - *archive & mem.data_type & mem.data_elements & mem.data_size; - *archive & mem.data_width & mem.data_height & mem.data_depth & mem.device_pointer; - *archive & mem.type & name; - *archive & mem.interpolation & mem.extension; - *archive & mem.device_pointer; - - mem.name = name.c_str(); - mem.host_pointer = 0; - - /* Can't transfer OpenGL texture over network. 
*/ - if(mem.type == MEM_PIXELS) { - mem.type = MEM_READ_WRITE; - } - } - - template<typename T> void read(T& data) - { - *archive & data; - } - - void read_buffer(void *buffer, size_t size) - { - boost::system::error_code error; - size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error); - - if(error.value()) { - error_func->network_error(error.message()); - } - - if(len != size) - cout << "Network receive error: buffer size doesn't match expected size\n"; - } - - void read(DeviceTask& task) - { - int type; - - *archive & type & task.x & task.y & task.w & task.h; - *archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples; - *archive & task.offset & task.stride; - *archive & task.shader_input & task.shader_output & task.shader_eval_type; - *archive & task.shader_x & task.shader_w; - *archive & task.need_finish_queue; - - task.type = (DeviceTask::Type)type; - } - - void read(RenderTile& tile) - { - *archive & tile.x & tile.y & tile.w & tile.h; - *archive & tile.start_sample & tile.num_samples & tile.sample; - *archive & tile.resolution & tile.offset & tile.stride; - *archive & tile.buffer; - - tile.buffers = NULL; - } - - string name; - -protected: - tcp::socket& socket; - string archive_str; - istringstream *archive_stream; - i_archive *archive; - NetworkError *error_func; + public: + RPCReceive(tcp::socket &socket_, NetworkError *e) + : socket(socket_), archive_stream(NULL), archive(NULL) + { + error_func = e; + /* read head with fixed size */ + vector<char> header(8); + boost::system::error_code error; + size_t len = boost::asio::read(socket, boost::asio::buffer(header), error); + + if (error.value()) { + error_func->network_error(error.message()); + } + + /* verify if we got something */ + if (len == header.size()) { + /* decode header */ + string header_str(&header[0], header.size()); + istringstream header_stream(header_str); + + size_t data_size; + + if ((header_stream >> hex >> data_size)) { + + 
vector<char> data(data_size); + size_t len = boost::asio::read(socket, boost::asio::buffer(data), error); + + if (error.value()) + error_func->network_error(error.message()); + + if (len == data_size) { + archive_str = (data.size()) ? string(&data[0], data.size()) : string(""); + + archive_stream = new istringstream(archive_str); + archive = new i_archive(*archive_stream); + + *archive &name; + fprintf(stderr, "rpc receive %s\n", name.c_str()); + } + else { + error_func->network_error("Network receive error: data size doesn't match header"); + } + } + else { + error_func->network_error("Network receive error: can't decode data size from header"); + } + } + else { + error_func->network_error("Network receive error: invalid header size"); + } + } + + ~RPCReceive() + { + delete archive; + delete archive_stream; + } + + void read(network_device_memory &mem, string &name) + { + *archive &mem.data_type &mem.data_elements &mem.data_size; + *archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer; + *archive &mem.type &name; + *archive &mem.interpolation &mem.extension; + *archive &mem.device_pointer; + + mem.name = name.c_str(); + mem.host_pointer = 0; + + /* Can't transfer OpenGL texture over network. 
*/ + if (mem.type == MEM_PIXELS) { + mem.type = MEM_READ_WRITE; + } + } + + template<typename T> void read(T &data) + { + *archive &data; + } + + void read_buffer(void *buffer, size_t size) + { + boost::system::error_code error; + size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error); + + if (error.value()) { + error_func->network_error(error.message()); + } + + if (len != size) + cout << "Network receive error: buffer size doesn't match expected size\n"; + } + + void read(DeviceTask &task) + { + int type; + + *archive &type &task.x &task.y &task.w &task.h; + *archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples; + *archive &task.offset &task.stride; + *archive &task.shader_input &task.shader_output &task.shader_eval_type; + *archive &task.shader_x &task.shader_w; + *archive &task.need_finish_queue; + + task.type = (DeviceTask::Type)type; + } + + void read(RenderTile &tile) + { + *archive &tile.x &tile.y &tile.w &tile.h; + *archive &tile.start_sample &tile.num_samples &tile.sample; + *archive &tile.resolution &tile.offset &tile.stride; + *archive &tile.buffer; + + tile.buffers = NULL; + } + + string name; + + protected: + tcp::socket &socket; + string archive_str; + istringstream *archive_stream; + i_archive *archive; + NetworkError *error_func; }; /* Server auto discovery */ class ServerDiscovery { -public: - explicit ServerDiscovery(bool discover = false) - : listen_socket(io_service), collect_servers(false) - { - /* setup listen socket */ - listen_endpoint.address(boost::asio::ip::address_v4::any()); - listen_endpoint.port(DISCOVER_PORT); - - listen_socket.open(listen_endpoint.protocol()); - - boost::asio::socket_base::reuse_address option(true); - listen_socket.set_option(option); - - listen_socket.bind(listen_endpoint); - - /* setup receive callback */ - async_receive(); - - /* start server discovery */ - if(discover) { - collect_servers = true; - servers.clear(); - - 
broadcast_message(DISCOVER_REQUEST_MSG); - } - - /* start thread */ - work = new boost::asio::io_service::work(io_service); - thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service)); - } - - ~ServerDiscovery() - { - io_service.stop(); - thread->join(); - delete thread; - delete work; - } - - vector<string> get_server_list() - { - vector<string> result; - - mutex.lock(); - result = vector<string>(servers.begin(), servers.end()); - mutex.unlock(); - - return result; - } - -private: - void handle_receive_from(const boost::system::error_code& error, size_t size) - { - if(error) { - cout << "Server discovery receive error: " << error.message() << "\n"; - return; - } - - if(size > 0) { - string msg = string(receive_buffer, size); - - /* handle incoming message */ - if(collect_servers) { - if(msg == DISCOVER_REPLY_MSG) { - string address = receive_endpoint.address().to_string(); - - mutex.lock(); - - /* add address if it's not already in the list */ - bool found = std::find(servers.begin(), servers.end(), - address) != servers.end(); - - if(!found) - servers.push_back(address); - - mutex.unlock(); - } - } - else { - /* reply to request */ - if(msg == DISCOVER_REQUEST_MSG) - broadcast_message(DISCOVER_REPLY_MSG); - } - } - - async_receive(); - } - - void async_receive() - { - listen_socket.async_receive_from( - boost::asio::buffer(receive_buffer), receive_endpoint, - boost::bind(&ServerDiscovery::handle_receive_from, this, - boost::asio::placeholders::error, boost::asio::placeholders::bytes_transferred)); - } - - void broadcast_message(const string& msg) - { - /* setup broadcast socket */ - boost::asio::ip::udp::socket socket(io_service); - - socket.open(boost::asio::ip::udp::v4()); - - boost::asio::socket_base::broadcast option(true); - socket.set_option(option); - - boost::asio::ip::udp::endpoint broadcast_endpoint( - boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT); - - /* broadcast message */ - 
socket.send_to(boost::asio::buffer(msg), broadcast_endpoint); - } - - /* network service and socket */ - boost::asio::io_service io_service; - boost::asio::ip::udp::endpoint listen_endpoint; - boost::asio::ip::udp::socket listen_socket; - - /* threading */ - boost::thread *thread; - boost::asio::io_service::work *work; - boost::mutex mutex; - - /* buffer and endpoint for receiving messages */ - char receive_buffer[256]; - boost::asio::ip::udp::endpoint receive_endpoint; - - // os, version, devices, status, host name, group name, ip as far as fields go - struct ServerInfo { - string cycles_version; - string os; - int device_count; - string status; - string host_name; - string group_name; - string host_addr; - }; - - /* collection of server addresses in list */ - bool collect_servers; - vector<string> servers; + public: + explicit ServerDiscovery(bool discover = false) + : listen_socket(io_service), collect_servers(false) + { + /* setup listen socket */ + listen_endpoint.address(boost::asio::ip::address_v4::any()); + listen_endpoint.port(DISCOVER_PORT); + + listen_socket.open(listen_endpoint.protocol()); + + boost::asio::socket_base::reuse_address option(true); + listen_socket.set_option(option); + + listen_socket.bind(listen_endpoint); + + /* setup receive callback */ + async_receive(); + + /* start server discovery */ + if (discover) { + collect_servers = true; + servers.clear(); + + broadcast_message(DISCOVER_REQUEST_MSG); + } + + /* start thread */ + work = new boost::asio::io_service::work(io_service); + thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service)); + } + + ~ServerDiscovery() + { + io_service.stop(); + thread->join(); + delete thread; + delete work; + } + + vector<string> get_server_list() + { + vector<string> result; + + mutex.lock(); + result = vector<string>(servers.begin(), servers.end()); + mutex.unlock(); + + return result; + } + + private: + void handle_receive_from(const boost::system::error_code &error, size_t 
size) + { + if (error) { + cout << "Server discovery receive error: " << error.message() << "\n"; + return; + } + + if (size > 0) { + string msg = string(receive_buffer, size); + + /* handle incoming message */ + if (collect_servers) { + if (msg == DISCOVER_REPLY_MSG) { + string address = receive_endpoint.address().to_string(); + + mutex.lock(); + + /* add address if it's not already in the list */ + bool found = std::find(servers.begin(), servers.end(), address) != servers.end(); + + if (!found) + servers.push_back(address); + + mutex.unlock(); + } + } + else { + /* reply to request */ + if (msg == DISCOVER_REQUEST_MSG) + broadcast_message(DISCOVER_REPLY_MSG); + } + } + + async_receive(); + } + + void async_receive() + { + listen_socket.async_receive_from(boost::asio::buffer(receive_buffer), + receive_endpoint, + boost::bind(&ServerDiscovery::handle_receive_from, + this, + boost::asio::placeholders::error, + boost::asio::placeholders::bytes_transferred)); + } + + void broadcast_message(const string &msg) + { + /* setup broadcast socket */ + boost::asio::ip::udp::socket socket(io_service); + + socket.open(boost::asio::ip::udp::v4()); + + boost::asio::socket_base::broadcast option(true); + socket.set_option(option); + + boost::asio::ip::udp::endpoint broadcast_endpoint( + boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT); + + /* broadcast message */ + socket.send_to(boost::asio::buffer(msg), broadcast_endpoint); + } + + /* network service and socket */ + boost::asio::io_service io_service; + boost::asio::ip::udp::endpoint listen_endpoint; + boost::asio::ip::udp::socket listen_socket; + + /* threading */ + boost::thread *thread; + boost::asio::io_service::work *work; + boost::mutex mutex; + + /* buffer and endpoint for receiving messages */ + char receive_buffer[256]; + boost::asio::ip::udp::endpoint receive_endpoint; + + // os, version, devices, status, host name, group name, ip as far as fields go + struct ServerInfo { + string 
cycles_version; + string os; + int device_count; + string status; + string host_name; + string group_name; + string host_addr; + }; + + /* collection of server addresses in list */ + bool collect_servers; + vector<string> servers; }; CCL_NAMESPACE_END #endif -#endif /* __DEVICE_NETWORK_H__ */ +#endif /* __DEVICE_NETWORK_H__ */ diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index 4cefaa217f1..99a8d2438d6 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -16,218 +16,211 @@ #ifdef WITH_OPENCL -#include "device/opencl/opencl.h" +# include "device/opencl/opencl.h" -#include "device/device_intern.h" +# include "device/device_intern.h" -#include "util/util_foreach.h" -#include "util/util_logging.h" -#include "util/util_set.h" -#include "util/util_string.h" +# include "util/util_foreach.h" +# include "util/util_logging.h" +# include "util/util_set.h" +# include "util/util_string.h" CCL_NAMESPACE_BEGIN -Device *device_opencl_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background) +Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) { - return opencl_create_split_device(info, stats, profiler, background); + return opencl_create_split_device(info, stats, profiler, background); } bool device_opencl_init() { - static bool initialized = false; - static bool result = false; - - if(initialized) - return result; - - initialized = true; - - if(OpenCLInfo::device_type() != 0) { - int clew_result = clewInit(); - if(clew_result == CLEW_SUCCESS) { - VLOG(1) << "CLEW initialization succeeded."; - result = true; - } - else { - VLOG(1) << "CLEW initialization failed: " - << ((clew_result == CLEW_ERROR_ATEXIT_FAILED) - ? 
"Error setting up atexit() handler" - : "Error opening the library"); - } - } - else { - VLOG(1) << "Skip initializing CLEW, platform is force disabled."; - result = false; - } - - return result; + static bool initialized = false; + static bool result = false; + + if (initialized) + return result; + + initialized = true; + + if (OpenCLInfo::device_type() != 0) { + int clew_result = clewInit(); + if (clew_result == CLEW_SUCCESS) { + VLOG(1) << "CLEW initialization succeeded."; + result = true; + } + else { + VLOG(1) << "CLEW initialization failed: " + << ((clew_result == CLEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" : + "Error opening the library"); + } + } + else { + VLOG(1) << "Skip initializing CLEW, platform is force disabled."; + result = false; + } + + return result; } - static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms) { -#ifdef _WIN32 - __try { - return clGetPlatformIDs(0, NULL, num_platforms); - } - __except(EXCEPTION_EXECUTE_HANDLER) { - /* Ignore crashes inside the OpenCL driver and hope we can - * survive even with corrupted OpenCL installs. */ - fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n"); - } - - *num_platforms = 0; - return CL_DEVICE_NOT_FOUND; -#else - return clGetPlatformIDs(0, NULL, num_platforms); -#endif +# ifdef _WIN32 + __try { + return clGetPlatformIDs(0, NULL, num_platforms); + } + __except (EXCEPTION_EXECUTE_HANDLER) { + /* Ignore crashes inside the OpenCL driver and hope we can + * survive even with corrupted OpenCL installs. 
*/ + fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n"); + } + + *num_platforms = 0; + return CL_DEVICE_NOT_FOUND; +# else + return clGetPlatformIDs(0, NULL, num_platforms); +# endif } -void device_opencl_info(vector<DeviceInfo>& devices) +void device_opencl_info(vector<DeviceInfo> &devices) { - cl_uint num_platforms = 0; - device_opencl_get_num_platforms_safe(&num_platforms); - if(num_platforms == 0) { - return; - } - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - /* Devices are numbered consecutively across platforms. */ - int num_devices = 0; - set<string> unique_ids; - foreach(OpenCLPlatformDevice& platform_device, usable_devices) { - /* Compute unique ID for persistent user preferences. */ - const string& platform_name = platform_device.platform_name; - const string& device_name = platform_device.device_name; - string hardware_id = platform_device.hardware_id; - if(hardware_id == "") { - hardware_id = string_printf("ID_%d", num_devices); - } - string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id; - - /* Hardware ID might not be unique, add device number in that case. */ - if(unique_ids.find(id) != unique_ids.end()) { - id += string_printf("_ID_%d", num_devices); - } - unique_ids.insert(id); - - /* Create DeviceInfo. */ - DeviceInfo info; - info.type = DEVICE_OPENCL; - info.description = string_remove_trademark(string(device_name)); - info.num = num_devices; - /* We don't know if it's used for display, but assume it is. 
*/ - info.display_device = true; - info.use_split_kernel = true; - info.has_volume_decoupled = false; - info.id = id; - - /* Check OpenCL extensions */ - info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos; - - devices.push_back(info); - num_devices++; - } + cl_uint num_platforms = 0; + device_opencl_get_num_platforms_safe(&num_platforms); + if (num_platforms == 0) { + return; + } + + vector<OpenCLPlatformDevice> usable_devices; + OpenCLInfo::get_usable_devices(&usable_devices); + /* Devices are numbered consecutively across platforms. */ + int num_devices = 0; + set<string> unique_ids; + foreach (OpenCLPlatformDevice &platform_device, usable_devices) { + /* Compute unique ID for persistent user preferences. */ + const string &platform_name = platform_device.platform_name; + const string &device_name = platform_device.device_name; + string hardware_id = platform_device.hardware_id; + if (hardware_id == "") { + hardware_id = string_printf("ID_%d", num_devices); + } + string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id; + + /* Hardware ID might not be unique, add device number in that case. */ + if (unique_ids.find(id) != unique_ids.end()) { + id += string_printf("_ID_%d", num_devices); + } + unique_ids.insert(id); + + /* Create DeviceInfo. */ + DeviceInfo info; + info.type = DEVICE_OPENCL; + info.description = string_remove_trademark(string(device_name)); + info.num = num_devices; + /* We don't know if it's used for display, but assume it is. 
*/ + info.display_device = true; + info.use_split_kernel = true; + info.has_volume_decoupled = false; + info.id = id; + + /* Check OpenCL extensions */ + info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos; + + devices.push_back(info); + num_devices++; + } } string device_opencl_capabilities() { - if(OpenCLInfo::device_type() == 0) { - return "All OpenCL devices are forced to be OFF"; - } - string result = ""; - string error_msg = ""; /* Only used by opencl_assert(), but in the future - * it could also be nicely reported to the console. - */ - cl_uint num_platforms = 0; - opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms)); - if(num_platforms == 0) { - return "No OpenCL platforms found\n"; - } - result += string_printf("Number of platforms: %u\n", num_platforms); - - vector<cl_platform_id> platform_ids; - platform_ids.resize(num_platforms); - opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL)); - - typedef char cl_string[1024]; - -#define APPEND_INFO(func, id, name, what, type) \ - do { \ - type data; \ - memset(&data, 0, sizeof(data)); \ - opencl_assert(func(id, what, sizeof(data), &data, NULL)); \ - result += string_printf("%s: %s\n", name, to_string(data).c_str()); \ - } while(false) -#define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \ - do { \ - char data[1024] = "\0"; \ - size_t length = 0; \ - if(func(id, what, sizeof(data), &data, &length) == CL_SUCCESS) { \ - if(length != 0 && data[0] != '\0') { \ - result += string_printf("%s: %s\n", name, data); \ - } \ - } \ - } while(false) -#define APPEND_PLATFORM_INFO(id, name, what, type) \ - APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type) -#define APPEND_DEVICE_INFO(id, name, what, type) \ - APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type) -#define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \ - APPEND_STRING_EXTENSION_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what) - - 
vector<cl_device_id> device_ids; - for(cl_uint platform = 0; platform < num_platforms; ++platform) { - cl_platform_id platform_id = platform_ids[platform]; - - result += string_printf("Platform #%u\n", platform); - - APPEND_PLATFORM_INFO(platform_id, "Name", CL_PLATFORM_NAME, cl_string); - APPEND_PLATFORM_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR, cl_string); - APPEND_PLATFORM_INFO(platform_id, "Version", CL_PLATFORM_VERSION, cl_string); - APPEND_PLATFORM_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE, cl_string); - APPEND_PLATFORM_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS, cl_string); - - cl_uint num_devices = 0; - opencl_assert(clGetDeviceIDs(platform_ids[platform], - CL_DEVICE_TYPE_ALL, - 0, - NULL, - &num_devices)); - result += string_printf("\tNumber of devices: %u\n", num_devices); - - device_ids.resize(num_devices); - opencl_assert(clGetDeviceIDs(platform_ids[platform], - CL_DEVICE_TYPE_ALL, - num_devices, - &device_ids[0], - NULL)); - for(cl_uint device = 0; device < num_devices; ++device) { - cl_device_id device_id = device_ids[device]; - - result += string_printf("\t\tDevice: #%u\n", device); - - APPEND_DEVICE_INFO(device_id, "Name", CL_DEVICE_NAME, cl_string); - APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD); - APPEND_DEVICE_INFO(device_id, "Vendor", CL_DEVICE_VENDOR, cl_string); - APPEND_DEVICE_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION, cl_string); - APPEND_DEVICE_INFO(device_id, "Profile", CL_DEVICE_PROFILE, cl_string); - APPEND_DEVICE_INFO(device_id, "Version", CL_DEVICE_VERSION, cl_string); - APPEND_DEVICE_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS, cl_string); - APPEND_DEVICE_INFO(device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint); - APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint); - APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t); - } - } - -#undef 
APPEND_STRING_INFO -#undef APPEND_PLATFORM_STRING_INFO -#undef APPEND_DEVICE_STRING_INFO - - return result; + if (OpenCLInfo::device_type() == 0) { + return "All OpenCL devices are forced to be OFF"; + } + string result = ""; + string error_msg = ""; /* Only used by opencl_assert(), but in the future + * it could also be nicely reported to the console. + */ + cl_uint num_platforms = 0; + opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms)); + if (num_platforms == 0) { + return "No OpenCL platforms found\n"; + } + result += string_printf("Number of platforms: %u\n", num_platforms); + + vector<cl_platform_id> platform_ids; + platform_ids.resize(num_platforms); + opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL)); + + typedef char cl_string[1024]; + +# define APPEND_INFO(func, id, name, what, type) \ + do { \ + type data; \ + memset(&data, 0, sizeof(data)); \ + opencl_assert(func(id, what, sizeof(data), &data, NULL)); \ + result += string_printf("%s: %s\n", name, to_string(data).c_str()); \ + } while (false) +# define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \ + do { \ + char data[1024] = "\0"; \ + size_t length = 0; \ + if (func(id, what, sizeof(data), &data, &length) == CL_SUCCESS) { \ + if (length != 0 && data[0] != '\0') { \ + result += string_printf("%s: %s\n", name, data); \ + } \ + } \ + } while (false) +# define APPEND_PLATFORM_INFO(id, name, what, type) \ + APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type) +# define APPEND_DEVICE_INFO(id, name, what, type) \ + APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type) +# define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \ + APPEND_STRING_EXTENSION_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what) + + vector<cl_device_id> device_ids; + for (cl_uint platform = 0; platform < num_platforms; ++platform) { + cl_platform_id platform_id = platform_ids[platform]; + + result += string_printf("Platform #%u\n", platform); + + 
APPEND_PLATFORM_INFO(platform_id, "Name", CL_PLATFORM_NAME, cl_string); + APPEND_PLATFORM_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR, cl_string); + APPEND_PLATFORM_INFO(platform_id, "Version", CL_PLATFORM_VERSION, cl_string); + APPEND_PLATFORM_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE, cl_string); + APPEND_PLATFORM_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS, cl_string); + + cl_uint num_devices = 0; + opencl_assert( + clGetDeviceIDs(platform_ids[platform], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices)); + result += string_printf("\tNumber of devices: %u\n", num_devices); + + device_ids.resize(num_devices); + opencl_assert(clGetDeviceIDs( + platform_ids[platform], CL_DEVICE_TYPE_ALL, num_devices, &device_ids[0], NULL)); + for (cl_uint device = 0; device < num_devices; ++device) { + cl_device_id device_id = device_ids[device]; + + result += string_printf("\t\tDevice: #%u\n", device); + + APPEND_DEVICE_INFO(device_id, "Name", CL_DEVICE_NAME, cl_string); + APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD); + APPEND_DEVICE_INFO(device_id, "Vendor", CL_DEVICE_VENDOR, cl_string); + APPEND_DEVICE_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION, cl_string); + APPEND_DEVICE_INFO(device_id, "Profile", CL_DEVICE_PROFILE, cl_string); + APPEND_DEVICE_INFO(device_id, "Version", CL_DEVICE_VERSION, cl_string); + APPEND_DEVICE_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS, cl_string); + APPEND_DEVICE_INFO( + device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint); + APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint); + APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t); + } + } + +# undef APPEND_STRING_INFO +# undef APPEND_PLATFORM_STRING_INFO +# undef APPEND_DEVICE_STRING_INFO + + return result; } CCL_NAMESPACE_END -#endif /* WITH_OPENCL */ +#endif /* WITH_OPENCL */ diff --git 
a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp index ee566e57918..42e597a34d7 100644 --- a/intern/cycles/device/device_split_kernel.cpp +++ b/intern/cycles/device/device_split_kernel.cpp @@ -27,299 +27,304 @@ CCL_NAMESPACE_BEGIN static const double alpha = 0.1; /* alpha for rolling average */ DeviceSplitKernel::DeviceSplitKernel(Device *device) -: device(device), - split_data(device, "split_data"), - ray_state(device, "ray_state", MEM_READ_WRITE), - queue_index(device, "queue_index"), - use_queues_flag(device, "use_queues_flag"), - work_pool_wgs(device, "work_pool_wgs"), - kernel_data_initialized(false) + : device(device), + split_data(device, "split_data"), + ray_state(device, "ray_state", MEM_READ_WRITE), + queue_index(device, "queue_index"), + use_queues_flag(device, "use_queues_flag"), + work_pool_wgs(device, "work_pool_wgs"), + kernel_data_initialized(false) { - avg_time_per_sample = 0.0; - - kernel_path_init = NULL; - kernel_scene_intersect = NULL; - kernel_lamp_emission = NULL; - kernel_do_volume = NULL; - kernel_queue_enqueue = NULL; - kernel_indirect_background = NULL; - kernel_shader_setup = NULL; - kernel_shader_sort = NULL; - kernel_shader_eval = NULL; - kernel_holdout_emission_blurring_pathtermination_ao = NULL; - kernel_subsurface_scatter = NULL; - kernel_direct_lighting = NULL; - kernel_shadow_blocked_ao = NULL; - kernel_shadow_blocked_dl = NULL; - kernel_enqueue_inactive = NULL; - kernel_next_iteration_setup = NULL; - kernel_indirect_subsurface = NULL; - kernel_buffer_update = NULL; + avg_time_per_sample = 0.0; + + kernel_path_init = NULL; + kernel_scene_intersect = NULL; + kernel_lamp_emission = NULL; + kernel_do_volume = NULL; + kernel_queue_enqueue = NULL; + kernel_indirect_background = NULL; + kernel_shader_setup = NULL; + kernel_shader_sort = NULL; + kernel_shader_eval = NULL; + kernel_holdout_emission_blurring_pathtermination_ao = NULL; + kernel_subsurface_scatter = NULL; + kernel_direct_lighting 
= NULL; + kernel_shadow_blocked_ao = NULL; + kernel_shadow_blocked_dl = NULL; + kernel_enqueue_inactive = NULL; + kernel_next_iteration_setup = NULL; + kernel_indirect_subsurface = NULL; + kernel_buffer_update = NULL; } DeviceSplitKernel::~DeviceSplitKernel() { - split_data.free(); - ray_state.free(); - use_queues_flag.free(); - queue_index.free(); - work_pool_wgs.free(); - - delete kernel_path_init; - delete kernel_scene_intersect; - delete kernel_lamp_emission; - delete kernel_do_volume; - delete kernel_queue_enqueue; - delete kernel_indirect_background; - delete kernel_shader_setup; - delete kernel_shader_sort; - delete kernel_shader_eval; - delete kernel_holdout_emission_blurring_pathtermination_ao; - delete kernel_subsurface_scatter; - delete kernel_direct_lighting; - delete kernel_shadow_blocked_ao; - delete kernel_shadow_blocked_dl; - delete kernel_enqueue_inactive; - delete kernel_next_iteration_setup; - delete kernel_indirect_subsurface; - delete kernel_buffer_update; + split_data.free(); + ray_state.free(); + use_queues_flag.free(); + queue_index.free(); + work_pool_wgs.free(); + + delete kernel_path_init; + delete kernel_scene_intersect; + delete kernel_lamp_emission; + delete kernel_do_volume; + delete kernel_queue_enqueue; + delete kernel_indirect_background; + delete kernel_shader_setup; + delete kernel_shader_sort; + delete kernel_shader_eval; + delete kernel_holdout_emission_blurring_pathtermination_ao; + delete kernel_subsurface_scatter; + delete kernel_direct_lighting; + delete kernel_shadow_blocked_ao; + delete kernel_shadow_blocked_dl; + delete kernel_enqueue_inactive; + delete kernel_next_iteration_setup; + delete kernel_indirect_subsurface; + delete kernel_buffer_update; } -bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features) +bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features) { #define LOAD_KERNEL(name) \ - kernel_##name = get_split_kernel_function(#name, 
requested_features); \ - if(!kernel_##name) { \ - device->set_error(string("Split kernel error: failed to load kernel_") + #name); \ - return false; \ - } - - LOAD_KERNEL(path_init); - LOAD_KERNEL(scene_intersect); - LOAD_KERNEL(lamp_emission); - if (requested_features.use_volume) { - LOAD_KERNEL(do_volume); - } - LOAD_KERNEL(queue_enqueue); - LOAD_KERNEL(indirect_background); - LOAD_KERNEL(shader_setup); - LOAD_KERNEL(shader_sort); - LOAD_KERNEL(shader_eval); - LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); - LOAD_KERNEL(subsurface_scatter); - LOAD_KERNEL(direct_lighting); - LOAD_KERNEL(shadow_blocked_ao); - LOAD_KERNEL(shadow_blocked_dl); - LOAD_KERNEL(enqueue_inactive); - LOAD_KERNEL(next_iteration_setup); - LOAD_KERNEL(indirect_subsurface); - LOAD_KERNEL(buffer_update); + kernel_##name = get_split_kernel_function(#name, requested_features); \ + if (!kernel_##name) { \ + device->set_error(string("Split kernel error: failed to load kernel_") + #name); \ + return false; \ + } + + LOAD_KERNEL(path_init); + LOAD_KERNEL(scene_intersect); + LOAD_KERNEL(lamp_emission); + if (requested_features.use_volume) { + LOAD_KERNEL(do_volume); + } + LOAD_KERNEL(queue_enqueue); + LOAD_KERNEL(indirect_background); + LOAD_KERNEL(shader_setup); + LOAD_KERNEL(shader_sort); + LOAD_KERNEL(shader_eval); + LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); + LOAD_KERNEL(subsurface_scatter); + LOAD_KERNEL(direct_lighting); + LOAD_KERNEL(shadow_blocked_ao); + LOAD_KERNEL(shadow_blocked_dl); + LOAD_KERNEL(enqueue_inactive); + LOAD_KERNEL(next_iteration_setup); + LOAD_KERNEL(indirect_subsurface); + LOAD_KERNEL(buffer_update); #undef LOAD_KERNEL - /* Re-initialiaze kernel-dependent data when kernels change. */ - kernel_data_initialized = false; + /* Re-initialiaze kernel-dependent data when kernels change. 
*/ + kernel_data_initialized = false; - return true; + return true; } -size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size) +size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg, + device_memory &data, + uint64_t max_buffer_size) { - uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; - VLOG(1) << "Split state element size: " - << string_human_readable_number(size_per_element) << " bytes. (" - << string_human_readable_size(size_per_element) << ")."; - return max_buffer_size / size_per_element; + uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; + VLOG(1) << "Split state element size: " << string_human_readable_number(size_per_element) + << " bytes. (" << string_human_readable_size(size_per_element) << ")."; + return max_buffer_size / size_per_element; } bool DeviceSplitKernel::path_trace(DeviceTask *task, - RenderTile& tile, - device_memory& kgbuffer, - device_memory& kernel_data) + RenderTile &tile, + device_memory &kgbuffer, + device_memory &kernel_data) { - if(device->have_error()) { - return false; - } + if (device->have_error()) { + return false; + } - /* Allocate all required global memory once. */ - if(!kernel_data_initialized) { - kernel_data_initialized = true; + /* Allocate all required global memory once. */ + if (!kernel_data_initialized) { + kernel_data_initialized = true; - /* Set local size */ - int2 lsize = split_kernel_local_size(); - local_size[0] = lsize[0]; - local_size[1] = lsize[1]; + /* Set local size */ + int2 lsize = split_kernel_local_size(); + local_size[0] = lsize[0]; + local_size[1] = lsize[1]; - /* Set global size */ - int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task); + /* Set global size */ + int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task); - /* Make sure that set work size is a multiple of local - * work size dimensions. 
- */ - global_size[0] = round_up(gsize[0], local_size[0]); - global_size[1] = round_up(gsize[1], local_size[1]); + /* Make sure that set work size is a multiple of local + * work size dimensions. + */ + global_size[0] = round_up(gsize[0], local_size[0]); + global_size[1] = round_up(gsize[1], local_size[1]); - int num_global_elements = global_size[0] * global_size[1]; - assert(num_global_elements % WORK_POOL_SIZE == 0); + int num_global_elements = global_size[0] * global_size[1]; + assert(num_global_elements % WORK_POOL_SIZE == 0); - /* Calculate max groups */ + /* Calculate max groups */ - /* Denotes the maximum work groups possible w.r.t. current requested tile size. */ - unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU : WORK_POOL_SIZE_GPU; - unsigned int max_work_groups = num_global_elements / work_pool_size + 1; + /* Denotes the maximum work groups possible w.r.t. current requested tile size. */ + unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU : + WORK_POOL_SIZE_GPU; + unsigned int max_work_groups = num_global_elements / work_pool_size + 1; - /* Allocate work_pool_wgs memory. */ - work_pool_wgs.alloc_to_device(max_work_groups); - queue_index.alloc_to_device(NUM_QUEUES); - use_queues_flag.alloc_to_device(1); - split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements)); - ray_state.alloc(num_global_elements); - } + /* Allocate work_pool_wgs memory. 
*/ + work_pool_wgs.alloc_to_device(max_work_groups); + queue_index.alloc_to_device(NUM_QUEUES); + use_queues_flag.alloc_to_device(1); + split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements)); + ray_state.alloc(num_global_elements); + } - /* Number of elements in the global state buffer */ - int num_global_elements = global_size[0] * global_size[1]; + /* Number of elements in the global state buffer */ + int num_global_elements = global_size[0] * global_size[1]; #define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \ - if(device->have_error()) { \ - return false; \ - } \ - if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \ - return false; \ - } - - tile.sample = tile.start_sample; - - /* for exponential increase between tile updates */ - int time_multiplier = 1; - - while(tile.sample < tile.start_sample + tile.num_samples) { - /* to keep track of how long it takes to run a number of samples */ - double start_time = time_dt(); - - /* initial guess to start rolling average */ - const int initial_num_samples = 1; - /* approx number of samples per second */ - int samples_per_second = (avg_time_per_sample > 0.0) ? 
- int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples; - - RenderTile subtile = tile; - subtile.start_sample = tile.sample; - subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample); - - if(device->have_error()) { - return false; - } - - /* reset state memory here as global size for data_init - * kernel might not be large enough to do in kernel - */ - work_pool_wgs.zero_to_device(); - split_data.zero_to_device(); - ray_state.zero_to_device(); - - if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), - subtile, - num_global_elements, - kgbuffer, - kernel_data, - split_data, - ray_state, - queue_index, - use_queues_flag, - work_pool_wgs)) - { - return false; - } - - ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size); - - bool activeRaysAvailable = true; - double cancel_time = DBL_MAX; - - while(activeRaysAvailable) { - /* Do path-iteration in host [Enqueue Path-iteration kernels. */ - for(int PathIter = 0; PathIter < 16; PathIter++) { - ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); - if (kernel_do_volume) { - ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size); - } - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, 
global_size, local_size); - ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size); - - if(task->get_cancel() && cancel_time == DBL_MAX) { - /* Wait up to twice as many seconds for current samples to finish - * to avoid artifacts in render result from ending too soon. - */ - cancel_time = time_dt() + 2.0 * time_multiplier; - } - - if(time_dt() > cancel_time) { - return true; - } - } - - /* Decide if we should exit path-iteration in host. */ - ray_state.copy_from_device(0, global_size[0] * global_size[1], 1); - - activeRaysAvailable = false; - - for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) { - if(!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) { - if(IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) { - /* Something went wrong, abort to avoid looping endlessly. */ - device->set_error("Split kernel error: invalid ray state"); - return false; - } - - /* Not all rays are RAY_INACTIVE. 
*/ - activeRaysAvailable = true; - break; - } - } - - if(time_dt() > cancel_time) { - return true; - } - } - - double time_per_sample = ((time_dt()-start_time) / subtile.num_samples); - - if(avg_time_per_sample == 0.0) { - /* start rolling average */ - avg_time_per_sample = time_per_sample; - } - else { - avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample; - } + if (device->have_error()) { \ + return false; \ + } \ + if (!kernel_##name->enqueue( \ + KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \ + return false; \ + } + + tile.sample = tile.start_sample; + + /* for exponential increase between tile updates */ + int time_multiplier = 1; + + while (tile.sample < tile.start_sample + tile.num_samples) { + /* to keep track of how long it takes to run a number of samples */ + double start_time = time_dt(); + + /* initial guess to start rolling average */ + const int initial_num_samples = 1; + /* approx number of samples per second */ + int samples_per_second = (avg_time_per_sample > 0.0) ? 
+ int(double(time_multiplier) / avg_time_per_sample) + 1 : + initial_num_samples; + + RenderTile subtile = tile; + subtile.start_sample = tile.sample; + subtile.num_samples = min(samples_per_second, + tile.start_sample + tile.num_samples - tile.sample); + + if (device->have_error()) { + return false; + } + + /* reset state memory here as global size for data_init + * kernel might not be large enough to do in kernel + */ + work_pool_wgs.zero_to_device(); + split_data.zero_to_device(); + ray_state.zero_to_device(); + + if (!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), + subtile, + num_global_elements, + kgbuffer, + kernel_data, + split_data, + ray_state, + queue_index, + use_queues_flag, + work_pool_wgs)) { + return false; + } + + ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size); + + bool activeRaysAvailable = true; + double cancel_time = DBL_MAX; + + while (activeRaysAvailable) { + /* Do path-iteration in host [Enqueue Path-iteration kernels. */ + for (int PathIter = 0; PathIter < 16; PathIter++) { + ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); + if (kernel_do_volume) { + ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size); + } + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); + ENQUEUE_SPLIT_KERNEL( + holdout_emission_blurring_pathtermination_ao, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, 
global_size, local_size); + ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size); + + if (task->get_cancel() && cancel_time == DBL_MAX) { + /* Wait up to twice as many seconds for current samples to finish + * to avoid artifacts in render result from ending too soon. + */ + cancel_time = time_dt() + 2.0 * time_multiplier; + } + + if (time_dt() > cancel_time) { + return true; + } + } + + /* Decide if we should exit path-iteration in host. */ + ray_state.copy_from_device(0, global_size[0] * global_size[1], 1); + + activeRaysAvailable = false; + + for (int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) { + if (!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) { + if (IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) { + /* Something went wrong, abort to avoid looping endlessly. */ + device->set_error("Split kernel error: invalid ray state"); + return false; + } + + /* Not all rays are RAY_INACTIVE. 
*/ + activeRaysAvailable = true; + break; + } + } + + if (time_dt() > cancel_time) { + return true; + } + } + + double time_per_sample = ((time_dt() - start_time) / subtile.num_samples); + + if (avg_time_per_sample == 0.0) { + /* start rolling average */ + avg_time_per_sample = time_per_sample; + } + else { + avg_time_per_sample = alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample; + } #undef ENQUEUE_SPLIT_KERNEL - tile.sample += subtile.num_samples; - task->update_progress(&tile, tile.w*tile.h*subtile.num_samples); + tile.sample += subtile.num_samples; + task->update_progress(&tile, tile.w * tile.h * subtile.num_samples); - time_multiplier = min(time_multiplier << 1, 10); + time_multiplier = min(time_multiplier << 1, 10); - if(task->get_cancel()) { - return true; - } - } + if (task->get_cancel()) { + return true; + } + } - return true; + return true; } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h index 622733b843f..c9fb2ac844f 100644 --- a/intern/cycles/device/device_split_kernel.h +++ b/intern/cycles/device/device_split_kernel.h @@ -27,106 +27,115 @@ CCL_NAMESPACE_BEGIN * Since some bytes may be needed for aligning chunks of memory; * This is the amount of memory that we dedicate for that purpose. 
*/ -#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB +#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB /* Types used for split kernel */ class KernelDimensions { -public: - size_t global_size[2]; - size_t local_size[2]; - - KernelDimensions(size_t global_size_[2], size_t local_size_[2]) - { - memcpy(global_size, global_size_, sizeof(global_size)); - memcpy(local_size, local_size_, sizeof(local_size)); - } + public: + size_t global_size[2]; + size_t local_size[2]; + + KernelDimensions(size_t global_size_[2], size_t local_size_[2]) + { + memcpy(global_size, global_size_, sizeof(global_size)); + memcpy(local_size, local_size_, sizeof(local_size)); + } }; class SplitKernelFunction { -public: - virtual ~SplitKernelFunction() {} + public: + virtual ~SplitKernelFunction() + { + } - /* enqueue the kernel, returns false if there is an error */ - virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) = 0; + /* enqueue the kernel, returns false if there is an error */ + virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) = 0; }; class DeviceSplitKernel { -private: - Device *device; - - SplitKernelFunction *kernel_path_init; - SplitKernelFunction *kernel_scene_intersect; - SplitKernelFunction *kernel_lamp_emission; - SplitKernelFunction *kernel_do_volume; - SplitKernelFunction *kernel_queue_enqueue; - SplitKernelFunction *kernel_indirect_background; - SplitKernelFunction *kernel_shader_setup; - SplitKernelFunction *kernel_shader_sort; - SplitKernelFunction *kernel_shader_eval; - SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao; - SplitKernelFunction *kernel_subsurface_scatter; - SplitKernelFunction *kernel_direct_lighting; - SplitKernelFunction *kernel_shadow_blocked_ao; - SplitKernelFunction *kernel_shadow_blocked_dl; - SplitKernelFunction *kernel_enqueue_inactive; - SplitKernelFunction *kernel_next_iteration_setup; - SplitKernelFunction *kernel_indirect_subsurface; - 
SplitKernelFunction *kernel_buffer_update; - - /* Global memory variables [porting]; These memory is used for - * co-operation between different kernels; Data written by one - * kernel will be available to another kernel via this global - * memory. - */ - device_only_memory<uchar> split_data; - device_vector<uchar> ray_state; - device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */ - - /* Flag to make sceneintersect and lampemission kernel use queues. */ - device_only_memory<char> use_queues_flag; - - /* Approximate time it takes to complete one sample */ - double avg_time_per_sample; - - /* Work pool with respect to each work group. */ - device_only_memory<unsigned int> work_pool_wgs; - - /* Cached kernel-dependent data, initialized once. */ - bool kernel_data_initialized; - size_t local_size[2]; - size_t global_size[2]; - -public: - explicit DeviceSplitKernel(Device* device); - virtual ~DeviceSplitKernel(); - - bool load_kernels(const DeviceRequestedFeatures& requested_features); - bool path_trace(DeviceTask *task, - RenderTile& rtile, - device_memory& kgbuffer, - device_memory& kernel_data); - - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0; - size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data_, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs) = 0; - - virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures&) = 0; - virtual int2 split_kernel_local_size() = 0; - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0; + 
private: + Device *device; + + SplitKernelFunction *kernel_path_init; + SplitKernelFunction *kernel_scene_intersect; + SplitKernelFunction *kernel_lamp_emission; + SplitKernelFunction *kernel_do_volume; + SplitKernelFunction *kernel_queue_enqueue; + SplitKernelFunction *kernel_indirect_background; + SplitKernelFunction *kernel_shader_setup; + SplitKernelFunction *kernel_shader_sort; + SplitKernelFunction *kernel_shader_eval; + SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao; + SplitKernelFunction *kernel_subsurface_scatter; + SplitKernelFunction *kernel_direct_lighting; + SplitKernelFunction *kernel_shadow_blocked_ao; + SplitKernelFunction *kernel_shadow_blocked_dl; + SplitKernelFunction *kernel_enqueue_inactive; + SplitKernelFunction *kernel_next_iteration_setup; + SplitKernelFunction *kernel_indirect_subsurface; + SplitKernelFunction *kernel_buffer_update; + + /* Global memory variables [porting]; These memory is used for + * co-operation between different kernels; Data written by one + * kernel will be available to another kernel via this global + * memory. + */ + device_only_memory<uchar> split_data; + device_vector<uchar> ray_state; + device_only_memory<int> + queue_index; /* Array of size num_queues that tracks the size of each queue. */ + + /* Flag to make sceneintersect and lampemission kernel use queues. */ + device_only_memory<char> use_queues_flag; + + /* Approximate time it takes to complete one sample */ + double avg_time_per_sample; + + /* Work pool with respect to each work group. */ + device_only_memory<unsigned int> work_pool_wgs; + + /* Cached kernel-dependent data, initialized once. 
*/ + bool kernel_data_initialized; + size_t local_size[2]; + size_t global_size[2]; + + public: + explicit DeviceSplitKernel(Device *device); + virtual ~DeviceSplitKernel(); + + bool load_kernels(const DeviceRequestedFeatures &requested_features); + bool path_trace(DeviceTask *task, + RenderTile &rtile, + device_memory &kgbuffer, + device_memory &kernel_data); + + virtual uint64_t state_buffer_size(device_memory &kg, + device_memory &data, + size_t num_threads) = 0; + size_t max_elements_for_max_buffer_size(device_memory &kg, + device_memory &data, + uint64_t max_buffer_size); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory &kernel_globals, + device_memory &kernel_data_, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs) = 0; + + virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, + const DeviceRequestedFeatures &) = 0; + virtual int2 split_kernel_local_size() = 0; + virtual int2 split_kernel_global_size(device_memory &kg, + device_memory &data, + DeviceTask *task) = 0; }; CCL_NAMESPACE_END -#endif /* __DEVICE_SPLIT_KERNEL_H__ */ +#endif /* __DEVICE_SPLIT_KERNEL_H__ */ diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index 8310863886c..376ad06a734 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -29,100 +29,111 @@ CCL_NAMESPACE_BEGIN /* Device Task */ DeviceTask::DeviceTask(Type type_) -: type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0), - sample(0), num_samples(1), - shader_input(0), shader_output(0), - shader_eval_type(0), shader_filter(0), shader_x(0), shader_w(0) + : type(type_), + x(0), + y(0), + w(0), + h(0), + rgba_byte(0), + rgba_half(0), + buffer(0), + sample(0), + num_samples(1), + shader_input(0), + shader_output(0), + 
shader_eval_type(0), + shader_filter(0), + shader_x(0), + shader_w(0) { - last_update_time = time_dt(); + last_update_time = time_dt(); } int DeviceTask::get_subtask_count(int num, int max_size) { - if(max_size != 0) { - int max_size_num; - - if(type == SHADER) { - max_size_num = (shader_w + max_size - 1)/max_size; - } - else { - max_size = max(1, max_size/w); - max_size_num = (h + max_size - 1)/max_size; - } - - num = max(max_size_num, num); - } - - if(type == SHADER) { - num = min(shader_w, num); - } - else if(type == RENDER) { - } - else { - num = min(h, num); - } - - return num; + if (max_size != 0) { + int max_size_num; + + if (type == SHADER) { + max_size_num = (shader_w + max_size - 1) / max_size; + } + else { + max_size = max(1, max_size / w); + max_size_num = (h + max_size - 1) / max_size; + } + + num = max(max_size_num, num); + } + + if (type == SHADER) { + num = min(shader_w, num); + } + else if (type == RENDER) { + } + else { + num = min(h, num); + } + + return num; } -void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size) +void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size) { - num = get_subtask_count(num, max_size); - - if(type == SHADER) { - for(int i = 0; i < num; i++) { - int tx = shader_x + (shader_w/num)*i; - int tw = (i == num-1)? shader_w - i*(shader_w/num): shader_w/num; - - DeviceTask task = *this; - - task.shader_x = tx; - task.shader_w = tw; - - tasks.push_back(task); - } - } - else if(type == RENDER) { - for(int i = 0; i < num; i++) - tasks.push_back(*this); - } - else { - for(int i = 0; i < num; i++) { - int ty = y + (h/num)*i; - int th = (i == num-1)? h - i*(h/num): h/num; - - DeviceTask task = *this; - - task.y = ty; - task.h = th; - - tasks.push_back(task); - } - } + num = get_subtask_count(num, max_size); + + if (type == SHADER) { + for (int i = 0; i < num; i++) { + int tx = shader_x + (shader_w / num) * i; + int tw = (i == num - 1) ? 
shader_w - i * (shader_w / num) : shader_w / num; + + DeviceTask task = *this; + + task.shader_x = tx; + task.shader_w = tw; + + tasks.push_back(task); + } + } + else if (type == RENDER) { + for (int i = 0; i < num; i++) + tasks.push_back(*this); + } + else { + for (int i = 0; i < num; i++) { + int ty = y + (h / num) * i; + int th = (i == num - 1) ? h - i * (h / num) : h / num; + + DeviceTask task = *this; + + task.y = ty; + task.h = th; + + tasks.push_back(task); + } + } } void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples) { - if((type != RENDER) && - (type != SHADER)) - return; - - if(update_progress_sample) { - if(pixel_samples == -1) { - pixel_samples = shader_w; - } - update_progress_sample(pixel_samples, rtile? rtile->sample : 0); - } - - if(update_tile_sample) { - double current_time = time_dt(); - - if(current_time - last_update_time >= 1.0) { - update_tile_sample(*rtile); - - last_update_time = current_time; - } - } + if ((type != RENDER) && (type != SHADER)) + return; + + if (update_progress_sample) { + if (pixel_samples == -1) { + pixel_samples = shader_w; + } + update_progress_sample(pixel_samples, rtile ? rtile->sample : 0); + } + + if (update_tile_sample) { + double current_time = time_dt(); + + if (current_time - last_update_time >= 1.0) { + update_tile_sample(*rtile); + + last_update_time = current_time; + } + } } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index f1fd4246868..5cc2e5e25db 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -33,87 +33,88 @@ class RenderTile; class Tile; class DenoiseParams { -public: - /* Pixel radius for neighbouring pixels to take into account. */ - int radius; - /* Controls neighbor pixel weighting for the denoising filter. */ - float strength; - /* Preserve more or less detail based on feature passes. 
*/ - float feature_strength; - /* When removing pixels that don't carry information, use a relative threshold instead of an absolute one. */ - bool relative_pca; - /* How many frames before and after the current center frame are included. */ - int neighbor_frames; - /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */ - bool clamp_input; - - DenoiseParams() - { - radius = 8; - strength = 0.5f; - feature_strength = 0.5f; - relative_pca = false; - neighbor_frames = 2; - clamp_input = true; - } + public: + /* Pixel radius for neighbouring pixels to take into account. */ + int radius; + /* Controls neighbor pixel weighting for the denoising filter. */ + float strength; + /* Preserve more or less detail based on feature passes. */ + float feature_strength; + /* When removing pixels that don't carry information, use a relative threshold instead of an absolute one. */ + bool relative_pca; + /* How many frames before and after the current center frame are included. */ + int neighbor_frames; + /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. 
*/ + bool clamp_input; + + DenoiseParams() + { + radius = 8; + strength = 0.5f; + feature_strength = 0.5f; + relative_pca = false; + neighbor_frames = 2; + clamp_input = true; + } }; class DeviceTask : public Task { -public: - typedef enum { RENDER, FILM_CONVERT, SHADER } Type; - Type type; - - int x, y, w, h; - device_ptr rgba_byte; - device_ptr rgba_half; - device_ptr buffer; - int sample; - int num_samples; - int offset, stride; - - device_ptr shader_input; - device_ptr shader_output; - int shader_eval_type; - int shader_filter; - int shader_x, shader_w; - - int passes_size; - - explicit DeviceTask(Type type = RENDER); - - int get_subtask_count(int num, int max_size = 0); - void split(list<DeviceTask>& tasks, int num, int max_size = 0); - - void update_progress(RenderTile *rtile, int pixel_samples = -1); - - function<bool(Device *device, RenderTile&)> acquire_tile; - function<void(long, int)> update_progress_sample; - function<void(RenderTile&)> update_tile_sample; - function<void(RenderTile&)> release_tile; - function<bool()> get_cancel; - function<void(RenderTile*, Device*)> map_neighbor_tiles; - function<void(RenderTile*, Device*)> unmap_neighbor_tiles; - - DenoiseParams denoising; - bool denoising_from_render; - vector<int> denoising_frames; - - bool denoising_do_filter; - bool denoising_write_passes; - - int pass_stride; - int frame_stride; - int target_pass_stride; - int pass_denoising_data; - int pass_denoising_clean; - - bool need_finish_queue; - bool integrator_branched; - int2 requested_tile_size; -protected: - double last_update_time; + public: + typedef enum { RENDER, FILM_CONVERT, SHADER } Type; + Type type; + + int x, y, w, h; + device_ptr rgba_byte; + device_ptr rgba_half; + device_ptr buffer; + int sample; + int num_samples; + int offset, stride; + + device_ptr shader_input; + device_ptr shader_output; + int shader_eval_type; + int shader_filter; + int shader_x, shader_w; + + int passes_size; + + explicit DeviceTask(Type type = RENDER); + + int 
get_subtask_count(int num, int max_size = 0); + void split(list<DeviceTask> &tasks, int num, int max_size = 0); + + void update_progress(RenderTile *rtile, int pixel_samples = -1); + + function<bool(Device *device, RenderTile &)> acquire_tile; + function<void(long, int)> update_progress_sample; + function<void(RenderTile &)> update_tile_sample; + function<void(RenderTile &)> release_tile; + function<bool()> get_cancel; + function<void(RenderTile *, Device *)> map_neighbor_tiles; + function<void(RenderTile *, Device *)> unmap_neighbor_tiles; + + DenoiseParams denoising; + bool denoising_from_render; + vector<int> denoising_frames; + + bool denoising_do_filter; + bool denoising_write_passes; + + int pass_stride; + int frame_stride; + int target_pass_stride; + int pass_denoising_data; + int pass_denoising_clean; + + bool need_finish_queue; + bool integrator_branched; + int2 requested_tile_size; + + protected: + double last_update_time; }; CCL_NAMESPACE_END -#endif /* __DEVICE_TASK_H__ */ +#endif /* __DEVICE_TASK_H__ */ diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp index 9cb105982aa..f85aadce1c2 100644 --- a/intern/cycles/device/opencl/memory_manager.cpp +++ b/intern/cycles/device/opencl/memory_manager.cpp @@ -16,241 +16,246 @@ #ifdef WITH_OPENCL -#include "util/util_foreach.h" +# include "util/util_foreach.h" -#include "device/opencl/opencl.h" -#include "device/opencl/memory_manager.h" +# include "device/opencl/opencl.h" +# include "device/opencl/memory_manager.h" CCL_NAMESPACE_BEGIN -void MemoryManager::DeviceBuffer::add_allocation(Allocation& allocation) +void MemoryManager::DeviceBuffer::add_allocation(Allocation &allocation) { - allocations.push_back(&allocation); + allocations.push_back(&allocation); } void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device) { - bool need_realloc = false; - - /* Calculate total size and remove any freed. 
*/ - size_t total_size = 0; - - for(int i = allocations.size()-1; i >= 0; i--) { - Allocation* allocation = allocations[i]; - - /* Remove allocations that have been freed. */ - if(!allocation->mem || allocation->mem->memory_size() == 0) { - allocation->device_buffer = NULL; - allocation->size = 0; - - allocations.erase(allocations.begin()+i); - - need_realloc = true; - - continue; - } - - /* Get actual size for allocation. */ - size_t alloc_size = align_up(allocation->mem->memory_size(), 16); - - if(allocation->size != alloc_size) { - /* Allocation is either new or resized. */ - allocation->size = alloc_size; - allocation->needs_copy_to_device = true; - - need_realloc = true; - } - - total_size += alloc_size; - } - - if(need_realloc) { - cl_ulong max_buffer_size; - clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - - if(total_size > max_buffer_size) { - device->set_error("Scene too complex to fit in available memory."); - return; - } - - device_only_memory<uchar> *new_buffer = - new device_only_memory<uchar>(device, "memory manager buffer"); - - new_buffer->alloc_to_device(total_size); - - size_t offset = 0; - - foreach(Allocation* allocation, allocations) { - if(allocation->needs_copy_to_device) { - /* Copy from host to device. */ - opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue, - CL_MEM_PTR(new_buffer->device_pointer), - CL_FALSE, - offset, - allocation->mem->memory_size(), - allocation->mem->host_pointer, - 0, NULL, NULL - )); - - allocation->needs_copy_to_device = false; - } - else { - /* Fast copy from memory already on device. 
*/ - opencl_device_assert(device, clEnqueueCopyBuffer(device->cqCommandQueue, - CL_MEM_PTR(buffer->device_pointer), - CL_MEM_PTR(new_buffer->device_pointer), - allocation->desc.offset, - offset, - allocation->mem->memory_size(), - 0, NULL, NULL - )); - } - - allocation->desc.offset = offset; - offset += allocation->size; - } - - delete buffer; - - buffer = new_buffer; - } - else { - assert(total_size == buffer->data_size); - - size_t offset = 0; - - foreach(Allocation* allocation, allocations) { - if(allocation->needs_copy_to_device) { - /* Copy from host to device. */ - opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue, - CL_MEM_PTR(buffer->device_pointer), - CL_FALSE, - offset, - allocation->mem->memory_size(), - allocation->mem->host_pointer, - 0, NULL, NULL - )); - - allocation->needs_copy_to_device = false; - } - - offset += allocation->size; - } - } - - /* Not really necessary, but seems to improve responsiveness for some reason. */ - clFinish(device->cqCommandQueue); + bool need_realloc = false; + + /* Calculate total size and remove any freed. */ + size_t total_size = 0; + + for (int i = allocations.size() - 1; i >= 0; i--) { + Allocation *allocation = allocations[i]; + + /* Remove allocations that have been freed. */ + if (!allocation->mem || allocation->mem->memory_size() == 0) { + allocation->device_buffer = NULL; + allocation->size = 0; + + allocations.erase(allocations.begin() + i); + + need_realloc = true; + + continue; + } + + /* Get actual size for allocation. */ + size_t alloc_size = align_up(allocation->mem->memory_size(), 16); + + if (allocation->size != alloc_size) { + /* Allocation is either new or resized. 
*/ + allocation->size = alloc_size; + allocation->needs_copy_to_device = true; + + need_realloc = true; + } + + total_size += alloc_size; + } + + if (need_realloc) { + cl_ulong max_buffer_size; + clGetDeviceInfo( + device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); + + if (total_size > max_buffer_size) { + device->set_error("Scene too complex to fit in available memory."); + return; + } + + device_only_memory<uchar> *new_buffer = new device_only_memory<uchar>(device, + "memory manager buffer"); + + new_buffer->alloc_to_device(total_size); + + size_t offset = 0; + + foreach (Allocation *allocation, allocations) { + if (allocation->needs_copy_to_device) { + /* Copy from host to device. */ + opencl_device_assert(device, + clEnqueueWriteBuffer(device->cqCommandQueue, + CL_MEM_PTR(new_buffer->device_pointer), + CL_FALSE, + offset, + allocation->mem->memory_size(), + allocation->mem->host_pointer, + 0, + NULL, + NULL)); + + allocation->needs_copy_to_device = false; + } + else { + /* Fast copy from memory already on device. */ + opencl_device_assert(device, + clEnqueueCopyBuffer(device->cqCommandQueue, + CL_MEM_PTR(buffer->device_pointer), + CL_MEM_PTR(new_buffer->device_pointer), + allocation->desc.offset, + offset, + allocation->mem->memory_size(), + 0, + NULL, + NULL)); + } + + allocation->desc.offset = offset; + offset += allocation->size; + } + + delete buffer; + + buffer = new_buffer; + } + else { + assert(total_size == buffer->data_size); + + size_t offset = 0; + + foreach (Allocation *allocation, allocations) { + if (allocation->needs_copy_to_device) { + /* Copy from host to device. 
*/ + opencl_device_assert(device, + clEnqueueWriteBuffer(device->cqCommandQueue, + CL_MEM_PTR(buffer->device_pointer), + CL_FALSE, + offset, + allocation->mem->memory_size(), + allocation->mem->host_pointer, + 0, + NULL, + NULL)); + + allocation->needs_copy_to_device = false; + } + + offset += allocation->size; + } + } + + /* Not really necessary, but seems to improve responsiveness for some reason. */ + clFinish(device->cqCommandQueue); } void MemoryManager::DeviceBuffer::free(OpenCLDevice *) { - buffer->free(); + buffer->free(); } -MemoryManager::DeviceBuffer* MemoryManager::smallest_device_buffer() +MemoryManager::DeviceBuffer *MemoryManager::smallest_device_buffer() { - DeviceBuffer* smallest = device_buffers; + DeviceBuffer *smallest = device_buffers; - foreach(DeviceBuffer& device_buffer, device_buffers) { - if(device_buffer.size < smallest->size) { - smallest = &device_buffer; - } - } + foreach (DeviceBuffer &device_buffer, device_buffers) { + if (device_buffer.size < smallest->size) { + smallest = &device_buffer; + } + } - return smallest; + return smallest; } -MemoryManager::MemoryManager(OpenCLDevice *device) -: device(device), need_update(false) +MemoryManager::MemoryManager(OpenCLDevice *device) : device(device), need_update(false) { - foreach(DeviceBuffer& device_buffer, device_buffers) { - device_buffer.buffer = - new device_only_memory<uchar>(device, "memory manager buffer"); - } + foreach (DeviceBuffer &device_buffer, device_buffers) { + device_buffer.buffer = new device_only_memory<uchar>(device, "memory manager buffer"); + } } void MemoryManager::free() { - foreach(DeviceBuffer& device_buffer, device_buffers) { - device_buffer.free(device); - } + foreach (DeviceBuffer &device_buffer, device_buffers) { + device_buffer.free(device); + } } -void MemoryManager::alloc(const char *name, device_memory& mem) +void MemoryManager::alloc(const char *name, device_memory &mem) { - Allocation& allocation = allocations[name]; + Allocation &allocation = 
allocations[name]; - allocation.mem = &mem; - allocation.needs_copy_to_device = true; + allocation.mem = &mem; + allocation.needs_copy_to_device = true; - if(!allocation.device_buffer) { - DeviceBuffer* device_buffer = smallest_device_buffer(); - allocation.device_buffer = device_buffer; + if (!allocation.device_buffer) { + DeviceBuffer *device_buffer = smallest_device_buffer(); + allocation.device_buffer = device_buffer; - allocation.desc.device_buffer = device_buffer - device_buffers; + allocation.desc.device_buffer = device_buffer - device_buffers; - device_buffer->add_allocation(allocation); + device_buffer->add_allocation(allocation); - device_buffer->size += mem.memory_size(); - } + device_buffer->size += mem.memory_size(); + } - need_update = true; + need_update = true; } -bool MemoryManager::free(device_memory& mem) +bool MemoryManager::free(device_memory &mem) { - foreach(AllocationsMap::value_type& value, allocations) { - Allocation& allocation = value.second; - if(allocation.mem == &mem) { + foreach (AllocationsMap::value_type &value, allocations) { + Allocation &allocation = value.second; + if (allocation.mem == &mem) { - allocation.device_buffer->size -= mem.memory_size(); + allocation.device_buffer->size -= mem.memory_size(); - allocation.mem = NULL; - allocation.needs_copy_to_device = false; + allocation.mem = NULL; + allocation.needs_copy_to_device = false; - need_update = true; - return true; - } - } + need_update = true; + return true; + } + } - return false; + return false; } MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name) { - update_device_memory(); + update_device_memory(); - Allocation& allocation = allocations[name]; - return allocation.desc; + Allocation &allocation = allocations[name]; + return allocation.desc; } void MemoryManager::update_device_memory() { - if(!need_update) { - return; - } + if (!need_update) { + return; + } - need_update = false; + need_update = false; - foreach(DeviceBuffer& device_buffer, 
device_buffers) { - device_buffer.update_device_memory(device); - } + foreach (DeviceBuffer &device_buffer, device_buffers) { + device_buffer.update_device_memory(device); + } } void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) { - update_device_memory(); - - foreach(DeviceBuffer& device_buffer, device_buffers) { - if(device_buffer.buffer->device_pointer) { - device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer); - } - else { - device->kernel_set_args(kernel, (*narg)++, device->null_mem); - } - } + update_device_memory(); + + foreach (DeviceBuffer &device_buffer, device_buffers) { + if (device_buffer.buffer->device_pointer) { + device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer); + } + else { + device->kernel_set_args(kernel, (*narg)++, device->null_mem); + } + } } CCL_NAMESPACE_END -#endif /* WITH_OPENCL */ +#endif /* WITH_OPENCL */ diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h index 8fcc4440369..2fbc97a0756 100644 --- a/intern/cycles/device/opencl/memory_manager.h +++ b/intern/cycles/device/opencl/memory_manager.h @@ -29,78 +29,77 @@ CCL_NAMESPACE_BEGIN class OpenCLDevice; class MemoryManager { -public: - static const int NUM_DEVICE_BUFFERS = 8; + public: + static const int NUM_DEVICE_BUFFERS = 8; - struct BufferDescriptor { - uint device_buffer; - cl_ulong offset; - }; + struct BufferDescriptor { + uint device_buffer; + cl_ulong offset; + }; -private: - struct DeviceBuffer; + private: + struct DeviceBuffer; - struct Allocation { - device_memory *mem; + struct Allocation { + device_memory *mem; - DeviceBuffer *device_buffer; - size_t size; /* Size of actual allocation, may be larger than requested. */ + DeviceBuffer *device_buffer; + size_t size; /* Size of actual allocation, may be larger than requested. 
*/ - BufferDescriptor desc; + BufferDescriptor desc; - bool needs_copy_to_device; + bool needs_copy_to_device; - Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false) - { - } - }; + Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false) + { + } + }; - struct DeviceBuffer { - device_only_memory<uchar> *buffer; - vector<Allocation*> allocations; - size_t size; /* Size of all allocations. */ + struct DeviceBuffer { + device_only_memory<uchar> *buffer; + vector<Allocation *> allocations; + size_t size; /* Size of all allocations. */ - DeviceBuffer() - : buffer(NULL), size(0) - { - } + DeviceBuffer() : buffer(NULL), size(0) + { + } - ~DeviceBuffer() - { - delete buffer; - buffer = NULL; - } + ~DeviceBuffer() + { + delete buffer; + buffer = NULL; + } - void add_allocation(Allocation& allocation); + void add_allocation(Allocation &allocation); - void update_device_memory(OpenCLDevice *device); + void update_device_memory(OpenCLDevice *device); - void free(OpenCLDevice *device); - }; + void free(OpenCLDevice *device); + }; - OpenCLDevice *device; + OpenCLDevice *device; - DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS]; + DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS]; - typedef unordered_map<string, Allocation> AllocationsMap; - AllocationsMap allocations; + typedef unordered_map<string, Allocation> AllocationsMap; + AllocationsMap allocations; - bool need_update; + bool need_update; - DeviceBuffer* smallest_device_buffer(); + DeviceBuffer *smallest_device_buffer(); -public: - MemoryManager(OpenCLDevice *device); + public: + MemoryManager(OpenCLDevice *device); - void free(); /* Free all memory. */ + void free(); /* Free all memory. 
*/ - void alloc(const char *name, device_memory& mem); - bool free(device_memory& mem); + void alloc(const char *name, device_memory &mem); + bool free(device_memory &mem); - BufferDescriptor get_descriptor(string name); + BufferDescriptor get_descriptor(string name); - void update_device_memory(); - void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); + void update_device_memory(); + void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 89761293638..e7bafa0b8a8 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -16,645 +16,641 @@ #ifdef WITH_OPENCL -#include "device/device.h" -#include "device/device_denoising.h" -#include "device/device_split_kernel.h" +# include "device/device.h" +# include "device/device_denoising.h" +# include "device/device_split_kernel.h" -#include "util/util_map.h" -#include "util/util_param.h" -#include "util/util_string.h" +# include "util/util_map.h" +# include "util/util_param.h" +# include "util/util_string.h" -#include "clew.h" +# include "clew.h" -#include "device/opencl/memory_manager.h" +# include "device/opencl/memory_manager.h" CCL_NAMESPACE_BEGIN /* Disable workarounds, seems to be working fine on latest drivers. */ -#define CYCLES_DISABLE_DRIVER_WORKAROUNDS +# define CYCLES_DISABLE_DRIVER_WORKAROUNDS /* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */ -#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS +# ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS /* Work around AMD driver hangs by ensuring each command is finished before doing anything else. 
*/ -# undef clEnqueueNDRangeKernel -# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); +# undef clEnqueueNDRangeKernel +# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ + CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); -# undef clEnqueueWriteBuffer -# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); +# undef clEnqueueWriteBuffer +# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ + CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); -# undef clEnqueueReadBuffer -# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); -#endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ +# undef clEnqueueReadBuffer +# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ + CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); +# endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ -#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) +# define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) struct OpenCLPlatformDevice { - OpenCLPlatformDevice(cl_platform_id platform_id, - const string& platform_name, - cl_device_id device_id, - cl_device_type device_type, - const string& device_name, - const string& hardware_id, - const string& device_extensions) - : platform_id(platform_id), - platform_name(platform_name), - device_id(device_id), - device_type(device_type), - device_name(device_name), - hardware_id(hardware_id), - device_extensions(device_extensions) {} - cl_platform_id platform_id; - string platform_name; - cl_device_id device_id; - cl_device_type device_type; - string device_name; - string hardware_id; - string device_extensions; + OpenCLPlatformDevice(cl_platform_id platform_id, + const string &platform_name, + 
cl_device_id device_id, + cl_device_type device_type, + const string &device_name, + const string &hardware_id, + const string &device_extensions) + : platform_id(platform_id), + platform_name(platform_name), + device_id(device_id), + device_type(device_type), + device_name(device_name), + hardware_id(hardware_id), + device_extensions(device_extensions) + { + } + cl_platform_id platform_id; + string platform_name; + cl_device_id device_id; + cl_device_type device_type; + string device_name; + string hardware_id; + string device_extensions; }; /* Contains all static OpenCL helper functions. */ -class OpenCLInfo -{ -public: - static cl_device_type device_type(); - static bool use_debug(); - static bool device_supported(const string& platform_name, - const cl_device_id device_id); - static bool platform_version_check(cl_platform_id platform, - string *error = NULL); - static bool device_version_check(cl_device_id device, - string *error = NULL); - static string get_hardware_id(const string& platform_name, - cl_device_id device_id); - static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, - bool force_all = false); - - /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */ - - /* Platform information. 
*/ - static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL); - static cl_uint get_num_platforms(); - - static bool get_platforms(vector<cl_platform_id> *platform_ids, - cl_int *error = NULL); - static vector<cl_platform_id> get_platforms(); - - static bool get_platform_name(cl_platform_id platform_id, - string *platform_name); - static string get_platform_name(cl_platform_id platform_id); - - static bool get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - cl_uint *num_devices, - cl_int *error = NULL); - static cl_uint get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type); - - static bool get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - vector<cl_device_id> *device_ids, - cl_int* error = NULL); - static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type); - - /* Device information. */ - static bool get_device_name(cl_device_id device_id, - string *device_name, - cl_int* error = NULL); - - static string get_device_name(cl_device_id device_id); - - static bool get_device_extensions(cl_device_id device_id, - string *device_extensions, - cl_int* error = NULL); - - static string get_device_extensions(cl_device_id device_id); - - static bool get_device_type(cl_device_id device_id, - cl_device_type *device_type, - cl_int* error = NULL); - static cl_device_type get_device_type(cl_device_id device_id); - - static bool get_driver_version(cl_device_id device_id, - int *major, - int *minor, - cl_int* error = NULL); - - static int mem_sub_ptr_alignment(cl_device_id device_id); - - /* Get somewhat more readable device name. - * Main difference is AMD OpenCL here which only gives code name - * for the regular device name. This will give more sane device - * name using some extensions. 
- */ - static string get_readable_device_name(cl_device_id device_id); +class OpenCLInfo { + public: + static cl_device_type device_type(); + static bool use_debug(); + static bool device_supported(const string &platform_name, const cl_device_id device_id); + static bool platform_version_check(cl_platform_id platform, string *error = NULL); + static bool device_version_check(cl_device_id device, string *error = NULL); + static string get_hardware_id(const string &platform_name, cl_device_id device_id); + static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, + bool force_all = false); + + /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */ + + /* Platform information. */ + static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL); + static cl_uint get_num_platforms(); + + static bool get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error = NULL); + static vector<cl_platform_id> get_platforms(); + + static bool get_platform_name(cl_platform_id platform_id, string *platform_name); + static string get_platform_name(cl_platform_id platform_id); + + static bool get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + cl_uint *num_devices, + cl_int *error = NULL); + static cl_uint get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type); + + static bool get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + vector<cl_device_id> *device_ids, + cl_int *error = NULL); + static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type); + + /* Device information. 
*/ + static bool get_device_name(cl_device_id device_id, string *device_name, cl_int *error = NULL); + + static string get_device_name(cl_device_id device_id); + + static bool get_device_extensions(cl_device_id device_id, + string *device_extensions, + cl_int *error = NULL); + + static string get_device_extensions(cl_device_id device_id); + + static bool get_device_type(cl_device_id device_id, + cl_device_type *device_type, + cl_int *error = NULL); + static cl_device_type get_device_type(cl_device_id device_id); + + static bool get_driver_version(cl_device_id device_id, + int *major, + int *minor, + cl_int *error = NULL); + + static int mem_sub_ptr_alignment(cl_device_id device_id); + + /* Get somewhat more readable device name. + * Main difference is AMD OpenCL here which only gives code name + * for the regular device name. This will give more sane device + * name using some extensions. + */ + static string get_readable_device_name(cl_device_id device_id); }; /* Thread safe cache for contexts and programs. */ -class OpenCLCache -{ - struct Slot - { - struct ProgramEntry - { - ProgramEntry(); - ProgramEntry(const ProgramEntry& rhs); - ~ProgramEntry(); - cl_program program; - thread_mutex *mutex; - }; - - Slot(); - Slot(const Slot& rhs); - ~Slot(); - - thread_mutex *context_mutex; - cl_context context; - typedef map<ustring, ProgramEntry> EntryMap; - EntryMap programs; - - }; - - /* key is combination of platform ID and device ID */ - typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair; - - /* map of Slot objects */ - typedef map<PlatformDevicePair, Slot> CacheMap; - CacheMap cache; - - /* MD5 hash of the kernel source. */ - string kernel_md5; - - thread_mutex cache_lock; - thread_mutex kernel_md5_lock; - - /* lazy instantiate */ - static OpenCLCache& global_instance(); - -public: - - enum ProgramName { - OCL_DEV_BASE_PROGRAM, - OCL_DEV_MEGAKERNEL_PROGRAM, - }; - - /* Lookup context in the cache. 
If this returns NULL, slot_locker - * will be holding a lock for the cache. slot_locker should refer to a - * default constructed thread_scoped_lock. */ - static cl_context get_context(cl_platform_id platform, - cl_device_id device, - thread_scoped_lock& slot_locker); - /* Same as above. */ - static cl_program get_program(cl_platform_id platform, - cl_device_id device, - ustring key, - thread_scoped_lock& slot_locker); - - /* Store context in the cache. You MUST have tried to get the item before storing to it. */ - static void store_context(cl_platform_id platform, - cl_device_id device, - cl_context context, - thread_scoped_lock& slot_locker); - /* Same as above. */ - static void store_program(cl_platform_id platform, - cl_device_id device, - cl_program program, - ustring key, - thread_scoped_lock& slot_locker); - - static string get_kernel_md5(); +class OpenCLCache { + struct Slot { + struct ProgramEntry { + ProgramEntry(); + ProgramEntry(const ProgramEntry &rhs); + ~ProgramEntry(); + cl_program program; + thread_mutex *mutex; + }; + + Slot(); + Slot(const Slot &rhs); + ~Slot(); + + thread_mutex *context_mutex; + cl_context context; + typedef map<ustring, ProgramEntry> EntryMap; + EntryMap programs; + }; + + /* key is combination of platform ID and device ID */ + typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair; + + /* map of Slot objects */ + typedef map<PlatformDevicePair, Slot> CacheMap; + CacheMap cache; + + /* MD5 hash of the kernel source. */ + string kernel_md5; + + thread_mutex cache_lock; + thread_mutex kernel_md5_lock; + + /* lazy instantiate */ + static OpenCLCache &global_instance(); + + public: + enum ProgramName { + OCL_DEV_BASE_PROGRAM, + OCL_DEV_MEGAKERNEL_PROGRAM, + }; + + /* Lookup context in the cache. If this returns NULL, slot_locker + * will be holding a lock for the cache. slot_locker should refer to a + * default constructed thread_scoped_lock. 
*/ + static cl_context get_context(cl_platform_id platform, + cl_device_id device, + thread_scoped_lock &slot_locker); + /* Same as above. */ + static cl_program get_program(cl_platform_id platform, + cl_device_id device, + ustring key, + thread_scoped_lock &slot_locker); + + /* Store context in the cache. You MUST have tried to get the item before storing to it. */ + static void store_context(cl_platform_id platform, + cl_device_id device, + cl_context context, + thread_scoped_lock &slot_locker); + /* Same as above. */ + static void store_program(cl_platform_id platform, + cl_device_id device, + cl_program program, + ustring key, + thread_scoped_lock &slot_locker); + + static string get_kernel_md5(); }; -#define opencl_device_assert(device, stmt) \ - { \ - cl_int err = stmt; \ - \ - if(err != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ - if((device)->error_message() == "") \ - (device)->set_error(message); \ - fprintf(stderr, "%s\n", message.c_str()); \ - } \ - } (void) 0 - -#define opencl_assert(stmt) \ - { \ - cl_int err = stmt; \ - \ - if(err != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ - if(error_msg == "") \ - error_msg = message; \ - fprintf(stderr, "%s\n", message.c_str()); \ - } \ - } (void) 0 - -class OpenCLDevice : public Device -{ -public: - DedicatedTaskPool task_pool; - - /* Task pool for required kernels (base, AO kernels during foreground rendering) */ - TaskPool load_required_kernel_task_pool; - /* Task pool for optional kernels (feature kernels during foreground rendering) */ - TaskPool load_kernel_task_pool; - cl_context cxContext; - cl_command_queue cqCommandQueue; - cl_platform_id cpPlatform; - cl_device_id cdDevice; - cl_int ciErr; - int device_num; - bool use_preview_kernels; - - class OpenCLProgram { - public: - OpenCLProgram() : loaded(false), 
needs_compiling(true), program(NULL), device(NULL) {} - OpenCLProgram(OpenCLDevice *device, - const string& program_name, - const string& kernel_name, - const string& kernel_build_options, - bool use_stdout = true); - ~OpenCLProgram(); - - void add_kernel(ustring name); - - /* Try to load the program from device cache or disk */ - bool load(); - /* Compile the kernel (first separate, failback to local) */ - void compile(); - /* Create the OpenCL kernels after loading or compiling */ - void create_kernels(); - - bool is_loaded() const { return loaded; } - const string& get_log() const { return log; } - void report_error(); - - /* Wait until this kernel is available to be used - * It will return true when the kernel is available. - * It will return false when the kernel is not available - * or could not be loaded. */ - bool wait_for_availability(); - - cl_kernel operator()(); - cl_kernel operator()(ustring name); - - void release(); - - private: - bool build_kernel(const string *debug_src); - /* Build the program by calling the own process. - * This is required for multithreaded OpenCL compilation, since most Frameworks serialize - * build calls internally if they come from the same process. - * If that is not supported, this function just returns false. - */ - bool compile_separate(const string& clbin); - /* Build the program by calling OpenCL directly. */ - bool compile_kernel(const string *debug_src); - /* Loading and saving the program from/to disk. */ - bool load_binary(const string& clbin, const string *debug_src = NULL); - bool save_binary(const string& clbin); - - void add_log(const string& msg, bool is_debug); - void add_error(const string& msg); - - bool loaded; - bool needs_compiling; - - cl_program program; - OpenCLDevice *device; - - /* Used for the OpenCLCache key. 
*/ - string program_name; - - string kernel_file, kernel_build_options, device_md5; - - bool use_stdout; - string log, error_msg; - string compile_output; - - map<ustring, cl_kernel> kernels; - }; - - /* Container for all types of split programs. */ - class OpenCLSplitPrograms { - public: - OpenCLDevice *device; - OpenCLProgram program_split; - OpenCLProgram program_lamp_emission; - OpenCLProgram program_do_volume; - OpenCLProgram program_indirect_background; - OpenCLProgram program_shader_eval; - OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; - OpenCLProgram program_subsurface_scatter; - OpenCLProgram program_direct_lighting; - OpenCLProgram program_shadow_blocked_ao; - OpenCLProgram program_shadow_blocked_dl; - - OpenCLSplitPrograms(OpenCLDevice *device); - ~OpenCLSplitPrograms(); - - /* Load the kernels and put the created kernels in the given `programs` - * paramter. */ - void load_kernels(vector<OpenCLProgram*> &programs, - const DeviceRequestedFeatures& requested_features, - bool is_preview=false); - }; - - DeviceSplitKernel *split_kernel; - - OpenCLProgram base_program; - OpenCLProgram bake_program; - OpenCLProgram displace_program; - OpenCLProgram background_program; - OpenCLProgram denoising_program; - - OpenCLSplitPrograms kernel_programs; - OpenCLSplitPrograms preview_programs; - - typedef map<string, device_vector<uchar>*> ConstMemMap; - typedef map<string, device_ptr> MemMap; - - ConstMemMap const_mem_map; - MemMap mem_map; - device_ptr null_mem; - - bool device_initialized; - string platform_name; - string device_name; - - bool opencl_error(cl_int err); - void opencl_error(const string& message); - void opencl_assert_err(cl_int err, const char* where); - - OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background); - ~OpenCLDevice(); - - static void CL_CALLBACK context_notify_callback(const char *err_info, - const void * /*private_info*/, size_t /*cb*/, void *user_data); - - bool opencl_version_check(); - 
OpenCLSplitPrograms* get_split_programs(); - - string device_md5_hash(string kernel_custom_build_options = ""); - bool load_kernels(const DeviceRequestedFeatures& requested_features); - void load_required_kernels(const DeviceRequestedFeatures& requested_features); - void load_preview_kernels(); - - bool wait_for_availability(const DeviceRequestedFeatures& requested_features); - DeviceKernelStatus get_active_kernel_switch_state(); - - /* Get the name of the opencl program for the given kernel */ - const string get_opencl_program_name(const string& kernel_name); - /* Get the program file name to compile (*.cl) for the given kernel */ - const string get_opencl_program_filename(const string& kernel_name); - string get_build_options(const DeviceRequestedFeatures& requested_features, - const string& opencl_program_name, - bool preview_kernel=false); - /* Enable the default features to reduce recompilation events */ - void enable_default_features(DeviceRequestedFeatures& features); - - void mem_alloc(device_memory& mem); - void mem_copy_to(device_memory& mem); - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem); - void mem_zero(device_memory& mem); - void mem_free(device_memory& mem); - - int mem_sub_ptr_alignment(); - - void const_copy_to(const char *name, void *host, size_t size); - void tex_alloc(device_memory& mem); - void tex_free(device_memory& mem); - - size_t global_size_round_up(int group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, - bool x_workgroups = false, - size_t max_workgroup_size = -1); - void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); - void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); - - void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half); - void shader(DeviceTask& task); - - void denoise(RenderTile& tile, DenoisingTask& denoising); - - class OpenCLDeviceTask : public DeviceTask { - public: - 
OpenCLDeviceTask(OpenCLDevice *device, DeviceTask& task) - : DeviceTask(task) - { - run = function_bind(&OpenCLDevice::thread_run, - device, - this); - } - }; - - int get_split_task_count(DeviceTask& /*task*/) - { - return 1; - } - - void task_add(DeviceTask& task) - { - task_pool.push(new OpenCLDeviceTask(this, task)); - } - - void task_wait() - { - task_pool.wait(); - } - - void task_cancel() - { - task_pool.cancel(); - } - - void thread_run(DeviceTask *task); - - virtual BVHLayoutMask get_bvh_layout_mask() const { - return BVH_LAYOUT_BVH2; - } - - virtual bool show_samples() const { - return true; - } - - -protected: - string kernel_build_options(const string *debug_src = NULL); - - void mem_zero_kernel(device_ptr ptr, size_t size); - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task); - bool denoising_construct_transform(DenoisingTask *task); - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task); - bool denoising_solve(device_ptr output_ptr, - DenoisingTask *task); - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, int4 rect, - DenoisingTask *task); - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task); - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task); - bool denoising_write_feature(int to_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task); - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task); - - device_ptr 
mem_alloc_sub_ptr(device_memory& mem, int offset, int size); - void mem_free_sub_ptr(device_ptr ptr); - - class ArgumentWrapper { - public: - ArgumentWrapper() : size(0), pointer(NULL) - { - } - - ArgumentWrapper(device_memory& argument) : size(sizeof(void*)), - pointer((void*)(&argument.device_pointer)) - { - } - - template<typename T> - ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)), - pointer((void*)(&argument.device_pointer)) - { - } - - template<typename T> - ArgumentWrapper(device_only_memory<T>& argument) : size(sizeof(void*)), - pointer((void*)(&argument.device_pointer)) - { - } - template<typename T> - ArgumentWrapper(T& argument) : size(sizeof(argument)), - pointer(&argument) - { - } - - ArgumentWrapper(int argument) : size(sizeof(int)), - int_value(argument), - pointer(&int_value) - { - } - - ArgumentWrapper(float argument) : size(sizeof(float)), - float_value(argument), - pointer(&float_value) - { - } - - size_t size; - int int_value; - float float_value; - void *pointer; - }; - - /* TODO(sergey): In the future we can use variadic templates, once - * C++0x is allowed. Should allow to clean this up a bit. 
- */ - int kernel_set_args(cl_kernel kernel, - int start_argument_index, - const ArgumentWrapper& arg1 = ArgumentWrapper(), - const ArgumentWrapper& arg2 = ArgumentWrapper(), - const ArgumentWrapper& arg3 = ArgumentWrapper(), - const ArgumentWrapper& arg4 = ArgumentWrapper(), - const ArgumentWrapper& arg5 = ArgumentWrapper(), - const ArgumentWrapper& arg6 = ArgumentWrapper(), - const ArgumentWrapper& arg7 = ArgumentWrapper(), - const ArgumentWrapper& arg8 = ArgumentWrapper(), - const ArgumentWrapper& arg9 = ArgumentWrapper(), - const ArgumentWrapper& arg10 = ArgumentWrapper(), - const ArgumentWrapper& arg11 = ArgumentWrapper(), - const ArgumentWrapper& arg12 = ArgumentWrapper(), - const ArgumentWrapper& arg13 = ArgumentWrapper(), - const ArgumentWrapper& arg14 = ArgumentWrapper(), - const ArgumentWrapper& arg15 = ArgumentWrapper(), - const ArgumentWrapper& arg16 = ArgumentWrapper(), - const ArgumentWrapper& arg17 = ArgumentWrapper(), - const ArgumentWrapper& arg18 = ArgumentWrapper(), - const ArgumentWrapper& arg19 = ArgumentWrapper(), - const ArgumentWrapper& arg20 = ArgumentWrapper(), - const ArgumentWrapper& arg21 = ArgumentWrapper(), - const ArgumentWrapper& arg22 = ArgumentWrapper(), - const ArgumentWrapper& arg23 = ArgumentWrapper(), - const ArgumentWrapper& arg24 = ArgumentWrapper(), - const ArgumentWrapper& arg25 = ArgumentWrapper(), - const ArgumentWrapper& arg26 = ArgumentWrapper(), - const ArgumentWrapper& arg27 = ArgumentWrapper(), - const ArgumentWrapper& arg28 = ArgumentWrapper(), - const ArgumentWrapper& arg29 = ArgumentWrapper(), - const ArgumentWrapper& arg30 = ArgumentWrapper(), - const ArgumentWrapper& arg31 = ArgumentWrapper(), - const ArgumentWrapper& arg32 = ArgumentWrapper(), - const ArgumentWrapper& arg33 = ArgumentWrapper()); - - void release_kernel_safe(cl_kernel kernel); - void release_mem_object_safe(cl_mem mem); - void release_program_safe(cl_program program); - - /* ** Those guys are for workign around some compiler-specific bugs ** */ 
- - cl_program load_cached_kernel( - ustring key, - thread_scoped_lock& cache_locker); - - void store_cached_kernel( - cl_program program, - ustring key, - thread_scoped_lock& cache_locker); - -private: - MemoryManager memory_manager; - friend class MemoryManager; - - static_assert_align(TextureInfo, 16); - device_vector<TextureInfo> texture_info; - - typedef map<string, device_memory*> TexturesMap; - TexturesMap textures; - - bool textures_need_update; - -protected: - void flush_texture_buffers(); - - friend class OpenCLSplitKernel; - friend class OpenCLSplitKernelFunction; +# define opencl_device_assert(device, stmt) \ + { \ + cl_int err = stmt; \ +\ + if (err != CL_SUCCESS) { \ + string message = string_printf( \ + "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ + if ((device)->error_message() == "") \ + (device)->set_error(message); \ + fprintf(stderr, "%s\n", message.c_str()); \ + } \ + } \ + (void)0 + +# define opencl_assert(stmt) \ + { \ + cl_int err = stmt; \ +\ + if (err != CL_SUCCESS) { \ + string message = string_printf( \ + "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ + if (error_msg == "") \ + error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + } \ + } \ + (void)0 + +class OpenCLDevice : public Device { + public: + DedicatedTaskPool task_pool; + + /* Task pool for required kernels (base, AO kernels during foreground rendering) */ + TaskPool load_required_kernel_task_pool; + /* Task pool for optional kernels (feature kernels during foreground rendering) */ + TaskPool load_kernel_task_pool; + cl_context cxContext; + cl_command_queue cqCommandQueue; + cl_platform_id cpPlatform; + cl_device_id cdDevice; + cl_int ciErr; + int device_num; + bool use_preview_kernels; + + class OpenCLProgram { + public: + OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL) + { + } + OpenCLProgram(OpenCLDevice *device, + const string &program_name, + 
const string &kernel_name, + const string &kernel_build_options, + bool use_stdout = true); + ~OpenCLProgram(); + + void add_kernel(ustring name); + + /* Try to load the program from device cache or disk */ + bool load(); + /* Compile the kernel (first separate, failback to local) */ + void compile(); + /* Create the OpenCL kernels after loading or compiling */ + void create_kernels(); + + bool is_loaded() const + { + return loaded; + } + const string &get_log() const + { + return log; + } + void report_error(); + + /* Wait until this kernel is available to be used + * It will return true when the kernel is available. + * It will return false when the kernel is not available + * or could not be loaded. */ + bool wait_for_availability(); + + cl_kernel operator()(); + cl_kernel operator()(ustring name); + + void release(); + + private: + bool build_kernel(const string *debug_src); + /* Build the program by calling the own process. + * This is required for multithreaded OpenCL compilation, since most Frameworks serialize + * build calls internally if they come from the same process. + * If that is not supported, this function just returns false. + */ + bool compile_separate(const string &clbin); + /* Build the program by calling OpenCL directly. */ + bool compile_kernel(const string *debug_src); + /* Loading and saving the program from/to disk. */ + bool load_binary(const string &clbin, const string *debug_src = NULL); + bool save_binary(const string &clbin); + + void add_log(const string &msg, bool is_debug); + void add_error(const string &msg); + + bool loaded; + bool needs_compiling; + + cl_program program; + OpenCLDevice *device; + + /* Used for the OpenCLCache key. */ + string program_name; + + string kernel_file, kernel_build_options, device_md5; + + bool use_stdout; + string log, error_msg; + string compile_output; + + map<ustring, cl_kernel> kernels; + }; + + /* Container for all types of split programs. 
*/ + class OpenCLSplitPrograms { + public: + OpenCLDevice *device; + OpenCLProgram program_split; + OpenCLProgram program_lamp_emission; + OpenCLProgram program_do_volume; + OpenCLProgram program_indirect_background; + OpenCLProgram program_shader_eval; + OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; + OpenCLProgram program_subsurface_scatter; + OpenCLProgram program_direct_lighting; + OpenCLProgram program_shadow_blocked_ao; + OpenCLProgram program_shadow_blocked_dl; + + OpenCLSplitPrograms(OpenCLDevice *device); + ~OpenCLSplitPrograms(); + + /* Load the kernels and put the created kernels in the given `programs` + * paramter. */ + void load_kernels(vector<OpenCLProgram *> &programs, + const DeviceRequestedFeatures &requested_features, + bool is_preview = false); + }; + + DeviceSplitKernel *split_kernel; + + OpenCLProgram base_program; + OpenCLProgram bake_program; + OpenCLProgram displace_program; + OpenCLProgram background_program; + OpenCLProgram denoising_program; + + OpenCLSplitPrograms kernel_programs; + OpenCLSplitPrograms preview_programs; + + typedef map<string, device_vector<uchar> *> ConstMemMap; + typedef map<string, device_ptr> MemMap; + + ConstMemMap const_mem_map; + MemMap mem_map; + device_ptr null_mem; + + bool device_initialized; + string platform_name; + string device_name; + + bool opencl_error(cl_int err); + void opencl_error(const string &message); + void opencl_assert_err(cl_int err, const char *where); + + OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); + ~OpenCLDevice(); + + static void CL_CALLBACK context_notify_callback(const char *err_info, + const void * /*private_info*/, + size_t /*cb*/, + void *user_data); + + bool opencl_version_check(); + OpenCLSplitPrograms *get_split_programs(); + + string device_md5_hash(string kernel_custom_build_options = ""); + bool load_kernels(const DeviceRequestedFeatures &requested_features); + void load_required_kernels(const 
DeviceRequestedFeatures &requested_features); + void load_preview_kernels(); + + bool wait_for_availability(const DeviceRequestedFeatures &requested_features); + DeviceKernelStatus get_active_kernel_switch_state(); + + /* Get the name of the opencl program for the given kernel */ + const string get_opencl_program_name(const string &kernel_name); + /* Get the program file name to compile (*.cl) for the given kernel */ + const string get_opencl_program_filename(const string &kernel_name); + string get_build_options(const DeviceRequestedFeatures &requested_features, + const string &opencl_program_name, + bool preview_kernel = false); + /* Enable the default features to reduce recompilation events */ + void enable_default_features(DeviceRequestedFeatures &features); + + void mem_alloc(device_memory &mem); + void mem_copy_to(device_memory &mem); + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem); + void mem_zero(device_memory &mem); + void mem_free(device_memory &mem); + + int mem_sub_ptr_alignment(); + + void const_copy_to(const char *name, void *host, size_t size); + void tex_alloc(device_memory &mem); + void tex_free(device_memory &mem); + + size_t global_size_round_up(int group_size, int global_size); + void enqueue_kernel(cl_kernel kernel, + size_t w, + size_t h, + bool x_workgroups = false, + size_t max_workgroup_size = -1); + void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); + void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); + + void film_convert(DeviceTask &task, + device_ptr buffer, + device_ptr rgba_byte, + device_ptr rgba_half); + void shader(DeviceTask &task); + + void denoise(RenderTile &tile, DenoisingTask &denoising); + + class OpenCLDeviceTask : public DeviceTask { + public: + OpenCLDeviceTask(OpenCLDevice *device, DeviceTask &task) : DeviceTask(task) + { + run = function_bind(&OpenCLDevice::thread_run, device, this); + } + }; + + int get_split_task_count(DeviceTask & /*task*/) + { + return 1; + 
} + + void task_add(DeviceTask &task) + { + task_pool.push(new OpenCLDeviceTask(this, task)); + } + + void task_wait() + { + task_pool.wait(); + } + + void task_cancel() + { + task_pool.cancel(); + } + + void thread_run(DeviceTask *task); + + virtual BVHLayoutMask get_bvh_layout_mask() const + { + return BVH_LAYOUT_BVH2; + } + + virtual bool show_samples() const + { + return true; + } + + protected: + string kernel_build_options(const string *debug_src = NULL); + + void mem_zero_kernel(device_ptr ptr, size_t size); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task); + bool denoising_construct_transform(DenoisingTask *task); + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task); + bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect, + DenoisingTask *task); + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task); + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale, + DenoisingTask *task); + bool denoising_write_feature(int to_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task); + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task); + + device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int size); + void mem_free_sub_ptr(device_ptr ptr); + + class ArgumentWrapper { + public: + ArgumentWrapper() : size(0), pointer(NULL) + { + } + + ArgumentWrapper(device_memory &argument) + : 
size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) + { + } + + template<typename T> + ArgumentWrapper(device_vector<T> &argument) + : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) + { + } + + template<typename T> + ArgumentWrapper(device_only_memory<T> &argument) + : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) + { + } + template<typename T> ArgumentWrapper(T &argument) : size(sizeof(argument)), pointer(&argument) + { + } + + ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), pointer(&int_value) + { + } + + ArgumentWrapper(float argument) + : size(sizeof(float)), float_value(argument), pointer(&float_value) + { + } + + size_t size; + int int_value; + float float_value; + void *pointer; + }; + + /* TODO(sergey): In the future we can use variadic templates, once + * C++0x is allowed. Should allow to clean this up a bit. + */ + int kernel_set_args(cl_kernel kernel, + int start_argument_index, + const ArgumentWrapper &arg1 = ArgumentWrapper(), + const ArgumentWrapper &arg2 = ArgumentWrapper(), + const ArgumentWrapper &arg3 = ArgumentWrapper(), + const ArgumentWrapper &arg4 = ArgumentWrapper(), + const ArgumentWrapper &arg5 = ArgumentWrapper(), + const ArgumentWrapper &arg6 = ArgumentWrapper(), + const ArgumentWrapper &arg7 = ArgumentWrapper(), + const ArgumentWrapper &arg8 = ArgumentWrapper(), + const ArgumentWrapper &arg9 = ArgumentWrapper(), + const ArgumentWrapper &arg10 = ArgumentWrapper(), + const ArgumentWrapper &arg11 = ArgumentWrapper(), + const ArgumentWrapper &arg12 = ArgumentWrapper(), + const ArgumentWrapper &arg13 = ArgumentWrapper(), + const ArgumentWrapper &arg14 = ArgumentWrapper(), + const ArgumentWrapper &arg15 = ArgumentWrapper(), + const ArgumentWrapper &arg16 = ArgumentWrapper(), + const ArgumentWrapper &arg17 = ArgumentWrapper(), + const ArgumentWrapper &arg18 = ArgumentWrapper(), + const ArgumentWrapper &arg19 = ArgumentWrapper(), + const ArgumentWrapper &arg20 = 
ArgumentWrapper(), + const ArgumentWrapper &arg21 = ArgumentWrapper(), + const ArgumentWrapper &arg22 = ArgumentWrapper(), + const ArgumentWrapper &arg23 = ArgumentWrapper(), + const ArgumentWrapper &arg24 = ArgumentWrapper(), + const ArgumentWrapper &arg25 = ArgumentWrapper(), + const ArgumentWrapper &arg26 = ArgumentWrapper(), + const ArgumentWrapper &arg27 = ArgumentWrapper(), + const ArgumentWrapper &arg28 = ArgumentWrapper(), + const ArgumentWrapper &arg29 = ArgumentWrapper(), + const ArgumentWrapper &arg30 = ArgumentWrapper(), + const ArgumentWrapper &arg31 = ArgumentWrapper(), + const ArgumentWrapper &arg32 = ArgumentWrapper(), + const ArgumentWrapper &arg33 = ArgumentWrapper()); + + void release_kernel_safe(cl_kernel kernel); + void release_mem_object_safe(cl_mem mem); + void release_program_safe(cl_program program); + + /* ** Those guys are for workign around some compiler-specific bugs ** */ + + cl_program load_cached_kernel(ustring key, thread_scoped_lock &cache_locker); + + void store_cached_kernel(cl_program program, ustring key, thread_scoped_lock &cache_locker); + + private: + MemoryManager memory_manager; + friend class MemoryManager; + + static_assert_align(TextureInfo, 16); + device_vector<TextureInfo> texture_info; + + typedef map<string, device_memory *> TexturesMap; + TexturesMap textures; + + bool textures_need_update; + + protected: + void flush_texture_buffers(); + + friend class OpenCLSplitKernel; + friend class OpenCLSplitKernelFunction; }; -Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background); +Device *opencl_create_split_device(DeviceInfo &info, + Stats &stats, + Profiler &profiler, + bool background); CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index 489d10b7087..70b1a643044 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -16,273 +16,278 @@ 
#ifdef WITH_OPENCL -#include "device/opencl/opencl.h" +# include "device/opencl/opencl.h" -#include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data_types.h" +# include "kernel/kernel_types.h" +# include "kernel/split/kernel_split_data_types.h" -#include "util/util_algorithm.h" -#include "util/util_debug.h" -#include "util/util_foreach.h" -#include "util/util_logging.h" -#include "util/util_md5.h" -#include "util/util_path.h" -#include "util/util_time.h" +# include "util/util_algorithm.h" +# include "util/util_debug.h" +# include "util/util_foreach.h" +# include "util/util_logging.h" +# include "util/util_md5.h" +# include "util/util_path.h" +# include "util/util_time.h" CCL_NAMESPACE_BEGIN struct texture_slot_t { - texture_slot_t(const string& name, int slot) - : name(name), - slot(slot) { - } - string name; - int slot; + texture_slot_t(const string &name, int slot) : name(name), slot(slot) + { + } + string name; + int slot; }; static const string NON_SPLIT_KERNELS = - "denoising " - "base " - "background " - "displace "; + "denoising " + "base " + "background " + "displace "; static const string SPLIT_BUNDLE_KERNELS = - "data_init " - "path_init " - "state_buffer_size " - "scene_intersect " - "queue_enqueue " - "shader_setup " - "shader_sort " - "enqueue_inactive " - "next_iteration_setup " - "indirect_subsurface " - "buffer_update"; - -const string OpenCLDevice::get_opencl_program_name(const string& kernel_name) + "data_init " + "path_init " + "state_buffer_size " + "scene_intersect " + "queue_enqueue " + "shader_setup " + "shader_sort " + "enqueue_inactive " + "next_iteration_setup " + "indirect_subsurface " + "buffer_update"; + +const string OpenCLDevice::get_opencl_program_name(const string &kernel_name) { - if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) { - return kernel_name; - } - else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { - return "split_bundle"; - } - else { - return "split_" + kernel_name; - } 
+ if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) { + return kernel_name; + } + else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { + return "split_bundle"; + } + else { + return "split_" + kernel_name; + } } -const string OpenCLDevice::get_opencl_program_filename(const string& kernel_name) +const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name) { - if (kernel_name == "denoising") { - return "filter.cl"; - } - else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { - return "kernel_split_bundle.cl"; - } - else { - return "kernel_" + kernel_name + ".cl"; - } + if (kernel_name == "denoising") { + return "filter.cl"; + } + else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { + return "kernel_split_bundle.cl"; + } + else { + return "kernel_" + kernel_name + ".cl"; + } } /* Enable features that we always want to compile to reduce recompilation events */ -void OpenCLDevice::enable_default_features(DeviceRequestedFeatures& features) +void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features) { - features.use_transparent = true; - features.use_shadow_tricks = true; - features.use_principled = true; - features.use_denoising = true; - - if (!background) - { - features.max_nodes_group = NODE_GROUP_LEVEL_MAX; - features.nodes_features = NODE_FEATURE_ALL; - features.use_hair = true; - features.use_subsurface = true; - features.use_camera_motion = false; - features.use_object_motion = false; - } + features.use_transparent = true; + features.use_shadow_tricks = true; + features.use_principled = true; + features.use_denoising = true; + + if (!background) { + features.max_nodes_group = NODE_GROUP_LEVEL_MAX; + features.nodes_features = NODE_FEATURE_ALL; + features.use_hair = true; + features.use_subsurface = true; + features.use_camera_motion = false; + features.use_object_motion = false; + } } -string OpenCLDevice::get_build_options(const DeviceRequestedFeatures& 
requested_features, const string& opencl_program_name, bool preview_kernel) +string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features, + const string &opencl_program_name, + bool preview_kernel) { - /* first check for non-split kernel programs */ - if (opencl_program_name == "base" || opencl_program_name == "denoising") { - return ""; - } - else if (opencl_program_name == "bake") { - /* Note: get_build_options for bake is only requested when baking is enabled. - * displace and background are always requested. - * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_denoising = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_hair = true; - features.use_subsurface = true; - features.max_nodes_group = NODE_GROUP_LEVEL_MAX; - features.nodes_features = NODE_FEATURE_ALL; - features.use_integrator_branched = false; - return features.get_build_options(); - } - else if (opencl_program_name == "displace") { - /* As displacement does not use any nodes from the Shading group (eg BSDF). - * We disable all features that are related to shading. 
*/ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_denoising = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_baking = false; - features.use_transparent = false; - features.use_shadow_tricks = false; - features.use_subsurface = false; - features.use_volume = false; - features.nodes_features &= ~NODE_FEATURE_VOLUME; - features.use_denoising = false; - features.use_principled = false; - features.use_integrator_branched = false; - return features.get_build_options(); - } - else if (opencl_program_name == "background") { - /* Background uses Background shading - * It is save to disable shadow features, subsurface and volumetric. */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_baking = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_transparent = false; - features.use_shadow_tricks = false; - features.use_denoising = false; - /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node. - * Perhaps we should remove them in UI as it does not make any sense when - * rendering background. */ - features.nodes_features &= ~NODE_FEATURE_VOLUME; - features.use_subsurface = false; - features.use_volume = false; - features.use_shader_raytrace = false; - features.use_patch_evaluation = false; - features.use_integrator_branched = false; - return features.get_build_options(); - } - - string build_options = "-D__SPLIT_KERNEL__ "; - /* Set compute device build option. 
*/ - cl_device_type device_type; - OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr); - assert(this->ciErr == CL_SUCCESS); - if(device_type == CL_DEVICE_TYPE_GPU) { - build_options += "-D__COMPUTE_DEVICE_GPU__ "; - } - - DeviceRequestedFeatures nofeatures; - enable_default_features(nofeatures); - - /* Add program specific optimized compile directives */ - if (preview_kernel) { - DeviceRequestedFeatures preview_features; - preview_features.use_hair = true; - build_options += "-D__KERNEL_AO_PREVIEW__ "; - build_options += preview_features.get_build_options(); - } - else if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) { - build_options += nofeatures.get_build_options(); - } - else { - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - - /* Always turn off baking at this point. Baking is only usefull when building the bake kernel. - * this also makes sure that the kernels that are build during baking can be reused - * when not doing any baking. */ - features.use_baking = false; - - /* Do not vary on shaders when program doesn't do any shading. - * We have bundled them in a single program. */ - if (opencl_program_name == "split_bundle") { - features.max_nodes_group = 0; - features.nodes_features = 0; - features.use_shader_raytrace = false; - } - - /* No specific settings, just add the regular ones */ - build_options += features.get_build_options(); - } - - return build_options; + /* first check for non-split kernel programs */ + if (opencl_program_name == "base" || opencl_program_name == "denoising") { + return ""; + } + else if (opencl_program_name == "bake") { + /* Note: get_build_options for bake is only requested when baking is enabled. + * displace and background are always requested. 
+ * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */ + DeviceRequestedFeatures features(requested_features); + enable_default_features(features); + features.use_denoising = false; + features.use_object_motion = false; + features.use_camera_motion = false; + features.use_hair = true; + features.use_subsurface = true; + features.max_nodes_group = NODE_GROUP_LEVEL_MAX; + features.nodes_features = NODE_FEATURE_ALL; + features.use_integrator_branched = false; + return features.get_build_options(); + } + else if (opencl_program_name == "displace") { + /* As displacement does not use any nodes from the Shading group (eg BSDF). + * We disable all features that are related to shading. */ + DeviceRequestedFeatures features(requested_features); + enable_default_features(features); + features.use_denoising = false; + features.use_object_motion = false; + features.use_camera_motion = false; + features.use_baking = false; + features.use_transparent = false; + features.use_shadow_tricks = false; + features.use_subsurface = false; + features.use_volume = false; + features.nodes_features &= ~NODE_FEATURE_VOLUME; + features.use_denoising = false; + features.use_principled = false; + features.use_integrator_branched = false; + return features.get_build_options(); + } + else if (opencl_program_name == "background") { + /* Background uses Background shading + * It is save to disable shadow features, subsurface and volumetric. */ + DeviceRequestedFeatures features(requested_features); + enable_default_features(features); + features.use_baking = false; + features.use_object_motion = false; + features.use_camera_motion = false; + features.use_transparent = false; + features.use_shadow_tricks = false; + features.use_denoising = false; + /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node. + * Perhaps we should remove them in UI as it does not make any sense when + * rendering background. 
*/ + features.nodes_features &= ~NODE_FEATURE_VOLUME; + features.use_subsurface = false; + features.use_volume = false; + features.use_shader_raytrace = false; + features.use_patch_evaluation = false; + features.use_integrator_branched = false; + return features.get_build_options(); + } + + string build_options = "-D__SPLIT_KERNEL__ "; + /* Set compute device build option. */ + cl_device_type device_type; + OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr); + assert(this->ciErr == CL_SUCCESS); + if (device_type == CL_DEVICE_TYPE_GPU) { + build_options += "-D__COMPUTE_DEVICE_GPU__ "; + } + + DeviceRequestedFeatures nofeatures; + enable_default_features(nofeatures); + + /* Add program specific optimized compile directives */ + if (preview_kernel) { + DeviceRequestedFeatures preview_features; + preview_features.use_hair = true; + build_options += "-D__KERNEL_AO_PREVIEW__ "; + build_options += preview_features.get_build_options(); + } + else if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) { + build_options += nofeatures.get_build_options(); + } + else { + DeviceRequestedFeatures features(requested_features); + enable_default_features(features); + + /* Always turn off baking at this point. Baking is only usefull when building the bake kernel. + * this also makes sure that the kernels that are build during baking can be reused + * when not doing any baking. */ + features.use_baking = false; + + /* Do not vary on shaders when program doesn't do any shading. + * We have bundled them in a single program. 
*/ + if (opencl_program_name == "split_bundle") { + features.max_nodes_group = 0; + features.nodes_features = 0; + features.use_shader_raytrace = false; + } + + /* No specific settings, just add the regular ones */ + build_options += features.get_build_options(); + } + + return build_options; } OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_) { - device = device_; + device = device_; } OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms() { - program_split.release(); - program_lamp_emission.release(); - program_do_volume.release(); - program_indirect_background.release(); - program_shader_eval.release(); - program_holdout_emission_blurring_pathtermination_ao.release(); - program_subsurface_scatter.release(); - program_direct_lighting.release(); - program_shadow_blocked_ao.release(); - program_shadow_blocked_dl.release(); + program_split.release(); + program_lamp_emission.release(); + program_do_volume.release(); + program_indirect_background.release(); + program_shader_eval.release(); + program_holdout_emission_blurring_pathtermination_ao.release(); + program_subsurface_scatter.release(); + program_direct_lighting.release(); + program_shadow_blocked_ao.release(); + program_shadow_blocked_dl.release(); } -void OpenCLDevice::OpenCLSplitPrograms::load_kernels(vector<OpenCLProgram*> &programs, const DeviceRequestedFeatures& requested_features, bool is_preview) +void OpenCLDevice::OpenCLSplitPrograms::load_kernels( + vector<OpenCLProgram *> &programs, + const DeviceRequestedFeatures &requested_features, + bool is_preview) { - if (!requested_features.use_baking) { -#define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) program_split.add_kernel(ustring("path_trace_"#kernel_name)); -#define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \ - const string program_name_##kernel_name = "split_"#kernel_name; \ - program_##kernel_name = \ - OpenCLDevice::OpenCLProgram(device, \ - program_name_##kernel_name, \ - "kernel_"#kernel_name".cl", \ - 
device->get_build_options(requested_features, program_name_##kernel_name, is_preview)); \ - program_##kernel_name.add_kernel(ustring("path_trace_"#kernel_name)); \ - programs.push_back(&program_##kernel_name); - - /* Ordered with most complex kernels first, to reduce overall compile time. */ - ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter); - if (requested_features.use_volume || is_preview) { - ADD_SPLIT_KERNEL_PROGRAM(do_volume); - } - ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl); - ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao); - ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao); - ADD_SPLIT_KERNEL_PROGRAM(lamp_emission); - ADD_SPLIT_KERNEL_PROGRAM(direct_lighting); - ADD_SPLIT_KERNEL_PROGRAM(indirect_background); - ADD_SPLIT_KERNEL_PROGRAM(shader_eval); - - /* Quick kernels bundled in a single program to reduce overhead of starting - * Blender processes. */ - program_split = OpenCLDevice::OpenCLProgram(device, - "split_bundle" , - "kernel_split_bundle.cl", - device->get_build_options(requested_features, "split_bundle", is_preview)); - - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update); - programs.push_back(&program_split); - -#undef ADD_SPLIT_KERNEL_PROGRAM -#undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM - } + if (!requested_features.use_baking) { +# define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \ + program_split.add_kernel(ustring("path_trace_" #kernel_name)); +# define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \ + const string 
program_name_##kernel_name = "split_" #kernel_name; \ + program_##kernel_name = OpenCLDevice::OpenCLProgram( \ + device, \ + program_name_##kernel_name, \ + "kernel_" #kernel_name ".cl", \ + device->get_build_options(requested_features, program_name_##kernel_name, is_preview)); \ + program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \ + programs.push_back(&program_##kernel_name); + + /* Ordered with most complex kernels first, to reduce overall compile time. */ + ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter); + if (requested_features.use_volume || is_preview) { + ADD_SPLIT_KERNEL_PROGRAM(do_volume); + } + ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl); + ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao); + ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao); + ADD_SPLIT_KERNEL_PROGRAM(lamp_emission); + ADD_SPLIT_KERNEL_PROGRAM(direct_lighting); + ADD_SPLIT_KERNEL_PROGRAM(indirect_background); + ADD_SPLIT_KERNEL_PROGRAM(shader_eval); + + /* Quick kernels bundled in a single program to reduce overhead of starting + * Blender processes. 
*/ + program_split = OpenCLDevice::OpenCLProgram( + device, + "split_bundle", + "kernel_split_bundle.cl", + device->get_build_options(requested_features, "split_bundle", is_preview)); + + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update); + programs.push_back(&program_split); + +# undef ADD_SPLIT_KERNEL_PROGRAM +# undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM + } } namespace { @@ -291,1126 +296,1108 @@ namespace { * fetch its size. */ typedef struct KernelGlobalsDummy { - ccl_constant KernelData *data; - ccl_global char *buffers[8]; + ccl_constant KernelData *data; + ccl_global char *buffers[8]; -#define KERNEL_TEX(type, name) \ - TextureInfo name; +# define KERNEL_TEX(type, name) TextureInfo name; # include "kernel/kernel_textures.h" -#undef KERNEL_TEX - SplitData split_data; - SplitParams split_param_data; +# undef KERNEL_TEX + SplitData split_data; + SplitParams split_param_data; } KernelGlobalsDummy; } // namespace - struct CachedSplitMemory { - int id; - device_memory *split_data; - device_memory *ray_state; - device_memory *queue_index; - device_memory *use_queues_flag; - device_memory *work_pools; - device_ptr *buffer; + int id; + device_memory *split_data; + device_memory *ray_state; + device_memory *queue_index; + device_memory *use_queues_flag; + device_memory *work_pools; + device_ptr *buffer; }; class OpenCLSplitKernelFunction : public SplitKernelFunction { -public: - OpenCLDevice* device; - OpenCLDevice::OpenCLProgram program; - CachedSplitMemory& cached_memory; 
- int cached_id; - - OpenCLSplitKernelFunction(OpenCLDevice* device, CachedSplitMemory& cached_memory) : - device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1) - { - } - - ~OpenCLSplitKernelFunction() - { - program.release(); - } - - virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) - { - if(cached_id != cached_memory.id) { - cl_uint start_arg_index = - device->kernel_set_args(program(), - 0, - kg, - data, - *cached_memory.split_data, - *cached_memory.ray_state); - - device->set_kernel_arg_buffers(program(), &start_arg_index); - - start_arg_index += - device->kernel_set_args(program(), - start_arg_index, - *cached_memory.queue_index, - *cached_memory.use_queues_flag, - *cached_memory.work_pools, - *cached_memory.buffer); - - cached_id = cached_memory.id; - } - - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - program(), - 2, - NULL, - dim.global_size, - dim.local_size, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - if(device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return false; - } - - return true; - } + public: + OpenCLDevice *device; + OpenCLDevice::OpenCLProgram program; + CachedSplitMemory &cached_memory; + int cached_id; + + OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory) + : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1) + { + } + + ~OpenCLSplitKernelFunction() + { + program.release(); + } + + virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) + { + if (cached_id != cached_memory.id) { + cl_uint start_arg_index = device->kernel_set_args( + program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state); + + device->set_kernel_arg_buffers(program(), &start_arg_index); + + 
start_arg_index += device->kernel_set_args(program(), + start_arg_index, + *cached_memory.queue_index, + *cached_memory.use_queues_flag, + *cached_memory.work_pools, + *cached_memory.buffer); + + cached_id = cached_memory.id; + } + + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + program(), + 2, + NULL, + dim.global_size, + dim.local_size, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + if (device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return false; + } + + return true; + } }; class OpenCLSplitKernel : public DeviceSplitKernel { - OpenCLDevice *device; - CachedSplitMemory cached_memory; -public: - explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device) { - } - - virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures& requested_features) - { - OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory); - - const string program_name = device->get_opencl_program_name(kernel_name); - kernel->program = - OpenCLDevice::OpenCLProgram(device, - program_name, - device->get_opencl_program_filename(kernel_name), - device->get_build_options(requested_features, - program_name, - device->use_preview_kernels)); - - kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); - kernel->program.load(); - - if(!kernel->program.is_loaded()) { - delete kernel; - return NULL; - } - - return kernel; - } - - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) - { - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); - - uint threads = num_threads; - OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); - cl_kernel 
kernel_state_buffer_size = programs->program_split(ustring("path_trace_state_buffer_size")); - device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer); - - size_t global_size = 64; - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - kernel_state_buffer_size, - 1, - NULL, - &global_size, - NULL, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); - - if(device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return 0; - } - - return size; - } - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs - ) - { - cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; - - /* Set the range of samples to be processed for every ray in - * path-regeneration logic. 
- */ - cl_int start_sample = rtile.start_sample; - cl_int end_sample = rtile.start_sample + rtile.num_samples; - - OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); - cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init")); - - cl_uint start_arg_index = - device->kernel_set_args(kernel_data_init, - 0, - kernel_globals, - kernel_data, - split_data, - num_global_elements, - ray_state); - - device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index); - - start_arg_index += - device->kernel_set_args(kernel_data_init, - start_arg_index, - start_sample, - end_sample, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - queue_index, - dQueue_size, - use_queues_flag, - work_pool_wgs, - rtile.num_samples, - rtile.buffer); - - /* Enqueue ckPathTraceKernel_data_init kernel. */ - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - kernel_data_init, - 2, - NULL, - dim.global_size, - dim.local_size, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - if(device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return false; - } - - cached_memory.split_data = &split_data; - cached_memory.ray_state = &ray_state; - cached_memory.queue_index = &queue_index; - cached_memory.use_queues_flag = &use_queues_flag; - cached_memory.work_pools = &work_pool_wgs; - cached_memory.buffer = &rtile.buffer; - cached_memory.id++; - - return true; - } - - virtual int2 split_kernel_local_size() - { - return make_int2(64, 1); - } - - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/) - { - cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); - /* Use small global size on CPU devices as it seems to be much faster. 
*/ - if(type == CL_DEVICE_TYPE_CPU) { - VLOG(1) << "Global size: (64, 64)."; - return make_int2(64, 64); - } - - cl_ulong max_buffer_size; - clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - - if(DebugFlags().opencl.mem_limit) { - max_buffer_size = min(max_buffer_size, - cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); - } - - VLOG(1) << "Maximum device allocation size: " - << string_human_readable_number(max_buffer_size) << " bytes. (" - << string_human_readable_size(max_buffer_size) << ")."; - - /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */ - max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l*1024*1024*1024); - - size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size); - int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements)); - VLOG(1) << "Global size: " << global_size << "."; - return global_size; - } + OpenCLDevice *device; + CachedSplitMemory cached_memory; + + public: + explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device) + { + } + + virtual SplitKernelFunction *get_split_kernel_function( + const string &kernel_name, const DeviceRequestedFeatures &requested_features) + { + OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory); + + const string program_name = device->get_opencl_program_name(kernel_name); + kernel->program = OpenCLDevice::OpenCLProgram( + device, + program_name, + device->get_opencl_program_filename(kernel_name), + device->get_build_options(requested_features, program_name, device->use_preview_kernels)); + + kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); + kernel->program.load(); + + if (!kernel->program.is_loaded()) { + delete kernel; + return NULL; + } + + return kernel; + } + + virtual uint64_t state_buffer_size(device_memory &kg, device_memory 
&data, size_t num_threads) + { + device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); + size_buffer.alloc(1); + size_buffer.zero_to_device(); + + uint threads = num_threads; + OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); + cl_kernel kernel_state_buffer_size = programs->program_split( + ustring("path_trace_state_buffer_size")); + device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer); + + size_t global_size = 64; + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + kernel_state_buffer_size, + 1, + NULL, + &global_size, + NULL, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + size_buffer.copy_from_device(0, 1, 1); + size_t size = size_buffer[0]; + size_buffer.free(); + + if (device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return 0; + } + + return size; + } + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory &kernel_globals, + device_memory &kernel_data, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs) + { + cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; + + /* Set the range of samples to be processed for every ray in + * path-regeneration logic. 
+ */ + cl_int start_sample = rtile.start_sample; + cl_int end_sample = rtile.start_sample + rtile.num_samples; + + OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); + cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init")); + + cl_uint start_arg_index = device->kernel_set_args(kernel_data_init, + 0, + kernel_globals, + kernel_data, + split_data, + num_global_elements, + ray_state); + + device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index); + + start_arg_index += device->kernel_set_args(kernel_data_init, + start_arg_index, + start_sample, + end_sample, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + queue_index, + dQueue_size, + use_queues_flag, + work_pool_wgs, + rtile.num_samples, + rtile.buffer); + + /* Enqueue ckPathTraceKernel_data_init kernel. */ + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + kernel_data_init, + 2, + NULL, + dim.global_size, + dim.local_size, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + if (device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return false; + } + + cached_memory.split_data = &split_data; + cached_memory.ray_state = &ray_state; + cached_memory.queue_index = &queue_index; + cached_memory.use_queues_flag = &use_queues_flag; + cached_memory.work_pools = &work_pool_wgs; + cached_memory.buffer = &rtile.buffer; + cached_memory.id++; + + return true; + } + + virtual int2 split_kernel_local_size() + { + return make_int2(64, 1); + } + + virtual int2 split_kernel_global_size(device_memory &kg, + device_memory &data, + DeviceTask * /*task*/) + { + cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); + /* Use small global size on CPU devices as it seems to be much faster. 
*/ + if (type == CL_DEVICE_TYPE_CPU) { + VLOG(1) << "Global size: (64, 64)."; + return make_int2(64, 64); + } + + cl_ulong max_buffer_size; + clGetDeviceInfo( + device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); + + if (DebugFlags().opencl.mem_limit) { + max_buffer_size = min(max_buffer_size, + cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); + } + + VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size) + << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; + + /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */ + max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024); + + size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size); + int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), + (int)sqrt(num_elements)); + VLOG(1) << "Global size: " << global_size << "."; + return global_size; + } }; bool OpenCLDevice::opencl_error(cl_int err) { - if(err != CL_SUCCESS) { - string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err)); - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - return true; - } - - return false; + if (err != CL_SUCCESS) { + string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err)); + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); + return true; + } + + return false; } -void OpenCLDevice::opencl_error(const string& message) +void OpenCLDevice::opencl_error(const string &message) { - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); } -void OpenCLDevice::opencl_assert_err(cl_int err, const char* where) +void OpenCLDevice::opencl_assert_err(cl_int err, const char *where) { - 
if(err != CL_SUCCESS) { - string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where); - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); -#ifndef NDEBUG - abort(); -#endif - } + if (err != CL_SUCCESS) { + string message = string_printf( + "OpenCL error (%d): %s in %s", err, clewErrorString(err), where); + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); +# ifndef NDEBUG + abort(); +# endif + } } -OpenCLDevice::OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background) -: Device(info, stats, profiler, background), - kernel_programs(this), - preview_programs(this), - memory_manager(this), - texture_info(this, "__texture_info", MEM_TEXTURE) +OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) + : Device(info, stats, profiler, background), + kernel_programs(this), + preview_programs(this), + memory_manager(this), + texture_info(this, "__texture_info", MEM_TEXTURE) { - cpPlatform = NULL; - cdDevice = NULL; - cxContext = NULL; - cqCommandQueue = NULL; - null_mem = 0; - device_initialized = false; - textures_need_update = true; - use_preview_kernels = !background; - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if(usable_devices.size() == 0) { - opencl_error("OpenCL: no devices found."); - return; - } - assert(info.num < usable_devices.size()); - OpenCLPlatformDevice& platform_device = usable_devices[info.num]; - device_num = info.num; - cpPlatform = platform_device.platform_id; - cdDevice = platform_device.device_id; - platform_name = platform_device.platform_name; - device_name = platform_device.device_name; - VLOG(2) << "Creating new Cycles device for OpenCL platform " - << platform_name << ", device " - << device_name << "."; - - { - /* try to use cached context */ - thread_scoped_lock cache_locker; - cxContext = 
OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker); - - if(cxContext == NULL) { - /* create context properties array to specify platform */ - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, - 0, 0 - }; - - /* create context */ - cxContext = clCreateContext(context_props, 1, &cdDevice, - context_notify_callback, cdDevice, &ciErr); - - if(opencl_error(ciErr)) { - opencl_error("OpenCL: clCreateContext failed"); - return; - } - - /* cache it */ - OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker); - } - } - - cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); - if(opencl_error(ciErr)) { - opencl_error("OpenCL: Error creating command queue"); - return; - } - - null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr); - if(opencl_error(ciErr)) { - opencl_error("OpenCL: Error creating memory buffer for NULL"); - return; - } - - /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */ - texture_info.resize(1); - memory_manager.alloc("texture_info", texture_info); - - device_initialized = true; - - split_kernel = new OpenCLSplitKernel(this); - if (!background) { - load_preview_kernels(); - } + cpPlatform = NULL; + cdDevice = NULL; + cxContext = NULL; + cqCommandQueue = NULL; + null_mem = 0; + device_initialized = false; + textures_need_update = true; + use_preview_kernels = !background; + + vector<OpenCLPlatformDevice> usable_devices; + OpenCLInfo::get_usable_devices(&usable_devices); + if (usable_devices.size() == 0) { + opencl_error("OpenCL: no devices found."); + return; + } + assert(info.num < usable_devices.size()); + OpenCLPlatformDevice &platform_device = usable_devices[info.num]; + device_num = info.num; + cpPlatform = platform_device.platform_id; + cdDevice = platform_device.device_id; + platform_name = platform_device.platform_name; + device_name = platform_device.device_name; + 
VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device " + << device_name << "."; + + { + /* try to use cached context */ + thread_scoped_lock cache_locker; + cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker); + + if (cxContext == NULL) { + /* create context properties array to specify platform */ + const cl_context_properties context_props[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0}; + + /* create context */ + cxContext = clCreateContext( + context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr); + + if (opencl_error(ciErr)) { + opencl_error("OpenCL: clCreateContext failed"); + return; + } + + /* cache it */ + OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker); + } + } + + cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); + if (opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating command queue"); + return; + } + + null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr); + if (opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating memory buffer for NULL"); + return; + } + + /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */ + texture_info.resize(1); + memory_manager.alloc("texture_info", texture_info); + + device_initialized = true; + + split_kernel = new OpenCLSplitKernel(this); + if (!background) { + load_preview_kernels(); + } } OpenCLDevice::~OpenCLDevice() { - task_pool.stop(); - load_required_kernel_task_pool.stop(); - load_kernel_task_pool.stop(); + task_pool.stop(); + load_required_kernel_task_pool.stop(); + load_kernel_task_pool.stop(); - memory_manager.free(); + memory_manager.free(); - if(null_mem) - clReleaseMemObject(CL_MEM_PTR(null_mem)); + if (null_mem) + clReleaseMemObject(CL_MEM_PTR(null_mem)); - ConstMemMap::iterator mt; - for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { - delete mt->second; - 
} + ConstMemMap::iterator mt; + for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { + delete mt->second; + } - base_program.release(); - bake_program.release(); - displace_program.release(); - background_program.release(); - denoising_program.release(); + base_program.release(); + bake_program.release(); + displace_program.release(); + background_program.release(); + denoising_program.release(); - if(cqCommandQueue) - clReleaseCommandQueue(cqCommandQueue); - if(cxContext) - clReleaseContext(cxContext); + if (cqCommandQueue) + clReleaseCommandQueue(cqCommandQueue); + if (cxContext) + clReleaseContext(cxContext); - delete split_kernel; + delete split_kernel; } void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info, - const void * /*private_info*/, size_t /*cb*/, void *user_data) + const void * /*private_info*/, + size_t /*cb*/, + void *user_data) { - string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); - fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); + string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); + fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); } bool OpenCLDevice::opencl_version_check() { - string error; - if(!OpenCLInfo::platform_version_check(cpPlatform, &error)) { - opencl_error(error); - return false; - } - if(!OpenCLInfo::device_version_check(cdDevice, &error)) { - opencl_error(error); - return false; - } - return true; + string error; + if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) { + opencl_error(error); + return false; + } + if (!OpenCLInfo::device_version_check(cdDevice, &error)) { + opencl_error(error); + return false; + } + return true; } string OpenCLDevice::device_md5_hash(string kernel_custom_build_options) { - MD5Hash md5; - char version[256], driver[256], name[256], vendor[256]; + MD5Hash md5; + char version[256], driver[256], name[256], vendor[256]; - clGetPlatformInfo(cpPlatform, 
CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL); - clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL); + clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL); + clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); + clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL); + clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL); - md5.append((uint8_t*)vendor, strlen(vendor)); - md5.append((uint8_t*)version, strlen(version)); - md5.append((uint8_t*)name, strlen(name)); - md5.append((uint8_t*)driver, strlen(driver)); + md5.append((uint8_t *)vendor, strlen(vendor)); + md5.append((uint8_t *)version, strlen(version)); + md5.append((uint8_t *)name, strlen(name)); + md5.append((uint8_t *)driver, strlen(driver)); - string options = kernel_build_options(); - options += kernel_custom_build_options; - md5.append((uint8_t*)options.c_str(), options.size()); + string options = kernel_build_options(); + options += kernel_custom_build_options; + md5.append((uint8_t *)options.c_str(), options.size()); - return md5.get_hex(); + return md5.get_hex(); } -bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures& requested_features) +bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features) { - VLOG(2) << "Loading kernels for platform " << platform_name - << ", device " << device_name << "."; - /* Verify if device was initialized. */ - if(!device_initialized) { - fprintf(stderr, "OpenCL: failed to initialize device.\n"); - return false; - } - - /* Verify we have right opencl version. 
*/ - if(!opencl_version_check()) - return false; - - load_required_kernels(requested_features); - - vector<OpenCLProgram*> programs; - kernel_programs.load_kernels(programs, requested_features, false); - - if (!requested_features.use_baking && requested_features.use_denoising) { - denoising_program = OpenCLProgram(this, "denoising", "filter.cl", get_build_options(requested_features, "denoising")); - denoising_program.add_kernel(ustring("filter_divide_shadow")); - denoising_program.add_kernel(ustring("filter_get_feature")); - denoising_program.add_kernel(ustring("filter_write_feature")); - denoising_program.add_kernel(ustring("filter_detect_outliers")); - denoising_program.add_kernel(ustring("filter_combine_halves")); - denoising_program.add_kernel(ustring("filter_construct_transform")); - denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); - denoising_program.add_kernel(ustring("filter_nlm_blur")); - denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); - denoising_program.add_kernel(ustring("filter_nlm_update_output")); - denoising_program.add_kernel(ustring("filter_nlm_normalize")); - denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); - denoising_program.add_kernel(ustring("filter_finalize")); - programs.push_back(&denoising_program); - } - - load_required_kernel_task_pool.wait_work(); - - /* Parallel compilation of Cycles kernels, this launches multiple - * processes to workaround OpenCL frameworks serializing the calls - * internally within a single process. */ - foreach(OpenCLProgram *program, programs) { - if (!program->load()) { - load_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); - } - } - return true; + VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << "."; + /* Verify if device was initialized. 
*/ + if (!device_initialized) { + fprintf(stderr, "OpenCL: failed to initialize device.\n"); + return false; + } + + /* Verify we have right opencl version. */ + if (!opencl_version_check()) + return false; + + load_required_kernels(requested_features); + + vector<OpenCLProgram *> programs; + kernel_programs.load_kernels(programs, requested_features, false); + + if (!requested_features.use_baking && requested_features.use_denoising) { + denoising_program = OpenCLProgram( + this, "denoising", "filter.cl", get_build_options(requested_features, "denoising")); + denoising_program.add_kernel(ustring("filter_divide_shadow")); + denoising_program.add_kernel(ustring("filter_get_feature")); + denoising_program.add_kernel(ustring("filter_write_feature")); + denoising_program.add_kernel(ustring("filter_detect_outliers")); + denoising_program.add_kernel(ustring("filter_combine_halves")); + denoising_program.add_kernel(ustring("filter_construct_transform")); + denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); + denoising_program.add_kernel(ustring("filter_nlm_blur")); + denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); + denoising_program.add_kernel(ustring("filter_nlm_update_output")); + denoising_program.add_kernel(ustring("filter_nlm_normalize")); + denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); + denoising_program.add_kernel(ustring("filter_finalize")); + programs.push_back(&denoising_program); + } + + load_required_kernel_task_pool.wait_work(); + + /* Parallel compilation of Cycles kernels, this launches multiple + * processes to workaround OpenCL frameworks serializing the calls + * internally within a single process. 
*/ + foreach (OpenCLProgram *program, programs) { + if (!program->load()) { + load_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); + } + } + return true; } -void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures& requested_features) +void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features) { - vector<OpenCLProgram*> programs; - base_program = OpenCLProgram(this, "base", "kernel_base.cl", get_build_options(requested_features, "base")); - base_program.add_kernel(ustring("convert_to_byte")); - base_program.add_kernel(ustring("convert_to_half_float")); - base_program.add_kernel(ustring("zero_buffer")); - programs.push_back(&base_program); - - if (requested_features.use_true_displacement) { - displace_program = OpenCLProgram(this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace")); - displace_program.add_kernel(ustring("displace")); - programs.push_back(&displace_program); - } - - if (requested_features.use_background_light) { - background_program = OpenCLProgram(this, "background", "kernel_background.cl", get_build_options(requested_features, "background")); - background_program.add_kernel(ustring("background")); - programs.push_back(&background_program); - } - - if (requested_features.use_baking) { - bake_program = OpenCLProgram(this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake")); - bake_program.add_kernel(ustring("bake")); - programs.push_back(&bake_program); - } - - foreach(OpenCLProgram *program, programs) { - if (!program->load()) { - load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); - } - } + vector<OpenCLProgram *> programs; + base_program = OpenCLProgram( + this, "base", "kernel_base.cl", get_build_options(requested_features, "base")); + base_program.add_kernel(ustring("convert_to_byte")); + base_program.add_kernel(ustring("convert_to_half_float")); + 
base_program.add_kernel(ustring("zero_buffer")); + programs.push_back(&base_program); + + if (requested_features.use_true_displacement) { + displace_program = OpenCLProgram( + this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace")); + displace_program.add_kernel(ustring("displace")); + programs.push_back(&displace_program); + } + + if (requested_features.use_background_light) { + background_program = OpenCLProgram(this, + "background", + "kernel_background.cl", + get_build_options(requested_features, "background")); + background_program.add_kernel(ustring("background")); + programs.push_back(&background_program); + } + + if (requested_features.use_baking) { + bake_program = OpenCLProgram( + this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake")); + bake_program.add_kernel(ustring("bake")); + programs.push_back(&bake_program); + } + + foreach (OpenCLProgram *program, programs) { + if (!program->load()) { + load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); + } + } } void OpenCLDevice::load_preview_kernels() { - DeviceRequestedFeatures no_features; - vector<OpenCLProgram*> programs; - preview_programs.load_kernels(programs, no_features, true); - - foreach(OpenCLProgram *program, programs) { - if (!program->load()) { - load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); - } - } + DeviceRequestedFeatures no_features; + vector<OpenCLProgram *> programs; + preview_programs.load_kernels(programs, no_features, true); + + foreach (OpenCLProgram *program, programs) { + if (!program->load()) { + load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); + } + } } -bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures& requested_features) +bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features) { - if (background) { - load_kernel_task_pool.wait_work(); - use_preview_kernels 
= false; - } - else { - /* We use a device setting to determine to load preview kernels or not - * Better to check on device level than per kernel as mixing preview and - * non-preview kernels does not work due to different data types */ - if (use_preview_kernels) { - use_preview_kernels = !load_kernel_task_pool.finished(); - } - } - return split_kernel->load_kernels(requested_features); + if (background) { + load_kernel_task_pool.wait_work(); + use_preview_kernels = false; + } + else { + /* We use a device setting to determine to load preview kernels or not + * Better to check on device level than per kernel as mixing preview and + * non-preview kernels does not work due to different data types */ + if (use_preview_kernels) { + use_preview_kernels = !load_kernel_task_pool.finished(); + } + } + return split_kernel->load_kernels(requested_features); } -OpenCLDevice::OpenCLSplitPrograms* OpenCLDevice::get_split_programs() +OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs() { - return use_preview_kernels?&preview_programs:&kernel_programs; + return use_preview_kernels ? 
&preview_programs : &kernel_programs; } DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state() { - /* Do not switch kernels for background renderings - * We do foreground rendering but use the preview kernels - * Check for the optimized kernels - * - * This works also the other way around, where we are using - * optimized kernels but new ones are being compiled due - * to other features that are needed */ - if (background) { - /* The if-statements below would find the same result, - * But as the `finished` method uses a mutex we added - * this as an early exit */ - return DEVICE_KERNEL_USING_FEATURE_KERNEL; - } - - bool other_kernels_finished = load_kernel_task_pool.finished(); - if (use_preview_kernels) { - if (other_kernels_finished) { - return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE; - } - else { - return DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL; - } - } - else { - if (other_kernels_finished) { - return DEVICE_KERNEL_USING_FEATURE_KERNEL; - } - else { - return DEVICE_KERNEL_FEATURE_KERNEL_INVALID; - } - } + /* Do not switch kernels for background renderings + * We do foreground rendering but use the preview kernels + * Check for the optimized kernels + * + * This works also the other way around, where we are using + * optimized kernels but new ones are being compiled due + * to other features that are needed */ + if (background) { + /* The if-statements below would find the same result, + * But as the `finished` method uses a mutex we added + * this as an early exit */ + return DEVICE_KERNEL_USING_FEATURE_KERNEL; + } + + bool other_kernels_finished = load_kernel_task_pool.finished(); + if (use_preview_kernels) { + if (other_kernels_finished) { + return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE; + } + else { + return DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL; + } + } + else { + if (other_kernels_finished) { + return DEVICE_KERNEL_USING_FEATURE_KERNEL; + } + else { + return DEVICE_KERNEL_FEATURE_KERNEL_INVALID; + } + } } -void 
OpenCLDevice::mem_alloc(device_memory& mem) +void OpenCLDevice::mem_alloc(device_memory &mem) { - if(mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - size_t size = mem.memory_size(); - - /* check there is enough memory available for the allocation */ - cl_ulong max_alloc_size = 0; - clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); - - if(DebugFlags().opencl.mem_limit) { - max_alloc_size = min(max_alloc_size, - cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); - } - - if(size > max_alloc_size) { - string error = "Scene too complex to fit in available memory."; - if(mem.name != NULL) { - error += string_printf(" (allocating buffer %s failed.)", mem.name); - } - set_error(error); - - return; - } - - cl_mem_flags mem_flag; - void *mem_ptr = NULL; - - if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - /* Zero-size allocation might be invoked by render, but not really - * supported by OpenCL. Using NULL as device pointer also doesn't really - * work for some reason, so for the time being we'll use special case - * will null_mem buffer. - */ - if(size != 0) { - mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, - mem_flag, - size, - mem_ptr, - &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer"); - } - else { - mem.device_pointer = null_mem; - } - - stats.mem_alloc(size); - mem.device_size = size; + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + size_t size = mem.memory_size(); + + /* check there is enough memory available for the allocation */ + cl_ulong max_alloc_size = 0; + clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); + + if (DebugFlags().opencl.mem_limit) { + max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); + } + + if (size > max_alloc_size) { + string error = "Scene too complex to fit in available memory."; + if (mem.name != NULL) { + error += string_printf(" (allocating buffer %s failed.)", mem.name); + } + set_error(error); + + return; + } + + cl_mem_flags mem_flag; + void *mem_ptr = NULL; + + if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) + mem_flag = CL_MEM_READ_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + /* Zero-size allocation might be invoked by render, but not really + * supported by OpenCL. Using NULL as device pointer also doesn't really + * work for some reason, so for the time being we'll use special case + * will null_mem buffer. 
+ */ + if (size != 0) { + mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer"); + } + else { + mem.device_pointer = null_mem; + } + + stats.mem_alloc(size); + mem.device_size = size; } -void OpenCLDevice::mem_copy_to(device_memory& mem) +void OpenCLDevice::mem_copy_to(device_memory &mem) { - if(mem.type == MEM_TEXTURE) { - tex_free(mem); - tex_alloc(mem); - } - else { - if(!mem.device_pointer) { - mem_alloc(mem); - } - - /* this is blocking */ - size_t size = mem.memory_size(); - if(size != 0) { - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - size, - mem.host_pointer, - 0, - NULL, NULL)); - } - } + if (mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + /* this is blocking */ + size_t size = mem.memory_size(); + if (size != 0) { + opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + 0, + size, + mem.host_pointer, + 0, + NULL, + NULL)); + } + } } -void OpenCLDevice::mem_copy_from(device_memory& mem, int y, int w, int h, int elem) +void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) { - size_t offset = elem*y*w; - size_t size = elem*w*h; - assert(size != 0); - opencl_assert(clEnqueueReadBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - offset, - size, - (uchar*)mem.host_pointer + offset, - 0, - NULL, NULL)); + size_t offset = elem * y * w; + size_t size = elem * w * h; + assert(size != 0); + opencl_assert(clEnqueueReadBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + offset, + size, + (uchar *)mem.host_pointer + offset, + 0, + NULL, + NULL)); } void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size) { - base_program.wait_for_availability(); - cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - - size_t global_size[] 
= {1024, 1024}; - size_t num_threads = global_size[0] * global_size[1]; - - cl_mem d_buffer = CL_MEM_PTR(mem); - cl_ulong d_offset = 0; - cl_ulong d_size = 0; - - while(d_offset < size) { - d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset); - - kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); - - ciErr = clEnqueueNDRangeKernel(cqCommandQueue, - ckZeroBuffer, - 2, - NULL, - global_size, - NULL, - 0, - NULL, - NULL); - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); - - d_offset += d_size; - } + base_program.wait_for_availability(); + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); + + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; + + cl_mem d_buffer = CL_MEM_PTR(mem); + cl_ulong d_offset = 0; + cl_ulong d_size = 0; + + while (d_offset < size) { + d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset); + + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + + ciErr = clEnqueueNDRangeKernel( + cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + + d_offset += d_size; + } } -void OpenCLDevice::mem_zero(device_memory& mem) +void OpenCLDevice::mem_zero(device_memory &mem) { - if(!mem.device_pointer) { - mem_alloc(mem); - } - - if(mem.device_pointer) { - if(base_program.is_loaded()) { - mem_zero_kernel(mem.device_pointer, mem.memory_size()); - } - - if(mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } - - if(!base_program.is_loaded()) { - void* zero = mem.host_pointer; - - if(!mem.host_pointer) { - zero = util_aligned_malloc(mem.memory_size(), 16); - memset(zero, 0, mem.memory_size()); - } - - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - mem.memory_size(), - zero, - 0, - NULL, NULL)); - - if(!mem.host_pointer) { - util_aligned_free(zero); - } - } - } + if (!mem.device_pointer) { + 
mem_alloc(mem); + } + + if (mem.device_pointer) { + if (base_program.is_loaded()) { + mem_zero_kernel(mem.device_pointer, mem.memory_size()); + } + + if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } + + if (!base_program.is_loaded()) { + void *zero = mem.host_pointer; + + if (!mem.host_pointer) { + zero = util_aligned_malloc(mem.memory_size(), 16); + memset(zero, 0, mem.memory_size()); + } + + opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + 0, + mem.memory_size(), + zero, + 0, + NULL, + NULL)); + + if (!mem.host_pointer) { + util_aligned_free(zero); + } + } + } } -void OpenCLDevice::mem_free(device_memory& mem) +void OpenCLDevice::mem_free(device_memory &mem) { - if(mem.type == MEM_TEXTURE) { - tex_free(mem); - } - else { - if(mem.device_pointer) { - if(mem.device_pointer != null_mem) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); - } - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } + if (mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else { + if (mem.device_pointer) { + if (mem.device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); + } + mem.device_pointer = 0; + + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + } } int OpenCLDevice::mem_sub_ptr_alignment() { - return OpenCLInfo::mem_sub_ptr_alignment(cdDevice); + return OpenCLInfo::mem_sub_ptr_alignment(cdDevice); } -device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory& mem, int offset, int size) +device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size) { - cl_mem_flags mem_flag; - if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - cl_buffer_region info; - info.origin = mem.memory_elements_size(offset); - info.size = mem.memory_elements_size(size); - - device_ptr sub_buf = (device_ptr) 
clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer), - mem_flag, - CL_BUFFER_CREATE_TYPE_REGION, - &info, - &ciErr); - opencl_assert_err(ciErr, "clCreateSubBuffer"); - return sub_buf; + cl_mem_flags mem_flag; + if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) + mem_flag = CL_MEM_READ_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + cl_buffer_region info; + info.origin = mem.memory_elements_size(offset); + info.size = mem.memory_elements_size(size); + + device_ptr sub_buf = (device_ptr)clCreateSubBuffer( + CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr); + opencl_assert_err(ciErr, "clCreateSubBuffer"); + return sub_buf; } void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer) { - if(device_pointer && device_pointer != null_mem) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); - } + if (device_pointer && device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); + } } void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size) { - ConstMemMap::iterator i = const_mem_map.find(name); - device_vector<uchar> *data; - - if(i == const_mem_map.end()) { - data = new device_vector<uchar>(this, name, MEM_READ_ONLY); - data->alloc(size); - const_mem_map.insert(ConstMemMap::value_type(name, data)); - } - else { - data = i->second; - } - - memcpy(data->data(), host, size); - data->copy_to_device(); + ConstMemMap::iterator i = const_mem_map.find(name); + device_vector<uchar> *data; + + if (i == const_mem_map.end()) { + data = new device_vector<uchar>(this, name, MEM_READ_ONLY); + data->alloc(size); + const_mem_map.insert(ConstMemMap::value_type(name, data)); + } + else { + data = i->second; + } + + memcpy(data->data(), host, size); + data->copy_to_device(); } -void OpenCLDevice::tex_alloc(device_memory& mem) +void OpenCLDevice::tex_alloc(device_memory &mem) { - VLOG(1) << "Texture allocate: " << mem.name << ", " - << 
string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - memory_manager.alloc(mem.name, mem); - /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */ - mem.device_pointer = 1; - textures[mem.name] = &mem; - textures_need_update = true; + VLOG(1) << "Texture allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + memory_manager.alloc(mem.name, mem); + /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */ + mem.device_pointer = 1; + textures[mem.name] = &mem; + textures_need_update = true; } -void OpenCLDevice::tex_free(device_memory& mem) +void OpenCLDevice::tex_free(device_memory &mem) { - if(mem.device_pointer) { - mem.device_pointer = 0; - - if(memory_manager.free(mem)) { - textures_need_update = true; - } - - foreach(TexturesMap::value_type& value, textures) { - if(value.second == &mem) { - textures.erase(value.first); - break; - } - } - } + if (mem.device_pointer) { + mem.device_pointer = 0; + + if (memory_manager.free(mem)) { + textures_need_update = true; + } + + foreach (TexturesMap::value_type &value, textures) { + if (value.second == &mem) { + textures.erase(value.first); + break; + } + } + } } size_t OpenCLDevice::global_size_round_up(int group_size, int global_size) { - int r = global_size % group_size; - return global_size + ((r == 0)? 0: group_size - r); + int r = global_size % group_size; + return global_size + ((r == 0) ? 
0 : group_size - r); } -void OpenCLDevice::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) +void OpenCLDevice::enqueue_kernel( + cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) { - size_t workgroup_size, max_work_items[3]; - - clGetKernelWorkGroupInfo(kernel, cdDevice, - CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); - clGetDeviceInfo(cdDevice, - CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); - - if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { - workgroup_size = max_workgroup_size; - } - - /* Try to divide evenly over 2 dimensions. */ - size_t local_size[2]; - if(x_workgroups) { - local_size[0] = workgroup_size; - local_size[1] = 1; - } - else { - size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); - local_size[0] = local_size[1] = sqrt_workgroup_size; - } - - /* Some implementations have max size 1 on 2nd dimension. */ - if(local_size[1] > max_work_items[1]) { - local_size[0] = workgroup_size/max_work_items[1]; - local_size[1] = max_work_items[1]; - } - - size_t global_size[2] = {global_size_round_up(local_size[0], w), - global_size_round_up(local_size[1], h)}; - - /* Vertical size of 1 is coming from bake/shade kernels where we should - * not round anything up because otherwise we'll either be doing too - * much work per pixel (if we don't check global ID on Y axis) or will - * be checking for global ID to always have Y of 0. 
- */ - if(h == 1) { - global_size[h] = 1; - } - - /* run kernel */ - opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); - opencl_assert(clFlush(cqCommandQueue)); + size_t workgroup_size, max_work_items[3]; + + clGetKernelWorkGroupInfo( + kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); + clGetDeviceInfo( + cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL); + + if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { + workgroup_size = max_workgroup_size; + } + + /* Try to divide evenly over 2 dimensions. */ + size_t local_size[2]; + if (x_workgroups) { + local_size[0] = workgroup_size; + local_size[1] = 1; + } + else { + size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); + local_size[0] = local_size[1] = sqrt_workgroup_size; + } + + /* Some implementations have max size 1 on 2nd dimension. */ + if (local_size[1] > max_work_items[1]) { + local_size[0] = workgroup_size / max_work_items[1]; + local_size[1] = max_work_items[1]; + } + + size_t global_size[2] = {global_size_round_up(local_size[0], w), + global_size_round_up(local_size[1], h)}; + + /* Vertical size of 1 is coming from bake/shade kernels where we should + * not round anything up because otherwise we'll either be doing too + * much work per pixel (if we don't check global ID on Y axis) or will + * be checking for global ID to always have Y of 0. 
+ */ + if (h == 1) { + global_size[h] = 1; + } + + /* run kernel */ + opencl_assert( + clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); + opencl_assert(clFlush(cqCommandQueue)); } void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name) { - cl_mem ptr; - - MemMap::iterator i = mem_map.find(name); - if(i != mem_map.end()) { - ptr = CL_MEM_PTR(i->second); - } - else { - /* work around NULL not working, even though the spec says otherwise */ - ptr = CL_MEM_PTR(null_mem); - } - - opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr)); + cl_mem ptr; + + MemMap::iterator i = mem_map.find(name); + if (i != mem_map.end()) { + ptr = CL_MEM_PTR(i->second); + } + else { + /* work around NULL not working, even though the spec says otherwise */ + ptr = CL_MEM_PTR(null_mem); + } + + opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr)); } void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) { - flush_texture_buffers(); + flush_texture_buffers(); - memory_manager.set_kernel_arg_buffers(kernel, narg); + memory_manager.set_kernel_arg_buffers(kernel, narg); } void OpenCLDevice::flush_texture_buffers() { - if(!textures_need_update) { - return; - } - textures_need_update = false; - - /* Setup slots for textures. 
*/ - int num_slots = 0; - - vector<texture_slot_t> texture_slots; - -#define KERNEL_TEX(type, name) \ - if(textures.find(#name) != textures.end()) { \ - texture_slots.push_back(texture_slot_t(#name, num_slots)); \ - } \ - num_slots++; -#include "kernel/kernel_textures.h" - - int num_data_slots = num_slots; - - foreach(TexturesMap::value_type& tex, textures) { - string name = tex.first; - - if(string_startswith(name, "__tex_image")) { - int pos = name.rfind("_"); - int id = atoi(name.data() + pos + 1); - texture_slots.push_back(texture_slot_t(name, - num_data_slots + id)); - num_slots = max(num_slots, num_data_slots + id + 1); - } - } - - /* Realloc texture descriptors buffer. */ - memory_manager.free(texture_info); - texture_info.resize(num_slots); - memory_manager.alloc("texture_info", texture_info); - - /* Fill in descriptors */ - foreach(texture_slot_t& slot, texture_slots) { - TextureInfo& info = texture_info[slot.slot]; - - MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); - info.data = desc.offset; - info.cl_buffer = desc.device_buffer; - - if(string_startswith(slot.name, "__tex_image")) { - device_memory *mem = textures[slot.name]; - - info.width = mem->data_width; - info.height = mem->data_height; - info.depth = mem->data_depth; - - info.interpolation = mem->interpolation; - info.extension = mem->extension; - } - } - - /* Force write of descriptors. */ - memory_manager.free(texture_info); - memory_manager.alloc("texture_info", texture_info); -} + if (!textures_need_update) { + return; + } + textures_need_update = false; + + /* Setup slots for textures. 
*/ + int num_slots = 0; + + vector<texture_slot_t> texture_slots; + +# define KERNEL_TEX(type, name) \ + if (textures.find(#name) != textures.end()) { \ + texture_slots.push_back(texture_slot_t(#name, num_slots)); \ + } \ + num_slots++; +# include "kernel/kernel_textures.h" + + int num_data_slots = num_slots; + + foreach (TexturesMap::value_type &tex, textures) { + string name = tex.first; + + if (string_startswith(name, "__tex_image")) { + int pos = name.rfind("_"); + int id = atoi(name.data() + pos + 1); + texture_slots.push_back(texture_slot_t(name, num_data_slots + id)); + num_slots = max(num_slots, num_data_slots + id + 1); + } + } + + /* Realloc texture descriptors buffer. */ + memory_manager.free(texture_info); + texture_info.resize(num_slots); + memory_manager.alloc("texture_info", texture_info); + + /* Fill in descriptors */ + foreach (texture_slot_t &slot, texture_slots) { + TextureInfo &info = texture_info[slot.slot]; + + MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); + info.data = desc.offset; + info.cl_buffer = desc.device_buffer; + if (string_startswith(slot.name, "__tex_image")) { + device_memory *mem = textures[slot.name]; + + info.width = mem->data_width; + info.height = mem->data_height; + info.depth = mem->data_depth; + + info.interpolation = mem->interpolation; + info.extension = mem->extension; + } + } + + /* Force write of descriptors. 
*/ + memory_manager.free(texture_info); + memory_manager.alloc("texture_info", texture_info); +} void OpenCLDevice::thread_run(DeviceTask *task) { - flush_texture_buffers(); - - if(task->type == DeviceTask::FILM_CONVERT) { - film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); - } - else if(task->type == DeviceTask::SHADER) { - shader(*task); - } - else if(task->type == DeviceTask::RENDER) { - RenderTile tile; - DenoisingTask denoising(this, *task); - - /* Allocate buffer for kernel globals */ - device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); - kgbuffer.alloc_to_device(1); - - /* Keep rendering tiles until done. */ - while(task->acquire_tile(this, tile)) { - if(tile.task == RenderTile::PATH_TRACE) { - assert(tile.task == RenderTile::PATH_TRACE); - scoped_timer timer(&tile.buffers->render_time); - - split_kernel->path_trace(task, - tile, - kgbuffer, - *const_mem_map["__data"]); - - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. 
- */ - clFinish(cqCommandQueue); - } - else if(tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - denoise(tile, denoising); - task->update_progress(&tile, tile.w*tile.h); - } - - task->release_tile(tile); - } - - kgbuffer.free(); - } + flush_texture_buffers(); + + if (task->type == DeviceTask::FILM_CONVERT) { + film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); + } + else if (task->type == DeviceTask::SHADER) { + shader(*task); + } + else if (task->type == DeviceTask::RENDER) { + RenderTile tile; + DenoisingTask denoising(this, *task); + + /* Allocate buffer for kernel globals */ + device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); + kgbuffer.alloc_to_device(1); + + /* Keep rendering tiles until done. */ + while (task->acquire_tile(this, tile)) { + if (tile.task == RenderTile::PATH_TRACE) { + assert(tile.task == RenderTile::PATH_TRACE); + scoped_timer timer(&tile.buffers->render_time); + + split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]); + + /* Complete kernel execution before release tile. */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. 
+ */ + clFinish(cqCommandQueue); + } + else if (tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, denoising); + task->update_progress(&tile, tile.w * tile.h); + } + + task->release_tile(tile); + } + + kgbuffer.free(); + } } -void OpenCLDevice::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) +void OpenCLDevice::film_convert(DeviceTask &task, + device_ptr buffer, + device_ptr rgba_byte, + device_ptr rgba_half) { - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half); - cl_mem d_buffer = CL_MEM_PTR(buffer); - cl_int d_x = task.x; - cl_int d_y = task.y; - cl_int d_w = task.w; - cl_int d_h = task.h; - cl_float d_sample_scale = 1.0f/(task.sample + 1); - cl_int d_offset = task.offset; - cl_int d_stride = task.stride; - - - cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float")); - - cl_uint start_arg_index = - kernel_set_args(ckFilmConvertKernel, - 0, - d_data, - d_rgba, - d_buffer); - - set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); - - start_arg_index += kernel_set_args(ckFilmConvertKernel, - start_arg_index, - d_sample_scale, - d_x, - d_y, - d_w, - d_h, - d_offset, - d_stride); - - enqueue_kernel(ckFilmConvertKernel, d_w, d_h); + /* cast arguments to cl types */ + cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); + cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half); + cl_mem d_buffer = CL_MEM_PTR(buffer); + cl_int d_x = task.x; + cl_int d_y = task.y; + cl_int d_w = task.w; + cl_int d_h = task.h; + cl_float d_sample_scale = 1.0f / (task.sample + 1); + cl_int d_offset = task.offset; + cl_int d_stride = task.stride; + + cl_kernel ckFilmConvertKernel = (rgba_byte) ? 
base_program(ustring("convert_to_byte")) : + base_program(ustring("convert_to_half_float")); + + cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer); + + set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); + + start_arg_index += kernel_set_args(ckFilmConvertKernel, + start_arg_index, + d_sample_scale, + d_x, + d_y, + d_w, + d_h, + d_offset, + d_stride); + + enqueue_kernel(ckFilmConvertKernel, d_w, d_h); } bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr, @@ -1419,123 +1406,119 @@ bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr, device_ptr out_ptr, DenoisingTask *task) { - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2*r+1)*(2*r+1); - int channel_offset = task->nlm_state.is_color? 
task->buffer.pass_stride : 0; - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); - device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); - device_sub_ptr weightAccum(task->buffer.temporary_mem, 2*pass_stride*num_shifts, pass_stride); - cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - cl_mem image_mem = CL_MEM_PTR(image_ptr); - cl_mem guide_mem = CL_MEM_PTR(guide_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem out_mem = CL_MEM_PTR(out_ptr); - cl_mem scale_mem = NULL; - - mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride); - mem_zero_kernel(out_ptr, sizeof(float)*pass_stride); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); - cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); - - kernel_set_args(ckNLMCalcDifference, 0, - guide_mem, - variance_mem, - scale_mem, - difference_mem, - w, h, stride, - pass_stride, - r, channel_offset, - 0, a, k_2); - kernel_set_args(ckNLMBlur, 0, - difference_mem, - blurDifference_mem, - w, h, stride, - pass_stride, - r, f); - kernel_set_args(ckNLMCalcWeight, 0, - blurDifference_mem, - difference_mem, - w, h, stride, - pass_stride, - r, f); - kernel_set_args(ckNLMUpdateOutput, 0, - blurDifference_mem, - image_mem, - out_mem, - weightAccum_mem, - w, h, stride, - pass_stride, - channel_offset, - r, f); - - enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, 
num_shifts, true); - enqueue_kernel(ckNLMUpdateOutput, w*h, num_shifts, true); - - kernel_set_args(ckNLMNormalize, 0, - out_mem, weightAccum_mem, w, h, stride); - enqueue_kernel(ckNLMNormalize, w, h); - - return true; + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0; + + device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts); + device_sub_ptr blurDifference( + task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts); + device_sub_ptr weightAccum( + task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride); + cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum); + cl_mem difference_mem = CL_MEM_PTR(*difference); + cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); + + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem guide_mem = CL_MEM_PTR(guide_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem out_mem = CL_MEM_PTR(out_ptr); + cl_mem scale_mem = NULL; + + mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride); + mem_zero_kernel(out_ptr, sizeof(float) * pass_stride); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); + cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); + + kernel_set_args(ckNLMCalcDifference, + 0, + guide_mem, + variance_mem, + scale_mem, + difference_mem, + w, + h, + stride, + pass_stride, + r, + channel_offset, + 0, + a, 
+ k_2); + kernel_set_args( + ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f); + kernel_set_args( + ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f); + kernel_set_args(ckNLMUpdateOutput, + 0, + blurDifference_mem, + image_mem, + out_mem, + weightAccum_mem, + w, + h, + stride, + pass_stride, + channel_offset, + r, + f); + + enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); + enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true); + + kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride); + enqueue_kernel(ckNLMNormalize, w, h); + + return true; } bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task) { - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - char use_time = task->buffer.use_time? 
1 : 0; - - cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); - - int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, - buffer_mem, - tile_info_mem); - cl_mem buffers[9]; - for(int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterConstructTransform, - arg_ofs, - buffers[i]); - } - kernel_set_args(ckFilterConstructTransform, - arg_ofs, - transform_mem, - rank_mem, - task->filter_area, - task->rect, - task->buffer.pass_stride, - task->buffer.frame_stride, - use_time, - task->radius, - task->pca_threshold); - - enqueue_kernel(ckFilterConstructTransform, - task->storage.w, - task->storage.h, - 256); - - return true; + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + char use_time = task->buffer.use_time ? 
1 : 0; + + cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); + + int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem); + cl_mem buffers[9]; + for (int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]); + } + kernel_set_args(ckFilterConstructTransform, + arg_ofs, + transform_mem, + rank_mem, + task->filter_area, + task->rect, + task->buffer.pass_stride, + task->buffer.frame_stride, + use_time, + task->radius, + task->pca_threshold); + + enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256); + + return true; } bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr, @@ -1544,136 +1527,130 @@ bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr, int frame, DenoisingTask *task) { - cl_mem color_mem = CL_MEM_PTR(color_ptr); - cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); - cl_mem scale_mem = CL_MEM_PTR(scale_ptr); - - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - char 
use_time = task->buffer.use_time? 1 : 0; - - int r = task->radius; - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2*r+1)*(2*r+1); - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); - device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - kernel_set_args(ckNLMCalcDifference, 0, - color_mem, - color_variance_mem, - scale_mem, - difference_mem, - w, h, stride, - pass_stride, - r, - pass_stride, - frame_offset, - 1.0f, task->nlm_k_2); - kernel_set_args(ckNLMBlur, 0, - difference_mem, - blurDifference_mem, - w, h, stride, - pass_stride, - r, 4); - kernel_set_args(ckNLMCalcWeight, 0, - blurDifference_mem, - difference_mem, - w, h, stride, - pass_stride, - r, 4); - kernel_set_args(ckNLMConstructGramian, 0, - t, - blurDifference_mem, - buffer_mem, - transform_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->reconstruction_state.filter_window, - w, h, stride, - pass_stride, - r, 4, - frame_offset, - use_time); - - enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); - enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); - - return true; + cl_mem color_mem = CL_MEM_PTR(color_ptr); + cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); + cl_mem scale_mem = CL_MEM_PTR(scale_ptr); + + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + cl_kernel ckNLMCalcDifference = 
denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + char use_time = task->buffer.use_time ? 1 : 0; + + int r = task->radius; + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + + device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts); + device_sub_ptr blurDifference( + task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts); + cl_mem difference_mem = CL_MEM_PTR(*difference); + cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); + + kernel_set_args(ckNLMCalcDifference, + 0, + color_mem, + color_variance_mem, + scale_mem, + difference_mem, + w, + h, + stride, + pass_stride, + r, + pass_stride, + frame_offset, + 1.0f, + task->nlm_k_2); + kernel_set_args( + ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4); + kernel_set_args( + ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4); + kernel_set_args(ckNLMConstructGramian, + 0, + t, + blurDifference_mem, + buffer_mem, + transform_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->reconstruction_state.filter_window, + w, + h, + stride, + pass_stride, + r, + 4, + frame_offset, + use_time); + + enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); + enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, 
true, 256); + + return true; } -bool OpenCLDevice::denoising_solve(device_ptr output_ptr, - DenoisingTask *task) +bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task) { - cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); - - cl_mem output_mem = CL_MEM_PTR(output_ptr); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - - kernel_set_args(ckFinalize, 0, - output_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->filter_area, - task->reconstruction_state.buffer_params, - task->render_buffer.samples); - enqueue_kernel(ckFinalize, w, h); - - return true; + cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); + + cl_mem output_mem = CL_MEM_PTR(output_ptr); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + + kernel_set_args(ckFinalize, + 0, + output_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->filter_area, + task->reconstruction_state.buffer_params, + task->render_buffer.samples); + enqueue_kernel(ckFinalize, w, h); + + return true; } bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, device_ptr mean_ptr, device_ptr variance_ptr, - int r, int4 rect, + int r, + int4 rect, DenoisingTask *task) { - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); - - kernel_set_args(ckFilterCombineHalves, 0, - 
mean_mem, - variance_mem, - a_mem, - b_mem, - rect, - r); - enqueue_kernel(ckFilterCombineHalves, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); + + kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r); + enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + return true; } bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr, @@ -1683,39 +1660,36 @@ bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr, device_ptr buffer_variance_ptr, DenoisingTask *task) { - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); - cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); - cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); - - int arg_ofs = kernel_set_args(ckFilterDivideShadow, 0, - task->render_buffer.samples, - tile_info_mem); - cl_mem buffers[9]; - for(int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, - buffers[i]); - } - kernel_set_args(ckFilterDivideShadow, arg_ofs, - a_mem, - b_mem, - sample_variance_mem, - sv_variance_mem, - buffer_variance_mem, - task->rect, - task->render_buffer.pass_stride, - task->render_buffer.offset); - enqueue_kernel(ckFilterDivideShadow, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem sample_variance_mem = 
CL_MEM_PTR(sample_variance_ptr); + cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); + cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); + + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); + + int arg_ofs = kernel_set_args( + ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem); + cl_mem buffers[9]; + for (int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]); + } + kernel_set_args(ckFilterDivideShadow, + arg_ofs, + a_mem, + b_mem, + sample_variance_mem, + sv_variance_mem, + buffer_variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.offset); + enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + return true; } bool OpenCLDevice::denoising_get_feature(int mean_offset, @@ -1725,36 +1699,32 @@ bool OpenCLDevice::denoising_get_feature(int mean_offset, float scale, DenoisingTask *task) { - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); - - int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, - task->render_buffer.samples, - tile_info_mem); - cl_mem buffers[9]; - for(int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, - buffers[i]); - } - kernel_set_args(ckFilterGetFeature, arg_ofs, - mean_offset, - variance_offset, - mean_mem, - variance_mem, - scale, - task->rect, - task->render_buffer.pass_stride, - task->render_buffer.offset); - enqueue_kernel(ckFilterGetFeature, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; + cl_mem mean_mem = 
CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); + + int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem); + cl_mem buffers[9]; + for (int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]); + } + kernel_set_args(ckFilterGetFeature, + arg_ofs, + mean_offset, + variance_offset, + mean_mem, + variance_mem, + scale, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.offset); + enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + return true; } bool OpenCLDevice::denoising_write_feature(int out_offset, @@ -1762,24 +1732,23 @@ bool OpenCLDevice::denoising_write_feature(int out_offset, device_ptr buffer_ptr, DenoisingTask *task) { - cl_mem from_mem = CL_MEM_PTR(from_ptr); - cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); - - cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); - - kernel_set_args(ckFilterWriteFeature, 0, - task->render_buffer.samples, - task->reconstruction_state.buffer_params, - task->filter_area, - from_mem, - buffer_mem, - out_offset, - task->rect); - enqueue_kernel(ckFilterWriteFeature, - task->filter_area.z, - task->filter_area.w); - - return true; + cl_mem from_mem = CL_MEM_PTR(from_ptr); + cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); + + cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); + + kernel_set_args(ckFilterWriteFeature, + 0, + task->render_buffer.samples, + task->reconstruction_state.buffer_params, + task->filter_area, + from_mem, + buffer_mem, + out_offset, + task->rect); + enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w); + + return true; } bool 
OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr, @@ -1788,155 +1757,155 @@ bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr, device_ptr output_ptr, DenoisingTask *task) { - cl_mem image_mem = CL_MEM_PTR(image_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem depth_mem = CL_MEM_PTR(depth_ptr); - cl_mem output_mem = CL_MEM_PTR(output_ptr); - - cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); - - kernel_set_args(ckFilterDetectOutliers, 0, - image_mem, - variance_mem, - depth_mem, - output_mem, - task->rect, - task->buffer.pass_stride); - enqueue_kernel(ckFilterDetectOutliers, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem depth_mem = CL_MEM_PTR(depth_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); + + kernel_set_args(ckFilterDetectOutliers, + 0, + image_mem, + variance_mem, + depth_mem, + output_mem, + task->rect, + task->buffer.pass_stride); + enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + return true; } -void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask& denoising) +void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising) { - denoising.functions.construct_transform = function_bind(&OpenCLDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind(&OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = 
function_bind(&OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind(&OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind(&OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind(&OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(&rtile); + denoising.functions.construct_transform = function_bind( + &OpenCLDevice::denoising_construct_transform, this, &denoising); + denoising.functions.accumulate = function_bind( + &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising); + denoising.functions.divide_shadow = function_bind( + &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind( + &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind( + &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind( + &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind( + &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising); + denoising.functions.detect_outliers = function_bind( + &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = 
make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + denoising.buffer.gpu_temporary_mem = true; + + denoising.run_denoising(&rtile); } -void OpenCLDevice::shader(DeviceTask& task) +void OpenCLDevice::shader(DeviceTask &task) { - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_input = CL_MEM_PTR(task.shader_input); - cl_mem d_output = CL_MEM_PTR(task.shader_output); - cl_int d_shader_eval_type = task.shader_eval_type; - cl_int d_shader_filter = task.shader_filter; - cl_int d_shader_x = task.shader_x; - cl_int d_shader_w = task.shader_w; - cl_int d_offset = task.offset; - - OpenCLDevice::OpenCLProgram *program = &background_program; - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - program = &bake_program; - } - else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) { - program = &displace_program; - } - program->wait_for_availability(); - cl_kernel kernel = (*program)(); - - cl_uint start_arg_index = - kernel_set_args(kernel, - 0, - d_data, - d_input, - d_output); - - set_kernel_arg_buffers(kernel, &start_arg_index); - - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_eval_type); - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_filter); - } - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_x, - d_shader_w, - d_offset); - - for(int sample = 0; sample < task.num_samples; sample++) { - - if(task.get_cancel()) - break; - - kernel_set_args(kernel, start_arg_index, sample); - - enqueue_kernel(kernel, task.shader_w, 1); - - clFinish(cqCommandQueue); - - task.update_progress(NULL); - } + /* cast arguments to cl types */ + cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); + cl_mem d_input = CL_MEM_PTR(task.shader_input); + cl_mem d_output = CL_MEM_PTR(task.shader_output); + cl_int d_shader_eval_type = 
task.shader_eval_type; + cl_int d_shader_filter = task.shader_filter; + cl_int d_shader_x = task.shader_x; + cl_int d_shader_w = task.shader_w; + cl_int d_offset = task.offset; + + OpenCLDevice::OpenCLProgram *program = &background_program; + if (task.shader_eval_type >= SHADER_EVAL_BAKE) { + program = &bake_program; + } + else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { + program = &displace_program; + } + program->wait_for_availability(); + cl_kernel kernel = (*program)(); + + cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output); + + set_kernel_arg_buffers(kernel, &start_arg_index); + + start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type); + if (task.shader_eval_type >= SHADER_EVAL_BAKE) { + start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter); + } + start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset); + + for (int sample = 0; sample < task.num_samples; sample++) { + + if (task.get_cancel()) + break; + + kernel_set_args(kernel, start_arg_index, sample); + + enqueue_kernel(kernel, task.shader_w, 1); + + clFinish(cqCommandQueue); + + task.update_progress(NULL); + } } string OpenCLDevice::kernel_build_options(const string *debug_src) { - string build_options = "-cl-no-signed-zeros -cl-mad-enable "; - - if(platform_name == "NVIDIA CUDA") { - build_options += "-D__KERNEL_OPENCL_NVIDIA__ " - "-cl-nv-maxrregcount=32 " - "-cl-nv-verbose "; - - uint compute_capability_major, compute_capability_minor; - clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, - sizeof(cl_uint), &compute_capability_major, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, - sizeof(cl_uint), &compute_capability_minor, NULL); - - build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ", - compute_capability_major * 100 + - compute_capability_minor * 10); - } - - else if(platform_name == "Apple") - build_options += 
"-D__KERNEL_OPENCL_APPLE__ "; - - else if(platform_name == "AMD Accelerated Parallel Processing") - build_options += "-D__KERNEL_OPENCL_AMD__ "; - - else if(platform_name == "Intel(R) OpenCL") { - build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ "; - - /* Options for gdb source level kernel debugging. - * this segfaults on linux currently. - */ - if(OpenCLInfo::use_debug() && debug_src) - build_options += "-g -s \"" + *debug_src + "\" "; - } - - if(info.has_half_images) { - build_options += "-D__KERNEL_CL_KHR_FP16__ "; - } - - if(OpenCLInfo::use_debug()) { - build_options += "-D__KERNEL_OPENCL_DEBUG__ "; - } - -#ifdef WITH_CYCLES_DEBUG - build_options += "-D__KERNEL_DEBUG__ "; -#endif - - return build_options; + string build_options = "-cl-no-signed-zeros -cl-mad-enable "; + + if (platform_name == "NVIDIA CUDA") { + build_options += + "-D__KERNEL_OPENCL_NVIDIA__ " + "-cl-nv-maxrregcount=32 " + "-cl-nv-verbose "; + + uint compute_capability_major, compute_capability_minor; + clGetDeviceInfo(cdDevice, + CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(cl_uint), + &compute_capability_major, + NULL); + clGetDeviceInfo(cdDevice, + CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, + sizeof(cl_uint), + &compute_capability_minor, + NULL); + + build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ", + compute_capability_major * 100 + compute_capability_minor * 10); + } + + else if (platform_name == "Apple") + build_options += "-D__KERNEL_OPENCL_APPLE__ "; + + else if (platform_name == "AMD Accelerated Parallel Processing") + build_options += "-D__KERNEL_OPENCL_AMD__ "; + + else if (platform_name == "Intel(R) OpenCL") { + build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ "; + + /* Options for gdb source level kernel debugging. + * this segfaults on linux currently. 
+ */ + if (OpenCLInfo::use_debug() && debug_src) + build_options += "-g -s \"" + *debug_src + "\" "; + } + + if (info.has_half_images) { + build_options += "-D__KERNEL_CL_KHR_FP16__ "; + } + + if (OpenCLInfo::use_debug()) { + build_options += "-D__KERNEL_OPENCL_DEBUG__ "; + } + +# ifdef WITH_CYCLES_DEBUG + build_options += "-D__KERNEL_DEBUG__ "; +# endif + + return build_options; } /* TODO(sergey): In the future we can use variadic templates, once @@ -1944,137 +1913,130 @@ string OpenCLDevice::kernel_build_options(const string *debug_src) */ int OpenCLDevice::kernel_set_args(cl_kernel kernel, int start_argument_index, - const ArgumentWrapper& arg1, - const ArgumentWrapper& arg2, - const ArgumentWrapper& arg3, - const ArgumentWrapper& arg4, - const ArgumentWrapper& arg5, - const ArgumentWrapper& arg6, - const ArgumentWrapper& arg7, - const ArgumentWrapper& arg8, - const ArgumentWrapper& arg9, - const ArgumentWrapper& arg10, - const ArgumentWrapper& arg11, - const ArgumentWrapper& arg12, - const ArgumentWrapper& arg13, - const ArgumentWrapper& arg14, - const ArgumentWrapper& arg15, - const ArgumentWrapper& arg16, - const ArgumentWrapper& arg17, - const ArgumentWrapper& arg18, - const ArgumentWrapper& arg19, - const ArgumentWrapper& arg20, - const ArgumentWrapper& arg21, - const ArgumentWrapper& arg22, - const ArgumentWrapper& arg23, - const ArgumentWrapper& arg24, - const ArgumentWrapper& arg25, - const ArgumentWrapper& arg26, - const ArgumentWrapper& arg27, - const ArgumentWrapper& arg28, - const ArgumentWrapper& arg29, - const ArgumentWrapper& arg30, - const ArgumentWrapper& arg31, - const ArgumentWrapper& arg32, - const ArgumentWrapper& arg33) + const ArgumentWrapper &arg1, + const ArgumentWrapper &arg2, + const ArgumentWrapper &arg3, + const ArgumentWrapper &arg4, + const ArgumentWrapper &arg5, + const ArgumentWrapper &arg6, + const ArgumentWrapper &arg7, + const ArgumentWrapper &arg8, + const ArgumentWrapper &arg9, + const ArgumentWrapper &arg10, + const 
ArgumentWrapper &arg11, + const ArgumentWrapper &arg12, + const ArgumentWrapper &arg13, + const ArgumentWrapper &arg14, + const ArgumentWrapper &arg15, + const ArgumentWrapper &arg16, + const ArgumentWrapper &arg17, + const ArgumentWrapper &arg18, + const ArgumentWrapper &arg19, + const ArgumentWrapper &arg20, + const ArgumentWrapper &arg21, + const ArgumentWrapper &arg22, + const ArgumentWrapper &arg23, + const ArgumentWrapper &arg24, + const ArgumentWrapper &arg25, + const ArgumentWrapper &arg26, + const ArgumentWrapper &arg27, + const ArgumentWrapper &arg28, + const ArgumentWrapper &arg29, + const ArgumentWrapper &arg30, + const ArgumentWrapper &arg31, + const ArgumentWrapper &arg32, + const ArgumentWrapper &arg33) { - int current_arg_index = 0; -#define FAKE_VARARG_HANDLE_ARG(arg) \ - do { \ - if(arg.pointer != NULL) { \ - opencl_assert(clSetKernelArg( \ - kernel, \ - start_argument_index + current_arg_index, \ - arg.size, arg.pointer)); \ - ++current_arg_index; \ - } \ - else { \ - return current_arg_index; \ - } \ - } while(false) - FAKE_VARARG_HANDLE_ARG(arg1); - FAKE_VARARG_HANDLE_ARG(arg2); - FAKE_VARARG_HANDLE_ARG(arg3); - FAKE_VARARG_HANDLE_ARG(arg4); - FAKE_VARARG_HANDLE_ARG(arg5); - FAKE_VARARG_HANDLE_ARG(arg6); - FAKE_VARARG_HANDLE_ARG(arg7); - FAKE_VARARG_HANDLE_ARG(arg8); - FAKE_VARARG_HANDLE_ARG(arg9); - FAKE_VARARG_HANDLE_ARG(arg10); - FAKE_VARARG_HANDLE_ARG(arg11); - FAKE_VARARG_HANDLE_ARG(arg12); - FAKE_VARARG_HANDLE_ARG(arg13); - FAKE_VARARG_HANDLE_ARG(arg14); - FAKE_VARARG_HANDLE_ARG(arg15); - FAKE_VARARG_HANDLE_ARG(arg16); - FAKE_VARARG_HANDLE_ARG(arg17); - FAKE_VARARG_HANDLE_ARG(arg18); - FAKE_VARARG_HANDLE_ARG(arg19); - FAKE_VARARG_HANDLE_ARG(arg20); - FAKE_VARARG_HANDLE_ARG(arg21); - FAKE_VARARG_HANDLE_ARG(arg22); - FAKE_VARARG_HANDLE_ARG(arg23); - FAKE_VARARG_HANDLE_ARG(arg24); - FAKE_VARARG_HANDLE_ARG(arg25); - FAKE_VARARG_HANDLE_ARG(arg26); - FAKE_VARARG_HANDLE_ARG(arg27); - FAKE_VARARG_HANDLE_ARG(arg28); - 
FAKE_VARARG_HANDLE_ARG(arg29); - FAKE_VARARG_HANDLE_ARG(arg30); - FAKE_VARARG_HANDLE_ARG(arg31); - FAKE_VARARG_HANDLE_ARG(arg32); - FAKE_VARARG_HANDLE_ARG(arg33); -#undef FAKE_VARARG_HANDLE_ARG - return current_arg_index; + int current_arg_index = 0; +# define FAKE_VARARG_HANDLE_ARG(arg) \ + do { \ + if (arg.pointer != NULL) { \ + opencl_assert(clSetKernelArg( \ + kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \ + ++current_arg_index; \ + } \ + else { \ + return current_arg_index; \ + } \ + } while (false) + FAKE_VARARG_HANDLE_ARG(arg1); + FAKE_VARARG_HANDLE_ARG(arg2); + FAKE_VARARG_HANDLE_ARG(arg3); + FAKE_VARARG_HANDLE_ARG(arg4); + FAKE_VARARG_HANDLE_ARG(arg5); + FAKE_VARARG_HANDLE_ARG(arg6); + FAKE_VARARG_HANDLE_ARG(arg7); + FAKE_VARARG_HANDLE_ARG(arg8); + FAKE_VARARG_HANDLE_ARG(arg9); + FAKE_VARARG_HANDLE_ARG(arg10); + FAKE_VARARG_HANDLE_ARG(arg11); + FAKE_VARARG_HANDLE_ARG(arg12); + FAKE_VARARG_HANDLE_ARG(arg13); + FAKE_VARARG_HANDLE_ARG(arg14); + FAKE_VARARG_HANDLE_ARG(arg15); + FAKE_VARARG_HANDLE_ARG(arg16); + FAKE_VARARG_HANDLE_ARG(arg17); + FAKE_VARARG_HANDLE_ARG(arg18); + FAKE_VARARG_HANDLE_ARG(arg19); + FAKE_VARARG_HANDLE_ARG(arg20); + FAKE_VARARG_HANDLE_ARG(arg21); + FAKE_VARARG_HANDLE_ARG(arg22); + FAKE_VARARG_HANDLE_ARG(arg23); + FAKE_VARARG_HANDLE_ARG(arg24); + FAKE_VARARG_HANDLE_ARG(arg25); + FAKE_VARARG_HANDLE_ARG(arg26); + FAKE_VARARG_HANDLE_ARG(arg27); + FAKE_VARARG_HANDLE_ARG(arg28); + FAKE_VARARG_HANDLE_ARG(arg29); + FAKE_VARARG_HANDLE_ARG(arg30); + FAKE_VARARG_HANDLE_ARG(arg31); + FAKE_VARARG_HANDLE_ARG(arg32); + FAKE_VARARG_HANDLE_ARG(arg33); +# undef FAKE_VARARG_HANDLE_ARG + return current_arg_index; } void OpenCLDevice::release_kernel_safe(cl_kernel kernel) { - if(kernel) { - clReleaseKernel(kernel); - } + if (kernel) { + clReleaseKernel(kernel); + } } void OpenCLDevice::release_mem_object_safe(cl_mem mem) { - if(mem != NULL) { - clReleaseMemObject(mem); - } + if (mem != NULL) { + clReleaseMemObject(mem); + } } 
void OpenCLDevice::release_program_safe(cl_program program) { - if(program) { - clReleaseProgram(program); - } + if (program) { + clReleaseProgram(program); + } } /* ** Those guys are for workign around some compiler-specific bugs ** */ -cl_program OpenCLDevice::load_cached_kernel(ustring key, - thread_scoped_lock& cache_locker) +cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker) { - return OpenCLCache::get_program(cpPlatform, - cdDevice, - key, - cache_locker); + return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker); } void OpenCLDevice::store_cached_kernel(cl_program program, ustring key, - thread_scoped_lock& cache_locker) + thread_scoped_lock &cache_locker) { - OpenCLCache::store_program(cpPlatform, - cdDevice, - program, - key, - cache_locker); + OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker); } -Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background) +Device *opencl_create_split_device(DeviceInfo &info, + Stats &stats, + Profiler &profiler, + bool background) { - return new OpenCLDevice(info, stats, profiler, background); + return new OpenCLDevice(info, stats, profiler, background); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index 5a1e12af8ab..cc40ad42b06 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -16,1059 +16,1017 @@ #ifdef WITH_OPENCL -#include "device/opencl/opencl.h" -#include "device/device_intern.h" +# include "device/opencl/opencl.h" +# include "device/device_intern.h" -#include "util/util_debug.h" -#include "util/util_logging.h" -#include "util/util_md5.h" -#include "util/util_path.h" -#include "util/util_time.h" -#include "util/util_system.h" +# include "util/util_debug.h" +# include "util/util_logging.h" +# include "util/util_md5.h" +# include "util/util_path.h" +# 
include "util/util_time.h" +# include "util/util_system.h" using std::cerr; using std::endl; CCL_NAMESPACE_BEGIN -OpenCLCache::Slot::ProgramEntry::ProgramEntry() - : program(NULL), - mutex(NULL) +OpenCLCache::Slot::ProgramEntry::ProgramEntry() : program(NULL), mutex(NULL) { } -OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry& rhs) - : program(rhs.program), - mutex(NULL) +OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry &rhs) + : program(rhs.program), mutex(NULL) { } OpenCLCache::Slot::ProgramEntry::~ProgramEntry() { - delete mutex; + delete mutex; } -OpenCLCache::Slot::Slot() - : context_mutex(NULL), - context(NULL) +OpenCLCache::Slot::Slot() : context_mutex(NULL), context(NULL) { } -OpenCLCache::Slot::Slot(const Slot& rhs) - : context_mutex(NULL), - context(NULL), - programs(rhs.programs) +OpenCLCache::Slot::Slot(const Slot &rhs) + : context_mutex(NULL), context(NULL), programs(rhs.programs) { } OpenCLCache::Slot::~Slot() { - delete context_mutex; + delete context_mutex; } -OpenCLCache& OpenCLCache::global_instance() +OpenCLCache &OpenCLCache::global_instance() { - static OpenCLCache instance; - return instance; + static OpenCLCache instance; + return instance; } cl_context OpenCLCache::get_context(cl_platform_id platform, cl_device_id device, - thread_scoped_lock& slot_locker) + thread_scoped_lock &slot_locker) { - assert(platform != NULL); + assert(platform != NULL); - OpenCLCache& self = global_instance(); + OpenCLCache &self = global_instance(); - thread_scoped_lock cache_lock(self.cache_lock); + thread_scoped_lock cache_lock(self.cache_lock); - pair<CacheMap::iterator,bool> ins = self.cache.insert( - CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); + pair<CacheMap::iterator, bool> ins = self.cache.insert( + CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); - Slot &slot = ins.first->second; + Slot &slot = ins.first->second; - /* create slot lock only while holding cache lock */ - 
if(!slot.context_mutex) - slot.context_mutex = new thread_mutex; + /* create slot lock only while holding cache lock */ + if (!slot.context_mutex) + slot.context_mutex = new thread_mutex; - /* need to unlock cache before locking slot, to allow store to complete */ - cache_lock.unlock(); + /* need to unlock cache before locking slot, to allow store to complete */ + cache_lock.unlock(); - /* lock the slot */ - slot_locker = thread_scoped_lock(*slot.context_mutex); + /* lock the slot */ + slot_locker = thread_scoped_lock(*slot.context_mutex); - /* If the thing isn't cached */ - if(slot.context == NULL) { - /* return with the caller's lock holder holding the slot lock */ - return NULL; - } + /* If the thing isn't cached */ + if (slot.context == NULL) { + /* return with the caller's lock holder holding the slot lock */ + return NULL; + } - /* the item was already cached, release the slot lock */ - slot_locker.unlock(); + /* the item was already cached, release the slot lock */ + slot_locker.unlock(); - cl_int ciErr = clRetainContext(slot.context); - assert(ciErr == CL_SUCCESS); - (void) ciErr; + cl_int ciErr = clRetainContext(slot.context); + assert(ciErr == CL_SUCCESS); + (void)ciErr; - return slot.context; + return slot.context; } cl_program OpenCLCache::get_program(cl_platform_id platform, cl_device_id device, ustring key, - thread_scoped_lock& slot_locker) + thread_scoped_lock &slot_locker) { - assert(platform != NULL); + assert(platform != NULL); - OpenCLCache& self = global_instance(); + OpenCLCache &self = global_instance(); - thread_scoped_lock cache_lock(self.cache_lock); + thread_scoped_lock cache_lock(self.cache_lock); - pair<CacheMap::iterator,bool> ins = self.cache.insert( - CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); + pair<CacheMap::iterator, bool> ins = self.cache.insert( + CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); - Slot &slot = ins.first->second; + Slot &slot = ins.first->second; - 
pair<Slot::EntryMap::iterator,bool> ins2 = slot.programs.insert( - Slot::EntryMap::value_type(key, Slot::ProgramEntry())); + pair<Slot::EntryMap::iterator, bool> ins2 = slot.programs.insert( + Slot::EntryMap::value_type(key, Slot::ProgramEntry())); - Slot::ProgramEntry &entry = ins2.first->second; + Slot::ProgramEntry &entry = ins2.first->second; - /* create slot lock only while holding cache lock */ - if(!entry.mutex) - entry.mutex = new thread_mutex; + /* create slot lock only while holding cache lock */ + if (!entry.mutex) + entry.mutex = new thread_mutex; - /* need to unlock cache before locking slot, to allow store to complete */ - cache_lock.unlock(); + /* need to unlock cache before locking slot, to allow store to complete */ + cache_lock.unlock(); - /* lock the slot */ - slot_locker = thread_scoped_lock(*entry.mutex); + /* lock the slot */ + slot_locker = thread_scoped_lock(*entry.mutex); - /* If the thing isn't cached */ - if(entry.program == NULL) { - /* return with the caller's lock holder holding the slot lock */ - return NULL; - } + /* If the thing isn't cached */ + if (entry.program == NULL) { + /* return with the caller's lock holder holding the slot lock */ + return NULL; + } - /* the item was already cached, release the slot lock */ - slot_locker.unlock(); + /* the item was already cached, release the slot lock */ + slot_locker.unlock(); - cl_int ciErr = clRetainProgram(entry.program); - assert(ciErr == CL_SUCCESS); - (void) ciErr; + cl_int ciErr = clRetainProgram(entry.program); + assert(ciErr == CL_SUCCESS); + (void)ciErr; - return entry.program; + return entry.program; } void OpenCLCache::store_context(cl_platform_id platform, cl_device_id device, cl_context context, - thread_scoped_lock& slot_locker) + thread_scoped_lock &slot_locker) { - assert(platform != NULL); - assert(device != NULL); - assert(context != NULL); + assert(platform != NULL); + assert(device != NULL); + assert(context != NULL); - OpenCLCache &self = global_instance(); + 
OpenCLCache &self = global_instance(); - thread_scoped_lock cache_lock(self.cache_lock); - CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); - cache_lock.unlock(); + thread_scoped_lock cache_lock(self.cache_lock); + CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); + cache_lock.unlock(); - Slot &slot = i->second; + Slot &slot = i->second; - /* sanity check */ - assert(i != self.cache.end()); - assert(slot.context == NULL); + /* sanity check */ + assert(i != self.cache.end()); + assert(slot.context == NULL); - slot.context = context; + slot.context = context; - /* unlock the slot */ - slot_locker.unlock(); + /* unlock the slot */ + slot_locker.unlock(); - /* increment reference count in OpenCL. - * The caller is going to release the object when done with it. */ - cl_int ciErr = clRetainContext(context); - assert(ciErr == CL_SUCCESS); - (void) ciErr; + /* increment reference count in OpenCL. + * The caller is going to release the object when done with it. 
*/ + cl_int ciErr = clRetainContext(context); + assert(ciErr == CL_SUCCESS); + (void)ciErr; } void OpenCLCache::store_program(cl_platform_id platform, cl_device_id device, cl_program program, ustring key, - thread_scoped_lock& slot_locker) + thread_scoped_lock &slot_locker) { - assert(platform != NULL); - assert(device != NULL); - assert(program != NULL); + assert(platform != NULL); + assert(device != NULL); + assert(program != NULL); - OpenCLCache &self = global_instance(); + OpenCLCache &self = global_instance(); - thread_scoped_lock cache_lock(self.cache_lock); + thread_scoped_lock cache_lock(self.cache_lock); - CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); - assert(i != self.cache.end()); - Slot &slot = i->second; + CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); + assert(i != self.cache.end()); + Slot &slot = i->second; - Slot::EntryMap::iterator i2 = slot.programs.find(key); - assert(i2 != slot.programs.end()); - Slot::ProgramEntry &entry = i2->second; + Slot::EntryMap::iterator i2 = slot.programs.find(key); + assert(i2 != slot.programs.end()); + Slot::ProgramEntry &entry = i2->second; - assert(entry.program == NULL); + assert(entry.program == NULL); - cache_lock.unlock(); + cache_lock.unlock(); - entry.program = program; + entry.program = program; - /* unlock the slot */ - slot_locker.unlock(); + /* unlock the slot */ + slot_locker.unlock(); - /* Increment reference count in OpenCL. - * The caller is going to release the object when done with it. - */ - cl_int ciErr = clRetainProgram(program); - assert(ciErr == CL_SUCCESS); - (void) ciErr; + /* Increment reference count in OpenCL. + * The caller is going to release the object when done with it. 
+ */ + cl_int ciErr = clRetainProgram(program); + assert(ciErr == CL_SUCCESS); + (void)ciErr; } string OpenCLCache::get_kernel_md5() { - OpenCLCache &self = global_instance(); - thread_scoped_lock lock(self.kernel_md5_lock); + OpenCLCache &self = global_instance(); + thread_scoped_lock lock(self.kernel_md5_lock); - if(self.kernel_md5.empty()) { - self.kernel_md5 = path_files_md5_hash(path_get("source")); - } - return self.kernel_md5; + if (self.kernel_md5.empty()) { + self.kernel_md5 = path_files_md5_hash(path_get("source")); + } + return self.kernel_md5; } -static string get_program_source(const string& kernel_file) +static string get_program_source(const string &kernel_file) { - string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; - /* We compile kernels consisting of many files. unfortunately OpenCL - * kernel caches do not seem to recognize changes in included files. - * so we force recompile on changes by adding the md5 hash of all files. - */ - source = path_source_replace_includes(source, path_get("source")); - source += "\n// " + util_md5_string(source) + "\n"; - return source; + string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; + /* We compile kernels consisting of many files. unfortunately OpenCL + * kernel caches do not seem to recognize changes in included files. + * so we force recompile on changes by adding the md5 hash of all files. 
+ */ + source = path_source_replace_includes(source, path_get("source")); + source += "\n// " + util_md5_string(source) + "\n"; + return source; } OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device, - const string& program_name, - const string& kernel_file, - const string& kernel_build_options, - bool use_stdout) - : device(device), - program_name(program_name), - kernel_file(kernel_file), - kernel_build_options(kernel_build_options), - use_stdout(use_stdout) + const string &program_name, + const string &kernel_file, + const string &kernel_build_options, + bool use_stdout) + : device(device), + program_name(program_name), + kernel_file(kernel_file), + kernel_build_options(kernel_build_options), + use_stdout(use_stdout) { - loaded = false; - needs_compiling = true; - program = NULL; + loaded = false; + needs_compiling = true; + program = NULL; } OpenCLDevice::OpenCLProgram::~OpenCLProgram() { - release(); + release(); } void OpenCLDevice::OpenCLProgram::release() { - for(map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); ++kernel) { - if(kernel->second) { - clReleaseKernel(kernel->second); - kernel->second = NULL; - } - } - if(program) { - clReleaseProgram(program); - program = NULL; - } + for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); + ++kernel) { + if (kernel->second) { + clReleaseKernel(kernel->second); + kernel->second = NULL; + } + } + if (program) { + clReleaseProgram(program); + program = NULL; + } } -void OpenCLDevice::OpenCLProgram::add_log(const string& msg, bool debug) +void OpenCLDevice::OpenCLProgram::add_log(const string &msg, bool debug) { - if(!use_stdout) { - log += msg + "\n"; - } - else if(!debug) { - printf("%s\n", msg.c_str()); - fflush(stdout); - } - else { - VLOG(2) << msg; - } + if (!use_stdout) { + log += msg + "\n"; + } + else if (!debug) { + printf("%s\n", msg.c_str()); + fflush(stdout); + } + else { + VLOG(2) << msg; + } } -void 
OpenCLDevice::OpenCLProgram::add_error(const string& msg) +void OpenCLDevice::OpenCLProgram::add_error(const string &msg) { - if(use_stdout) { - fprintf(stderr, "%s\n", msg.c_str()); - } - if(error_msg == "") { - error_msg += "\n"; - } - error_msg += msg; + if (use_stdout) { + fprintf(stderr, "%s\n", msg.c_str()); + } + if (error_msg == "") { + error_msg += "\n"; + } + error_msg += msg; } void OpenCLDevice::OpenCLProgram::add_kernel(ustring name) { - if(!kernels.count(name)) { - kernels[name] = NULL; - } + if (!kernels.count(name)) { + kernels[name] = NULL; + } } bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src) { - string build_options; - build_options = device->kernel_build_options(debug_src) + kernel_build_options; + string build_options; + build_options = device->kernel_build_options(debug_src) + kernel_build_options; - VLOG(1) << "Build options passed to clBuildProgram: '" - << build_options << "'."; - cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); + VLOG(1) << "Build options passed to clBuildProgram: '" << build_options << "'."; + cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); - /* show warnings even if build is successful */ - size_t ret_val_size = 0; + /* show warnings even if build is successful */ + size_t ret_val_size = 0; - clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); + clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); - if(ciErr != CL_SUCCESS) { - add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) + ", errors in console."); - } + if (ciErr != CL_SUCCESS) { + add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) + + ", errors in console."); + } - if(ret_val_size > 1) { - vector<char> build_log(ret_val_size + 1); - clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], 
NULL); + if (ret_val_size > 1) { + vector<char> build_log(ret_val_size + 1); + clGetProgramBuildInfo( + program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL); - build_log[ret_val_size] = '\0'; - /* Skip meaningless empty output from the NVidia compiler. */ - if(!(ret_val_size == 2 && build_log[0] == '\n')) { - add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]), ciErr == CL_SUCCESS); - } - } + build_log[ret_val_size] = '\0'; + /* Skip meaningless empty output from the NVidia compiler. */ + if (!(ret_val_size == 2 && build_log[0] == '\n')) { + add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]), + ciErr == CL_SUCCESS); + } + } - return (ciErr == CL_SUCCESS); + return (ciErr == CL_SUCCESS); } bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src) { - string source = get_program_source(kernel_file); + string source = get_program_source(kernel_file); - if(debug_src) { - path_write_text(*debug_src, source); - } + if (debug_src) { + path_write_text(*debug_src, source); + } - size_t source_len = source.size(); - const char *source_str = source.c_str(); - cl_int ciErr; + size_t source_len = source.size(); + const char *source_str = source.c_str(); + cl_int ciErr; - program = clCreateProgramWithSource(device->cxContext, - 1, - &source_str, - &source_len, - &ciErr); + program = clCreateProgramWithSource(device->cxContext, 1, &source_str, &source_len, &ciErr); - if(ciErr != CL_SUCCESS) { - add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr)); - return false; - } + if (ciErr != CL_SUCCESS) { + add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr)); + return false; + } - double starttime = time_dt(); - add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); - add_log(string("Build flags: ") + kernel_build_options, true); + double starttime = time_dt(); + 
add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); + add_log(string("Build flags: ") + kernel_build_options, true); - if(!build_kernel(debug_src)) - return false; + if (!build_kernel(debug_src)) + return false; - double elapsed = time_dt() - starttime; - add_log(string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), false); + double elapsed = time_dt() - starttime; + add_log( + string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), + false); - return true; + return true; } -static void escape_python_string(string& str) +static void escape_python_string(string &str) { - /* Escape string to be passed as a Python raw string with '' quotes'. */ - string_replace(str, "'", "\'"); + /* Escape string to be passed as a Python raw string with '' quotes'. */ + string_replace(str, "'", "\'"); } -bool OpenCLDevice::OpenCLProgram::compile_separate(const string& clbin) +bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin) { - vector<string> args; - args.push_back("--background"); - args.push_back("--factory-startup"); - args.push_back("--python-expr"); - - int device_platform_id = device->device_num; - string device_name = device->device_name; - string platform_name = device->platform_name; - string build_options = device->kernel_build_options(NULL) + kernel_build_options; - string kernel_file_escaped = kernel_file; - string clbin_escaped = clbin; - - escape_python_string(device_name); - escape_python_string(platform_name); - escape_python_string(build_options); - escape_python_string(kernel_file_escaped); - escape_python_string(clbin_escaped); - - args.push_back( - string_printf( - "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')", - device_platform_id, - device_name.c_str(), - platform_name.c_str(), - build_options.c_str(), - kernel_file_escaped.c_str(), - clbin_escaped.c_str())); - - double starttime = time_dt(); - 
add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); - add_log(string("Build flags: ") + kernel_build_options, true); - if(!system_call_self(args) || !path_exists(clbin)) { - return false; - } - - double elapsed = time_dt() - starttime; - add_log(string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), false); - - return load_binary(clbin); + vector<string> args; + args.push_back("--background"); + args.push_back("--factory-startup"); + args.push_back("--python-expr"); + + int device_platform_id = device->device_num; + string device_name = device->device_name; + string platform_name = device->platform_name; + string build_options = device->kernel_build_options(NULL) + kernel_build_options; + string kernel_file_escaped = kernel_file; + string clbin_escaped = clbin; + + escape_python_string(device_name); + escape_python_string(platform_name); + escape_python_string(build_options); + escape_python_string(kernel_file_escaped); + escape_python_string(clbin_escaped); + + args.push_back(string_printf( + "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')", + device_platform_id, + device_name.c_str(), + platform_name.c_str(), + build_options.c_str(), + kernel_file_escaped.c_str(), + clbin_escaped.c_str())); + + double starttime = time_dt(); + add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); + add_log(string("Build flags: ") + kernel_build_options, true); + if (!system_call_self(args) || !path_exists(clbin)) { + return false; + } + + double elapsed = time_dt() - starttime; + add_log( + string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), + false); + + return load_binary(clbin); } /* Compile opencl kernel. This method is called from the _cycles Python * module compile kernels. Parameters must match function above. 
*/ -bool device_opencl_compile_kernel(const vector<string>& parameters) +bool device_opencl_compile_kernel(const vector<string> ¶meters) { - int device_platform_id = std::stoi(parameters[0]); - const string& device_name = parameters[1]; - const string& platform_name = parameters[2]; - const string& build_options = parameters[3]; - const string& kernel_file = parameters[4]; - const string& binary_path = parameters[5]; - - if(clewInit() != CLEW_SUCCESS) { - return false; - } - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if(device_platform_id >= usable_devices.size()) { - return false; - } - - OpenCLPlatformDevice& platform_device = usable_devices[device_platform_id]; - if(platform_device.platform_name != platform_name || - platform_device.device_name != device_name) - { - return false; - } - - cl_platform_id platform = platform_device.platform_id; - cl_device_id device = platform_device.device_id; - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties) platform, - 0, 0 - }; - - cl_int err; - cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err); - if(err != CL_SUCCESS) { - return false; - } - - string source = get_program_source(kernel_file); - size_t source_len = source.size(); - const char *source_str = source.c_str(); - cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err); - bool result = false; - - if(err == CL_SUCCESS) { - err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); - - if(err == CL_SUCCESS) { - size_t size = 0; - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - if(size > 0) { - vector<uint8_t> binary(size); - uint8_t *bytes = &binary[0]; - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL); - result = path_write_binary(binary_path, binary); - } - } - clReleaseProgram(program); - } - - 
clReleaseContext(context); - - return result; + int device_platform_id = std::stoi(parameters[0]); + const string &device_name = parameters[1]; + const string &platform_name = parameters[2]; + const string &build_options = parameters[3]; + const string &kernel_file = parameters[4]; + const string &binary_path = parameters[5]; + + if (clewInit() != CLEW_SUCCESS) { + return false; + } + + vector<OpenCLPlatformDevice> usable_devices; + OpenCLInfo::get_usable_devices(&usable_devices); + if (device_platform_id >= usable_devices.size()) { + return false; + } + + OpenCLPlatformDevice &platform_device = usable_devices[device_platform_id]; + if (platform_device.platform_name != platform_name || + platform_device.device_name != device_name) { + return false; + } + + cl_platform_id platform = platform_device.platform_id; + cl_device_id device = platform_device.device_id; + const cl_context_properties context_props[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0, 0}; + + cl_int err; + cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err); + if (err != CL_SUCCESS) { + return false; + } + + string source = get_program_source(kernel_file); + size_t source_len = source.size(); + const char *source_str = source.c_str(); + cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err); + bool result = false; + + if (err == CL_SUCCESS) { + err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); + + if (err == CL_SUCCESS) { + size_t size = 0; + clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); + if (size > 0) { + vector<uint8_t> binary(size); + uint8_t *bytes = &binary[0]; + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL); + result = path_write_binary(binary_path, binary); + } + } + clReleaseProgram(program); + } + + clReleaseContext(context); + + return result; } -bool OpenCLDevice::OpenCLProgram::load_binary(const string& clbin, - 
const string *debug_src) +bool OpenCLDevice::OpenCLProgram::load_binary(const string &clbin, const string *debug_src) { - /* read binary into memory */ - vector<uint8_t> binary; + /* read binary into memory */ + vector<uint8_t> binary; - if(!path_read_binary(clbin, binary)) { - add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str())); - return false; - } + if (!path_read_binary(clbin, binary)) { + add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str())); + return false; + } - /* create program */ - cl_int status, ciErr; - size_t size = binary.size(); - const uint8_t *bytes = &binary[0]; + /* create program */ + cl_int status, ciErr; + size_t size = binary.size(); + const uint8_t *bytes = &binary[0]; - program = clCreateProgramWithBinary(device->cxContext, 1, &device->cdDevice, - &size, &bytes, &status, &ciErr); + program = clCreateProgramWithBinary( + device->cxContext, 1, &device->cdDevice, &size, &bytes, &status, &ciErr); - if(status != CL_SUCCESS || ciErr != CL_SUCCESS) { - add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " - + clewErrorString(status) + " " + clewErrorString(ciErr)); - return false; - } + if (status != CL_SUCCESS || ciErr != CL_SUCCESS) { + add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " + + clewErrorString(status) + " " + clewErrorString(ciErr)); + return false; + } - if(!build_kernel(debug_src)) - return false; + if (!build_kernel(debug_src)) + return false; - return true; + return true; } -bool OpenCLDevice::OpenCLProgram::save_binary(const string& clbin) +bool OpenCLDevice::OpenCLProgram::save_binary(const string &clbin) { - size_t size = 0; - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); + size_t size = 0; + clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - if(!size) - return false; + if (!size) + return false; - vector<uint8_t> binary(size); - uint8_t 
*bytes = &binary[0]; + vector<uint8_t> binary(size); + uint8_t *bytes = &binary[0]; - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL); + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL); - return path_write_binary(clbin, binary); + return path_write_binary(clbin, binary); } bool OpenCLDevice::OpenCLProgram::load() { - loaded = false; - string device_md5 = device->device_md5_hash(kernel_build_options); - - /* Try to use cached kernel. */ - thread_scoped_lock cache_locker; - ustring cache_key(program_name + device_md5); - program = device->load_cached_kernel(cache_key, - cache_locker); - if (!program) { - add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - - /* need to create source to get md5 */ - string source = get_program_source(kernel_file); - - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source); - basename = path_cache_get(path_join("kernels", basename)); - string clbin = basename + ".clbin"; - - /* If binary kernel exists already, try use it. */ - if(path_exists(clbin) && load_binary(clbin)) { - /* Kernel loaded from binary, nothing to do. */ - add_log(string("Loaded program from ") + clbin + ".", true); - - /* Cache the program. */ - device->store_cached_kernel(program, - cache_key, - cache_locker); - } - else { - add_log(string("OpenCL program ") + program_name + " not found on disk.", true); - cache_locker.unlock(); - } - } - - if (program) { - create_kernels(); - loaded = true; - needs_compiling = false; - } - - return loaded; + loaded = false; + string device_md5 = device->device_md5_hash(kernel_build_options); + + /* Try to use cached kernel. 
*/ + thread_scoped_lock cache_locker; + ustring cache_key(program_name + device_md5); + program = device->load_cached_kernel(cache_key, cache_locker); + if (!program) { + add_log(string("OpenCL program ") + program_name + " not found in cache.", true); + + /* need to create source to get md5 */ + string source = get_program_source(kernel_file); + + string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + + util_md5_string(source); + basename = path_cache_get(path_join("kernels", basename)); + string clbin = basename + ".clbin"; + + /* If binary kernel exists already, try use it. */ + if (path_exists(clbin) && load_binary(clbin)) { + /* Kernel loaded from binary, nothing to do. */ + add_log(string("Loaded program from ") + clbin + ".", true); + + /* Cache the program. */ + device->store_cached_kernel(program, cache_key, cache_locker); + } + else { + add_log(string("OpenCL program ") + program_name + " not found on disk.", true); + cache_locker.unlock(); + } + } + + if (program) { + create_kernels(); + loaded = true; + needs_compiling = false; + } + + return loaded; } void OpenCLDevice::OpenCLProgram::compile() { - assert(device); - - string device_md5 = device->device_md5_hash(kernel_build_options); - - /* Try to use cached kernel. 
*/ - thread_scoped_lock cache_locker; - ustring cache_key(program_name + device_md5); - program = device->load_cached_kernel(cache_key, - cache_locker); - - if (!program) - { - - add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - - /* need to create source to get md5 */ - string source = get_program_source(kernel_file); - - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source); - basename = path_cache_get(path_join("kernels", basename)); - string clbin = basename + ".clbin"; - - /* path to preprocessed source for debugging */ - string clsrc, *debug_src = NULL; - - if(OpenCLInfo::use_debug()) { - clsrc = basename + ".cl"; - debug_src = &clsrc; - } - - /* If binary kernel exists already, try use it. */ - if(compile_separate(clbin)) { - add_log(string("Built and loaded program from ") + clbin + ".", true); - loaded = true; - } - else { - add_log(string("Separate-process building of ") + clbin + " failed, will fall back to regular building.", true); - - /* If does not exist or loading binary failed, compile kernel. */ - if(!compile_kernel(debug_src)) { - needs_compiling = false; - return; - } - - /* Save binary for reuse. */ - if(!save_binary(clbin)) { - add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true); - } - } - - /* Cache the program. */ - device->store_cached_kernel(program, - cache_key, - cache_locker); - } - - create_kernels(); - needs_compiling = false; - loaded = true; + assert(device); + + string device_md5 = device->device_md5_hash(kernel_build_options); + + /* Try to use cached kernel. 
*/ + thread_scoped_lock cache_locker; + ustring cache_key(program_name + device_md5); + program = device->load_cached_kernel(cache_key, cache_locker); + + if (!program) { + + add_log(string("OpenCL program ") + program_name + " not found in cache.", true); + + /* need to create source to get md5 */ + string source = get_program_source(kernel_file); + + string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + + util_md5_string(source); + basename = path_cache_get(path_join("kernels", basename)); + string clbin = basename + ".clbin"; + + /* path to preprocessed source for debugging */ + string clsrc, *debug_src = NULL; + + if (OpenCLInfo::use_debug()) { + clsrc = basename + ".cl"; + debug_src = &clsrc; + } + + /* If binary kernel exists already, try use it. */ + if (compile_separate(clbin)) { + add_log(string("Built and loaded program from ") + clbin + ".", true); + loaded = true; + } + else { + add_log(string("Separate-process building of ") + clbin + + " failed, will fall back to regular building.", + true); + + /* If does not exist or loading binary failed, compile kernel. */ + if (!compile_kernel(debug_src)) { + needs_compiling = false; + return; + } + + /* Save binary for reuse. */ + if (!save_binary(clbin)) { + add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true); + } + } + + /* Cache the program. 
*/ + device->store_cached_kernel(program, cache_key, cache_locker); + } + + create_kernels(); + needs_compiling = false; + loaded = true; } void OpenCLDevice::OpenCLProgram::create_kernels() { - for(map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); ++kernel) { - assert(kernel->second == NULL); - cl_int ciErr; - string name = "kernel_ocl_" + kernel->first.string(); - kernel->second = clCreateKernel(program, name.c_str(), &ciErr); - if(device->opencl_error(ciErr)) { - add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " + clewErrorString(ciErr)); - return; - } - } + for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); + ++kernel) { + assert(kernel->second == NULL); + cl_int ciErr; + string name = "kernel_ocl_" + kernel->first.string(); + kernel->second = clCreateKernel(program, name.c_str(), &ciErr); + if (device->opencl_error(ciErr)) { + add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " + + clewErrorString(ciErr)); + return; + } + } } bool OpenCLDevice::OpenCLProgram::wait_for_availability() { - add_log(string("Waiting for availability of ") + program_name + ".", true); - while (needs_compiling) { - time_sleep(0.1); - } - return loaded; + add_log(string("Waiting for availability of ") + program_name + ".", true); + while (needs_compiling) { + time_sleep(0.1); + } + return loaded; } void OpenCLDevice::OpenCLProgram::report_error() { - /* If loaded is true, there was no error. */ - if(loaded) return; - /* if use_stdout is true, the error was already reported. */ - if(use_stdout) return; - - cerr << error_msg << endl; - if(!compile_output.empty()) { - cerr << "OpenCL kernel build output for " << program_name << ":" << endl; - cerr << compile_output << endl; - } + /* If loaded is true, there was no error. */ + if (loaded) + return; + /* if use_stdout is true, the error was already reported. 
*/ + if (use_stdout) + return; + + cerr << error_msg << endl; + if (!compile_output.empty()) { + cerr << "OpenCL kernel build output for " << program_name << ":" << endl; + cerr << compile_output << endl; + } } cl_kernel OpenCLDevice::OpenCLProgram::operator()() { - assert(kernels.size() == 1); - return kernels.begin()->second; + assert(kernels.size() == 1); + return kernels.begin()->second; } cl_kernel OpenCLDevice::OpenCLProgram::operator()(ustring name) { - assert(kernels.count(name)); - return kernels[name]; + assert(kernels.count(name)); + return kernels[name]; } cl_device_type OpenCLInfo::device_type() { - switch(DebugFlags().opencl.device_type) - { - case DebugFlags::OpenCL::DEVICE_NONE: - return 0; - case DebugFlags::OpenCL::DEVICE_ALL: - return CL_DEVICE_TYPE_ALL; - case DebugFlags::OpenCL::DEVICE_DEFAULT: - return CL_DEVICE_TYPE_DEFAULT; - case DebugFlags::OpenCL::DEVICE_CPU: - return CL_DEVICE_TYPE_CPU; - case DebugFlags::OpenCL::DEVICE_GPU: - return CL_DEVICE_TYPE_GPU; - case DebugFlags::OpenCL::DEVICE_ACCELERATOR: - return CL_DEVICE_TYPE_ACCELERATOR; - default: - return CL_DEVICE_TYPE_ALL; - } + switch (DebugFlags().opencl.device_type) { + case DebugFlags::OpenCL::DEVICE_NONE: + return 0; + case DebugFlags::OpenCL::DEVICE_ALL: + return CL_DEVICE_TYPE_ALL; + case DebugFlags::OpenCL::DEVICE_DEFAULT: + return CL_DEVICE_TYPE_DEFAULT; + case DebugFlags::OpenCL::DEVICE_CPU: + return CL_DEVICE_TYPE_CPU; + case DebugFlags::OpenCL::DEVICE_GPU: + return CL_DEVICE_TYPE_GPU; + case DebugFlags::OpenCL::DEVICE_ACCELERATOR: + return CL_DEVICE_TYPE_ACCELERATOR; + default: + return CL_DEVICE_TYPE_ALL; + } } bool OpenCLInfo::use_debug() { - return DebugFlags().opencl.debug; + return DebugFlags().opencl.debug; } -bool OpenCLInfo::device_supported(const string& platform_name, - const cl_device_id device_id) +bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_id device_id) { - cl_device_type device_type; - if(!get_device_type(device_id, 
&device_type)) { - return false; - } - string device_name; - if(!get_device_name(device_id, &device_name)) { - return false; - } - - int driver_major = 0; - int driver_minor = 0; - if(!get_driver_version(device_id, &driver_major, &driver_minor)) { - return false; - } - VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor; - - /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework - * (aka, it will not be on Intel framework). This isn't supported - * and needs an explicit blacklist. - */ - if(strstr(device_name.c_str(), "Iris")) { - return false; - } - if(platform_name == "AMD Accelerated Parallel Processing" && - device_type == CL_DEVICE_TYPE_GPU) - { - if(driver_major < 2236) { - VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; - return false; - } - const char *blacklist[] = { - /* GCN 1 */ - "Tahiti", "Pitcairn", "Capeverde", "Oland", "Hainan", - NULL - }; - for(int i = 0; blacklist[i] != NULL; i++) { - if(device_name == blacklist[i]) { - VLOG(1) << "AMD device " << device_name << " not supported"; - return false; - } - } - return true; - } - if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { - return false; - } - return false; + cl_device_type device_type; + if (!get_device_type(device_id, &device_type)) { + return false; + } + string device_name; + if (!get_device_name(device_id, &device_name)) { + return false; + } + + int driver_major = 0; + int driver_minor = 0; + if (!get_driver_version(device_id, &driver_major, &driver_minor)) { + return false; + } + VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor; + + /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework + * (aka, it will not be on Intel framework). This isn't supported + * and needs an explicit blacklist. 
+ */ + if (strstr(device_name.c_str(), "Iris")) { + return false; + } + if (platform_name == "AMD Accelerated Parallel Processing" && + device_type == CL_DEVICE_TYPE_GPU) { + if (driver_major < 2236) { + VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; + return false; + } + const char *blacklist[] = {/* GCN 1 */ + "Tahiti", + "Pitcairn", + "Capeverde", + "Oland", + "Hainan", + NULL}; + for (int i = 0; blacklist[i] != NULL; i++) { + if (device_name == blacklist[i]) { + VLOG(1) << "AMD device " << device_name << " not supported"; + return false; + } + } + return true; + } + if (platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { + return false; + } + return false; } -bool OpenCLInfo::platform_version_check(cl_platform_id platform, - string *error) +bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error) { - const int req_major = 1, req_minor = 1; - int major, minor; - char version[256]; - clGetPlatformInfo(platform, - CL_PLATFORM_VERSION, - sizeof(version), - &version, - NULL); - if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) { - if(error != NULL) { - *error = string_printf("OpenCL: failed to parse platform version string (%s).", version); - } - return false; - } - if(!((major == req_major && minor >= req_minor) || (major > req_major))) { - if(error != NULL) { - *error = string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor); - } - return false; - } - if(error != NULL) { - *error = ""; - } - return true; + const int req_major = 1, req_minor = 1; + int major, minor; + char version[256]; + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL); + if (sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) { + if (error != NULL) { + *error = string_printf("OpenCL: failed to parse platform version string (%s).", version); + } + return false; + } + if (!((major == req_major && minor >= req_minor) || (major > req_major))) { 
+ if (error != NULL) { + *error = string_printf( + "OpenCL: platform version 1.1 or later required, found %d.%d", major, minor); + } + return false; + } + if (error != NULL) { + *error = ""; + } + return true; } -bool OpenCLInfo::device_version_check(cl_device_id device, - string *error) +bool OpenCLInfo::device_version_check(cl_device_id device, string *error) { - const int req_major = 1, req_minor = 1; - int major, minor; - char version[256]; - clGetDeviceInfo(device, - CL_DEVICE_OPENCL_C_VERSION, - sizeof(version), - &version, - NULL); - if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) { - if(error != NULL) { - *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version); - } - return false; - } - if(!((major == req_major && minor >= req_minor) || (major > req_major))) { - if(error != NULL) { - *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor); - } - return false; - } - if(error != NULL) { - *error = ""; - } - return true; + const int req_major = 1, req_minor = 1; + int major, minor; + char version[256]; + clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL); + if (sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) { + if (error != NULL) { + *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version); + } + return false; + } + if (!((major == req_major && minor >= req_minor) || (major > req_major))) { + if (error != NULL) { + *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor); + } + return false; + } + if (error != NULL) { + *error = ""; + } + return true; } -string OpenCLInfo::get_hardware_id(const string& platform_name, cl_device_id device_id) +string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id device_id) { - if(platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { - /* Use cl_amd_device_topology extension. 
*/ - cl_char topology[24]; - if(clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS && topology[0] == 1) { - return string_printf("%02x:%02x.%01x", - (unsigned int)topology[21], - (unsigned int)topology[22], - (unsigned int)topology[23]); - } - } - else if(platform_name == "NVIDIA CUDA") { - /* Use two undocumented options of the cl_nv_device_attribute_query extension. */ - cl_int bus_id, slot_id; - if(clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS && - clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) { - return string_printf("%02x:%02x.%01x", - (unsigned int)(bus_id), - (unsigned int)(slot_id >> 3), - (unsigned int)(slot_id & 0x7)); - } - } - /* No general way to get a hardware ID from OpenCL => give up. */ - return ""; + if (platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { + /* Use cl_amd_device_topology extension. */ + cl_char topology[24]; + if (clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS && + topology[0] == 1) { + return string_printf("%02x:%02x.%01x", + (unsigned int)topology[21], + (unsigned int)topology[22], + (unsigned int)topology[23]); + } + } + else if (platform_name == "NVIDIA CUDA") { + /* Use two undocumented options of the cl_nv_device_attribute_query extension. */ + cl_int bus_id, slot_id; + if (clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS && + clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) { + return string_printf("%02x:%02x.%01x", + (unsigned int)(bus_id), + (unsigned int)(slot_id >> 3), + (unsigned int)(slot_id & 0x7)); + } + } + /* No general way to get a hardware ID from OpenCL => give up. 
*/ + return ""; } -void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, - bool force_all) +void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, bool force_all) { - const cl_device_type device_type = OpenCLInfo::device_type(); - static bool first_time = true; -#define FIRST_VLOG(severity) if(first_time) VLOG(severity) - - usable_devices->clear(); - - if(device_type == 0) { - FIRST_VLOG(2) << "OpenCL devices are forced to be disabled."; - first_time = false; - return; - } - - cl_int error; - vector<cl_device_id> device_ids; - vector<cl_platform_id> platform_ids; - - /* Get platforms. */ - if(!get_platforms(&platform_ids, &error)) { - FIRST_VLOG(2) << "Error fetching platforms:" - << string(clewErrorString(error)); - first_time = false; - return; - } - if(platform_ids.size() == 0) { - FIRST_VLOG(2) << "No OpenCL platforms were found."; - first_time = false; - return; - } - /* Devices are numbered consecutively across platforms. */ - for(int platform = 0; platform < platform_ids.size(); platform++) { - cl_platform_id platform_id = platform_ids[platform]; - string platform_name; - if(!get_platform_name(platform_id, &platform_name)) { - FIRST_VLOG(2) << "Failed to get platform name, ignoring."; - continue; - } - FIRST_VLOG(2) << "Enumerating devices for platform " - << platform_name << "."; - if(!platform_version_check(platform_id)) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << " due to too old compiler version."; - continue; - } - if(!get_platform_devices(platform_id, - device_type, - &device_ids, - &error)) - { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", failed to fetch of devices: " - << string(clewErrorString(error)); - continue; - } - if(device_ids.size() == 0) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", it has no devices."; - continue; - } - for(int num = 0; num < device_ids.size(); num++) { - const cl_device_id device_id = device_ids[num]; - 
string device_name; - if(!get_device_name(device_id, &device_name, &error)) { - FIRST_VLOG(2) << "Failed to fetch device name: " - << string(clewErrorString(error)) - << ", ignoring."; - continue; - } - if(!device_version_check(device_id)) { - FIRST_VLOG(2) << "Ignoring device " << device_name - << " due to old compiler version."; - continue; - } - if(force_all || - device_supported(platform_name, device_id)) - { - cl_device_type device_type; - if(!get_device_type(device_id, &device_type, &error)) { - FIRST_VLOG(2) << "Ignoring device " << device_name - << ", failed to fetch device type:" - << string(clewErrorString(error)); - continue; - } - string readable_device_name = - get_readable_device_name(device_id); - if(readable_device_name != device_name) { - FIRST_VLOG(2) << "Using more readable device name: " - << readable_device_name; - } - FIRST_VLOG(2) << "Adding new device " - << readable_device_name << "."; - string hardware_id = get_hardware_id(platform_name, device_id); - string device_extensions = get_device_extensions(device_id); - usable_devices->push_back(OpenCLPlatformDevice( - platform_id, - platform_name, - device_id, - device_type, - readable_device_name, - hardware_id, - device_extensions)); - } - else { - FIRST_VLOG(2) << "Ignoring device " << device_name - << ", not officially supported yet."; - } - } - } - first_time = false; + const cl_device_type device_type = OpenCLInfo::device_type(); + static bool first_time = true; +# define FIRST_VLOG(severity) \ + if (first_time) \ + VLOG(severity) + + usable_devices->clear(); + + if (device_type == 0) { + FIRST_VLOG(2) << "OpenCL devices are forced to be disabled."; + first_time = false; + return; + } + + cl_int error; + vector<cl_device_id> device_ids; + vector<cl_platform_id> platform_ids; + + /* Get platforms. 
*/ + if (!get_platforms(&platform_ids, &error)) { + FIRST_VLOG(2) << "Error fetching platforms:" << string(clewErrorString(error)); + first_time = false; + return; + } + if (platform_ids.size() == 0) { + FIRST_VLOG(2) << "No OpenCL platforms were found."; + first_time = false; + return; + } + /* Devices are numbered consecutively across platforms. */ + for (int platform = 0; platform < platform_ids.size(); platform++) { + cl_platform_id platform_id = platform_ids[platform]; + string platform_name; + if (!get_platform_name(platform_id, &platform_name)) { + FIRST_VLOG(2) << "Failed to get platform name, ignoring."; + continue; + } + FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << "."; + if (!platform_version_check(platform_id)) { + FIRST_VLOG(2) << "Ignoring platform " << platform_name + << " due to too old compiler version."; + continue; + } + if (!get_platform_devices(platform_id, device_type, &device_ids, &error)) { + FIRST_VLOG(2) << "Ignoring platform " << platform_name + << ", failed to fetch of devices: " << string(clewErrorString(error)); + continue; + } + if (device_ids.size() == 0) { + FIRST_VLOG(2) << "Ignoring platform " << platform_name << ", it has no devices."; + continue; + } + for (int num = 0; num < device_ids.size(); num++) { + const cl_device_id device_id = device_ids[num]; + string device_name; + if (!get_device_name(device_id, &device_name, &error)) { + FIRST_VLOG(2) << "Failed to fetch device name: " << string(clewErrorString(error)) + << ", ignoring."; + continue; + } + if (!device_version_check(device_id)) { + FIRST_VLOG(2) << "Ignoring device " << device_name << " due to old compiler version."; + continue; + } + if (force_all || device_supported(platform_name, device_id)) { + cl_device_type device_type; + if (!get_device_type(device_id, &device_type, &error)) { + FIRST_VLOG(2) << "Ignoring device " << device_name + << ", failed to fetch device type:" << string(clewErrorString(error)); + continue; + } + string 
readable_device_name = get_readable_device_name(device_id); + if (readable_device_name != device_name) { + FIRST_VLOG(2) << "Using more readable device name: " << readable_device_name; + } + FIRST_VLOG(2) << "Adding new device " << readable_device_name << "."; + string hardware_id = get_hardware_id(platform_name, device_id); + string device_extensions = get_device_extensions(device_id); + usable_devices->push_back(OpenCLPlatformDevice(platform_id, + platform_name, + device_id, + device_type, + readable_device_name, + hardware_id, + device_extensions)); + } + else { + FIRST_VLOG(2) << "Ignoring device " << device_name << ", not officially supported yet."; + } + } + } + first_time = false; } -bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, - cl_int *error) +bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error) { - /* Reset from possible previous state. */ - platform_ids->resize(0); - cl_uint num_platforms; - if(!get_num_platforms(&num_platforms, error)) { - return false; - } - /* Get actual platforms. */ - cl_int err; - platform_ids->resize(num_platforms); - if((err = clGetPlatformIDs(num_platforms, - &platform_ids->at(0), - NULL)) != CL_SUCCESS) { - if(error != NULL) { - *error = err; - } - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + /* Reset from possible previous state. */ + platform_ids->resize(0); + cl_uint num_platforms; + if (!get_num_platforms(&num_platforms, error)) { + return false; + } + /* Get actual platforms. 
*/ + cl_int err; + platform_ids->resize(num_platforms); + if ((err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL)) != CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } vector<cl_platform_id> OpenCLInfo::get_platforms() { - vector<cl_platform_id> platform_ids; - get_platforms(&platform_ids); - return platform_ids; + vector<cl_platform_id> platform_ids; + get_platforms(&platform_ids); + return platform_ids; } bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error) { - cl_int err; - if((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) { - if(error != NULL) { - *error = err; - } - *num_platforms = 0; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + cl_int err; + if ((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *num_platforms = 0; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } cl_uint OpenCLInfo::get_num_platforms() { - cl_uint num_platforms; - if(!get_num_platforms(&num_platforms)) { - return 0; - } - return num_platforms; + cl_uint num_platforms; + if (!get_num_platforms(&num_platforms)) { + return 0; + } + return num_platforms; } -bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, - string *platform_name) +bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string *platform_name) { - char buffer[256]; - if(clGetPlatformInfo(platform_id, - CL_PLATFORM_NAME, - sizeof(buffer), - &buffer, - NULL) != CL_SUCCESS) - { - *platform_name = ""; - return false; - } - *platform_name = buffer; - return true; + char buffer[256]; + if (clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(buffer), &buffer, NULL) != + CL_SUCCESS) { + *platform_name = ""; + return false; + } + *platform_name = buffer; + return true; } string OpenCLInfo::get_platform_name(cl_platform_id platform_id) 
{ - string platform_name; - if(!get_platform_name(platform_id, &platform_name)) { - return ""; - } - return platform_name; + string platform_name; + if (!get_platform_name(platform_id, &platform_name)) { + return ""; + } + return platform_name; } bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, @@ -1076,266 +1034,222 @@ bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, cl_uint *num_devices, cl_int *error) { - cl_int err; - if((err = clGetDeviceIDs(platform_id, - device_type, - 0, - NULL, - num_devices)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - *num_devices = 0; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + cl_int err; + if ((err = clGetDeviceIDs(platform_id, device_type, 0, NULL, num_devices)) != CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *num_devices = 0; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type) { - cl_uint num_devices; - if(!get_num_platform_devices(platform_id, - device_type, - &num_devices)) - { - return 0; - } - return num_devices; + cl_uint num_devices; + if (!get_num_platform_devices(platform_id, device_type, &num_devices)) { + return 0; + } + return num_devices; } bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id, cl_device_type device_type, vector<cl_device_id> *device_ids, - cl_int* error) + cl_int *error) { - /* Reset from possible previous state. */ - device_ids->resize(0); - /* Get number of devices to pre-allocate memory. */ - cl_uint num_devices; - if(!get_num_platform_devices(platform_id, - device_type, - &num_devices, - error)) - { - return false; - } - /* Get actual device list. 
*/ - device_ids->resize(num_devices); - cl_int err; - if((err = clGetDeviceIDs(platform_id, - device_type, - num_devices, - &device_ids->at(0), - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + /* Reset from possible previous state. */ + device_ids->resize(0); + /* Get number of devices to pre-allocate memory. */ + cl_uint num_devices; + if (!get_num_platform_devices(platform_id, device_type, &num_devices, error)) { + return false; + } + /* Get actual device list. */ + device_ids->resize(num_devices); + cl_int err; + if ((err = clGetDeviceIDs(platform_id, device_type, num_devices, &device_ids->at(0), NULL)) != + CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id, cl_device_type device_type) { - vector<cl_device_id> devices; - get_platform_devices(platform_id, device_type, &devices); - return devices; + vector<cl_device_id> devices; + get_platform_devices(platform_id, device_type, &devices); + return devices; } -bool OpenCLInfo::get_device_name(cl_device_id device_id, - string *device_name, - cl_int* error) +bool OpenCLInfo::get_device_name(cl_device_id device_id, string *device_name, cl_int *error) { - char buffer[1024]; - cl_int err; - if((err = clGetDeviceInfo(device_id, - CL_DEVICE_NAME, - sizeof(buffer), - &buffer, - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - *device_name = ""; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - *device_name = buffer; - return true; + char buffer[1024]; + cl_int err; + if ((err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), &buffer, NULL)) != + CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *device_name = ""; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + 
*device_name = buffer; + return true; } string OpenCLInfo::get_device_name(cl_device_id device_id) { - string device_name; - if(!get_device_name(device_id, &device_name)) { - return ""; - } - return device_name; + string device_name; + if (!get_device_name(device_id, &device_name)) { + return ""; + } + return device_name; } bool OpenCLInfo::get_device_extensions(cl_device_id device_id, - string *device_extensions, - cl_int* error) + string *device_extensions, + cl_int *error) { - char buffer[1024]; - cl_int err; - if((err = clGetDeviceInfo(device_id, - CL_DEVICE_EXTENSIONS, - sizeof(buffer), - &buffer, - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - *device_extensions = ""; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - *device_extensions = buffer; - return true; + char buffer[1024]; + cl_int err; + if ((err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, sizeof(buffer), &buffer, NULL)) != + CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *device_extensions = ""; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + *device_extensions = buffer; + return true; } string OpenCLInfo::get_device_extensions(cl_device_id device_id) { - string device_extensions; - if(!get_device_extensions(device_id, &device_extensions)) { - return ""; - } - return device_extensions; + string device_extensions; + if (!get_device_extensions(device_id, &device_extensions)) { + return ""; + } + return device_extensions; } bool OpenCLInfo::get_device_type(cl_device_id device_id, cl_device_type *device_type, - cl_int* error) + cl_int *error) { - cl_int err; - if((err = clGetDeviceInfo(device_id, - CL_DEVICE_TYPE, - sizeof(cl_device_type), - device_type, - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - *device_type = 0; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + cl_int err; + if ((err = clGetDeviceInfo( + device_id, CL_DEVICE_TYPE, 
sizeof(cl_device_type), device_type, NULL)) != CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *device_type = 0; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id) { - cl_device_type device_type; - if(!get_device_type(device_id, &device_type)) { - return 0; - } - return device_type; + cl_device_type device_type; + if (!get_device_type(device_id, &device_type)) { + return 0; + } + return device_type; } string OpenCLInfo::get_readable_device_name(cl_device_id device_id) { - string name = ""; - char board_name[1024]; - size_t length = 0; - if(clGetDeviceInfo(device_id, - CL_DEVICE_BOARD_NAME_AMD, - sizeof(board_name), - &board_name, - &length) == CL_SUCCESS) - { - if(length != 0 && board_name[0] != '\0') { - name = board_name; - } - } - - /* Fallback to standard device name API. */ - if(name.empty()) { - name = get_device_name(device_id); - } - - /* Special exception for AMD Vega, need to be able to tell - * Vega 56 from 64 apart. - */ - if(name == "Radeon RX Vega") { - cl_int max_compute_units = 0; - if(clGetDeviceInfo(device_id, - CL_DEVICE_MAX_COMPUTE_UNITS, - sizeof(max_compute_units), - &max_compute_units, - NULL) == CL_SUCCESS) - { - name += " " + to_string(max_compute_units); - } - } - - /* Distinguish from our native CPU device. */ - if(get_device_type(device_id) & CL_DEVICE_TYPE_CPU) { - name += " (OpenCL)"; - } - - return name; + string name = ""; + char board_name[1024]; + size_t length = 0; + if (clGetDeviceInfo( + device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, &length) == + CL_SUCCESS) { + if (length != 0 && board_name[0] != '\0') { + name = board_name; + } + } + + /* Fallback to standard device name API. */ + if (name.empty()) { + name = get_device_name(device_id); + } + + /* Special exception for AMD Vega, need to be able to tell + * Vega 56 from 64 apart. 
+ */ + if (name == "Radeon RX Vega") { + cl_int max_compute_units = 0; + if (clGetDeviceInfo(device_id, + CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(max_compute_units), + &max_compute_units, + NULL) == CL_SUCCESS) { + name += " " + to_string(max_compute_units); + } + } + + /* Distinguish from our native CPU device. */ + if (get_device_type(device_id) & CL_DEVICE_TYPE_CPU) { + name += " (OpenCL)"; + } + + return name; } -bool OpenCLInfo::get_driver_version(cl_device_id device_id, - int *major, - int *minor, - cl_int* error) +bool OpenCLInfo::get_driver_version(cl_device_id device_id, int *major, int *minor, cl_int *error) { - char buffer[1024]; - cl_int err; - if((err = clGetDeviceInfo(device_id, - CL_DRIVER_VERSION, - sizeof(buffer), - &buffer, - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - if(sscanf(buffer, "%d.%d", major, minor) < 2) { - VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); - return false; - } - return true; + char buffer[1024]; + cl_int err; + if ((err = clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), &buffer, NULL)) != + CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + if (sscanf(buffer, "%d.%d", major, minor) < 2) { + VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); + return false; + } + return true; } int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id) { - int base_align_bits; - if(clGetDeviceInfo(device_id, - CL_DEVICE_MEM_BASE_ADDR_ALIGN, - sizeof(int), - &base_align_bits, - NULL) == CL_SUCCESS) - { - return base_align_bits/8; - } - return 1; + int base_align_bits; + if (clGetDeviceInfo( + device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &base_align_bits, NULL) == + CL_SUCCESS) { + return base_align_bits / 8; + } + return 1; } CCL_NAMESPACE_END |